Diffstat (limited to 'fs/bcachefs')
158 files changed, 9150 insertions, 5101 deletions
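A recurring cleanup throughout the hunks below is the conversion of open-coded rcu_read_lock()/rcu_read_unlock() pairs to the scope-based guard(rcu)() and scoped_guard(rcu) helpers, so the read-side unlock happens automatically on every exit path. As a rough sketch of that pattern only — the struct and function names here are invented for illustration and are not taken from the patch:

	#include <linux/errno.h>
	#include <linux/rcupdate.h>	/* rcu_read_lock(), rcu_dereference(), guard(rcu)() */

	struct obj { int val; };	/* hypothetical example type */

	/* Old style: every early return needs its own rcu_read_unlock() */
	static int get_val_old(struct obj __rcu **slot)
	{
		rcu_read_lock();
		struct obj *o = rcu_dereference(*slot);
		if (!o) {
			rcu_read_unlock();
			return -ENOENT;
		}
		int v = o->val;
		rcu_read_unlock();
		return v;
	}

	/* New style: guard(rcu)() drops the read lock when the scope exits */
	static int get_val_new(struct obj __rcu **slot)
	{
		guard(rcu)();
		struct obj *o = rcu_dereference(*slot);
		return o ? o->val : -ENOENT;
	}

The same scope-based idiom shows up for spinlocks in a few hunks as scoped_guard(spinlock, &lock) { ... }, which brackets the braced block with spin_lock()/spin_unlock().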
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 07709b0d7688..8cb2b9d5da96 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -103,6 +103,14 @@ config BCACHEFS_PATH_TRACEPOINTS Enable extra tracepoints for debugging btree_path operations; we don't normally want these enabled because they happen at very high rates. +config BCACHEFS_TRANS_KMALLOC_TRACE + bool "Trace bch2_trans_kmalloc() calls" + depends on BCACHEFS_FS + +config BCACHEFS_ASYNC_OBJECT_LISTS + bool "Keep async objects on fast_lists for debugfs visibility" + depends on BCACHEFS_FS && DEBUG_FS + config MEAN_AND_VARIANCE_UNIT_TEST tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 9af65079374f..93c8ee5425c8 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -35,11 +35,13 @@ bcachefs-y := \ disk_accounting.o \ disk_groups.o \ ec.o \ + enumerated_ref.o \ errcode.o \ error.o \ extents.o \ extent_update.o \ eytzinger.o \ + fast_list.o \ fs.o \ fs-ioctl.o \ fs-io.o \ @@ -97,6 +99,8 @@ bcachefs-y := \ varint.o \ xattr.o +bcachefs-$(CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS) += async_objs.o + obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o # Silence "note: xyz changed in GCC X.X" messages diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 94ea9e49aec4..b228a5a64479 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -17,10 +17,10 @@ #include "debug.h" #include "disk_accounting.h" #include "ec.h" +#include "enumerated_ref.h" #include "error.h" #include "lru.h" #include "recovery.h" -#include "trace.h" #include "varint.h" #include <linux/kthread.h> @@ -308,7 +308,8 @@ int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, "data type inconsistency"); bkey_fsck_err_on(!a.io_time[READ] && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, + !(c->recovery.passes_to_run & + BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs)), c, alloc_key_cached_but_read_time_zero, "cached bucket with read_time == 0"); break; @@ -335,11 +336,10 @@ void bch2_alloc_v4_swab(struct bkey_s k) a->stripe_sectors = swab32(a->stripe_sectors); } -void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +static inline void __bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, + unsigned dev, const struct bch_alloc_v4 *a) { - struct bch_alloc_v4 _a; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); - struct bch_dev *ca = c ? bch2_dev_bucket_tryget_noerror(c, k.k->p) : NULL; + struct bch_dev *ca = c ? 
bch2_dev_tryget_noerror(c, dev) : NULL; prt_newline(out); printbuf_indent_add(out, 2); @@ -367,6 +367,19 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c bch2_dev_put(ca); } +void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bch_alloc_v4 _a; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); + + __bch2_alloc_v4_to_text(out, c, k.k->p.inode, a); +} + +void bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + __bch2_alloc_v4_to_text(out, c, k.k->p.inode, bkey_s_c_to_alloc_v4(k).v); +} + void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) { if (k.k->type == KEY_TYPE_alloc_v4) { @@ -478,12 +491,27 @@ struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, enum btree_iter_update_trigger_flags flags) { struct btree_iter iter; - struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos); - int ret = PTR_ERR_OR_ZERO(a); - if (ret) + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, pos, + BTREE_ITER_with_updates| + BTREE_ITER_cached| + BTREE_ITER_intent); + int ret = bkey_err(k); + if (unlikely(ret)) return ERR_PTR(ret); - ret = bch2_trans_update(trans, &iter, &a->k_i, flags); + if ((void *) k.v >= trans->mem && + (void *) k.v < trans->mem + trans->mem_top) { + bch2_trans_iter_exit(trans, &iter); + return container_of(bkey_s_c_to_alloc_v4(k).v, struct bkey_i_alloc_v4, v); + } + + struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k); + if (IS_ERR(a)) { + bch2_trans_iter_exit(trans, &iter); + return a; + } + + ret = bch2_trans_update_ip(trans, &iter, &a->k_i, flags, _RET_IP_); bch2_trans_iter_exit(trans, &iter); return unlikely(ret) ? ERR_PTR(ret) : a; } @@ -680,8 +708,8 @@ static int __need_discard_or_freespace_err(struct btree_trans *trans, set ? 
"" : "un", bch2_btree_id_str(btree), buf.buf); - if (ret == -BCH_ERR_fsck_ignore || - ret == -BCH_ERR_fsck_errors_not_fixed) + if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) || + bch2_err_matches(ret, BCH_ERR_fsck_errors_not_fixed)) ret = 0; printbuf_exit(&buf); @@ -837,7 +865,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p); if (!ca) - return -BCH_ERR_trigger_alloc; + return bch_err_throw(c, trigger_alloc); struct bch_alloc_v4 old_a_convert; const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); @@ -913,15 +941,6 @@ int bch2_trigger_alloc(struct btree_trans *trans, goto err; } - if ((flags & BTREE_TRIGGER_bucket_invalidate) && - old_a->cached_sectors) { - ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx, - -((s64) old_a->cached_sectors), - flags & BTREE_TRIGGER_gc); - if (ret) - goto err; - } - ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags); if (ret) goto err; @@ -980,14 +999,11 @@ int bch2_trigger_alloc(struct btree_trans *trans, } if (new_a->gen != old_a->gen) { - rcu_read_lock(); + guard(rcu)(); u8 *gen = bucket_gen(ca, new.k->p.offset); - if (unlikely(!gen)) { - rcu_read_unlock(); + if (unlikely(!gen)) goto invalid_bucket; - } *gen = new_a->gen; - rcu_read_unlock(); } #define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) @@ -1013,15 +1029,12 @@ int bch2_trigger_alloc(struct btree_trans *trans, } if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) { - rcu_read_lock(); + guard(rcu)(); struct bucket *g = gc_bucket(ca, new.k->p.offset); - if (unlikely(!g)) { - rcu_read_unlock(); + if (unlikely(!g)) goto invalid_bucket; - } g->gen_valid = 1; g->gen = new_a->gen; - rcu_read_unlock(); } err: fsck_err: @@ -1031,7 +1044,7 @@ fsck_err: invalid_bucket: bch2_fs_inconsistent(c, "reference to invalid bucket\n%s", (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)); - ret = -BCH_ERR_trigger_alloc; + ret = bch_err_throw(c, trigger_alloc); goto err; } @@ -1097,13 +1110,12 @@ static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *buck bucket->offset = 0; } - rcu_read_lock(); + guard(rcu)(); *ca = __bch2_next_dev_idx(c, bucket->inode, NULL); if (*ca) { *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket); bch2_dev_get(*ca); } - rcu_read_unlock(); return *ca != NULL; } @@ -1381,7 +1393,7 @@ static void check_discard_freespace_key_work(struct work_struct *work) container_of(work, struct check_discard_freespace_key_async, work); bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos)); - bch2_write_ref_put(w->c, BCH_WRITE_REF_check_discard_freespace_key); + enumerated_ref_put(&w->c->writes, BCH_WRITE_REF_check_discard_freespace_key); kfree(w); } @@ -1446,7 +1458,7 @@ delete: ret = bch2_btree_bit_mod_iter(trans, iter, false) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_commit; + bch_err_throw(c, transaction_restart_commit); goto out; } else { /* @@ -1458,7 +1470,7 @@ delete: if (!w) goto out; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_check_discard_freespace_key)) { + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_check_discard_freespace_key)) { kfree(w); goto out; } @@ -1467,6 +1479,8 @@ delete: w->c = c; w->pos = BBPOS(iter->btree_id, iter->pos); queue_work(c->write_ref_wq, &w->work); + + ret = 1; /* don't allocate from this bucket */ goto out; } } @@ -1767,14 +1781,16 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) static int discard_in_flight_add(struct 
bch_dev *ca, u64 bucket, bool in_progress) { + struct bch_fs *c = ca->fs; int ret; mutex_lock(&ca->discard_buckets_in_flight_lock); - darray_for_each(ca->discard_buckets_in_flight, i) - if (i->bucket == bucket) { - ret = -BCH_ERR_EEXIST_discard_in_flight_add; - goto out; - } + struct discard_in_flight *i = + darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket); + if (i) { + ret = bch_err_throw(c, EEXIST_discard_in_flight_add); + goto out; + } ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) { .in_progress = in_progress, @@ -1788,14 +1804,11 @@ out: static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket) { mutex_lock(&ca->discard_buckets_in_flight_lock); - darray_for_each(ca->discard_buckets_in_flight, i) - if (i->bucket == bucket) { - BUG_ON(!i->in_progress); - darray_remove_item(&ca->discard_buckets_in_flight, i); - goto found; - } - BUG(); -found: + struct discard_in_flight *i = + darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket); + BUG_ON(!i || !i->in_progress); + + darray_remove_item(&ca->discard_buckets_in_flight, i); mutex_unlock(&ca->discard_buckets_in_flight_lock); } @@ -1806,19 +1819,6 @@ struct discard_buckets_state { u64 discarded; }; -/* - * This is needed because discard is both a filesystem option and a device - * option, and mount options are supposed to apply to that mount and not be - * persisted, i.e. if it's set as a mount option we can't propagate it to the - * device. - */ -static inline bool discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca) -{ - return test_bit(BCH_FS_discard_mount_opt_set, &c->flags) - ? c->opts.discard - : ca->mi.discard; -} - static int bch2_discard_one_bucket(struct btree_trans *trans, struct bch_dev *ca, struct btree_iter *need_discard_iter, @@ -1882,7 +1882,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, s->discarded++; *discard_pos_done = iter.pos; - if (discard_opt_enabled(c, ca) && !c->opts.nochanges) { + if (bch2_discard_opt_enabled(c, ca) && !c->opts.nochanges) { /* * This works without any other locks because this is the only * thread that removes items from the need_discard tree @@ -1952,26 +1952,26 @@ static void bch2_do_discards_work(struct work_struct *work) trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); - percpu_ref_put(&ca->io_ref[WRITE]); - bch2_write_ref_put(c, BCH_WRITE_REF_discard); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard); } void bch2_dev_do_discards(struct bch_dev *ca) { struct bch_fs *c = ca->fs; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard)) return; - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_dev_do_discards)) goto put_write_ref; if (queue_work(c->write_ref_wq, &ca->discard_work)) return; - percpu_ref_put(&ca->io_ref[WRITE]); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards); put_write_ref: - bch2_write_ref_put(c, BCH_WRITE_REF_discard); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard); } void bch2_do_discards(struct bch_fs *c) @@ -2047,8 +2047,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work) trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); bch2_trans_put(trans); - percpu_ref_put(&ca->io_ref[WRITE]); - bch2_write_ref_put(c, 
BCH_WRITE_REF_discard_fast); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast); } static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) @@ -2058,18 +2058,18 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) if (discard_in_flight_add(ca, bucket, false)) return; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard_fast)) return; - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_discard_one_bucket_fast)) goto put_ref; if (queue_work(c->write_ref_wq, &ca->discard_fast_work)) return; - percpu_ref_put(&ca->io_ref[WRITE]); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast); put_ref: - bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast); } static int invalidate_one_bp(struct btree_trans *trans, @@ -2180,8 +2180,11 @@ static int invalidate_one_bucket(struct btree_trans *trans, BUG_ON(a->data_type != BCH_DATA_cached); BUG_ON(a->dirty_sectors); - if (!a->cached_sectors) - bch_err(c, "invalidating empty bucket, confused"); + if (!a->cached_sectors) { + bch2_check_bucket_backpointer_mismatch(trans, ca, bucket.offset, + true, last_flushed); + goto out; + } unsigned cached_sectors = a->cached_sectors; u8 gen = a->gen; @@ -2261,27 +2264,27 @@ restart_err: bch2_trans_iter_exit(trans, &iter); err: bch2_trans_put(trans); - percpu_ref_put(&ca->io_ref[WRITE]); bch2_bkey_buf_exit(&last_flushed, c); - bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate); } void bch2_dev_do_invalidates(struct bch_dev *ca) { struct bch_fs *c = ca->fs; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_invalidate)) return; - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_do_invalidates)) goto put_ref; if (queue_work(c->write_ref_wq, &ca->invalidate_work)) return; - percpu_ref_put(&ca->io_ref[WRITE]); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates); put_ref: - bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate); } void bch2_do_invalidates(struct bch_fs *c) @@ -2392,14 +2395,16 @@ bkey_err: int bch2_fs_freespace_init(struct bch_fs *c) { - int ret = 0; - bool doing_init = false; + if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) + return 0; + /* * We can crash during the device add path, so we need to check this on * every mount: */ + bool doing_init = false; for_each_member_device(c, ca) { if (ca->mi.freespace_initialized) continue; @@ -2409,7 +2414,7 @@ int bch2_fs_freespace_init(struct bch_fs *c) doing_init = true; } - ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); + int ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); if (ret) { bch2_dev_put(ca); bch_err_fn(c, ret); @@ -2439,8 +2444,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) * We clear the LRU and need_discard btrees first so that we don't race * with bch2_do_invalidates() and bch2_do_discards() */ - ret = bch2_dev_remove_stripes(c, ca->dev_idx) ?: - bch2_btree_delete_range(c, BTREE_ID_lru, start, end, + ret = 
bch2_btree_delete_range(c, BTREE_ID_lru, start, end, BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, BTREE_TRIGGER_norun, NULL) ?: @@ -2503,15 +2507,15 @@ void bch2_recalc_capacity(struct bch_fs *c) lockdep_assert_held(&c->state_lock); - for_each_online_member(c, ca) { - struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; - - ra_pages += bdi->ra_pages; - } + guard(rcu)(); + for_each_member_device_rcu(c, ca, NULL) { + struct block_device *bdev = READ_ONCE(ca->disk_sb.bdev); + if (bdev) + ra_pages += bdev->bd_disk->bdi->ra_pages; - bch2_set_ra_pages(c, ra_pages); + if (ca->mi.state != BCH_MEMBER_STATE_rw) + continue; - __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) { u64 dev_reserve = 0; /* @@ -2549,6 +2553,8 @@ void bch2_recalc_capacity(struct bch_fs *c) ca->mi.bucket_size); } + bch2_set_ra_pages(c, ra_pages); + gc_reserve = c->opts.gc_reserve_bytes ? c->opts.gc_reserve_bytes >> 9 : div64_u64(capacity * c->opts.gc_reserve_percent, 100); @@ -2570,7 +2576,8 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *c) { u64 ret = U64_MAX; - for_each_rw_member(c, ca) + guard(rcu)(); + for_each_rw_member_rcu(c, ca) ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size); return ret; } @@ -2578,19 +2585,31 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *c) static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) { struct open_bucket *ob; - bool ret = false; for (ob = c->open_buckets; ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { - spin_lock(&ob->lock); - if (ob->valid && !ob->on_partial_list && - ob->dev == ca->dev_idx) - ret = true; - spin_unlock(&ob->lock); + scoped_guard(spinlock, &ob->lock) { + if (ob->valid && !ob->on_partial_list && + ob->dev == ca->dev_idx) + return true; + } } - return ret; + return false; +} + +void bch2_dev_allocator_set_rw(struct bch_fs *c, struct bch_dev *ca, bool rw) +{ + /* BCH_DATA_free == all rw devs */ + + for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) + if (rw && + (i == BCH_DATA_free || + (ca->mi.data_allowed & BIT(i)))) + set_bit(ca->dev_idx, c->rw_devs[i].d); + else + clear_bit(ca->dev_idx, c->rw_devs[i].d); } /* device goes ro: */ @@ -2599,9 +2618,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) lockdep_assert_held(&c->state_lock); /* First, remove device from allocation groups: */ - - for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) - clear_bit(ca->dev_idx, c->rw_devs[i].d); + bch2_dev_allocator_set_rw(c, ca, false); c->rw_devs_change_count++; @@ -2635,10 +2652,7 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) { lockdep_assert_held(&c->state_lock); - for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) - if (ca->mi.data_allowed & (1 << i)) - set_bit(ca->dev_idx, c->rw_devs[i].d); - + bch2_dev_allocator_set_rw(c, ca, true); c->rw_devs_change_count++; } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 34b3d6ac4fbb..0cc5adc55b6f 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -13,11 +13,9 @@ static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) { - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); - bool ret = ca && bucket_valid(ca, pos.offset); - rcu_read_unlock(); - return ret; + return ca && bucket_valid(ca, pos.offset); } static inline u64 bucket_to_u64(struct bpos bucket) @@ -253,6 +251,7 @@ int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, struct 
bkey_validate_context); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_alloc_v4_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc ((struct bkey_ops) { \ .key_validate = bch2_alloc_v1_validate, \ @@ -277,7 +276,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \ .key_validate = bch2_alloc_v4_validate, \ - .val_to_text = bch2_alloc_to_text, \ + .val_to_text = bch2_alloc_v4_to_text, \ .swab = bch2_alloc_v4_swab, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 48, \ @@ -350,6 +349,7 @@ int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *); void bch2_recalc_capacity(struct bch_fs *); u64 bch2_min_rw_member_capacity(struct bch_fs *); +void bch2_dev_allocator_set_rw(struct bch_fs *, struct bch_dev *, bool); void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 7ec022e9361a..b375ad610acd 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -69,10 +69,9 @@ const char * const bch2_watermarks[] = { void bch2_reset_alloc_cursors(struct bch_fs *c) { - rcu_read_lock(); + guard(rcu)(); for_each_member_device_rcu(c, ca, NULL) memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor)); - rcu_read_unlock(); } static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob) @@ -154,7 +153,7 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) { - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs) + if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_trans_mark_dev_sbs)) return false; return bch2_is_superblock_bucket(ca, b); @@ -166,9 +165,8 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) ARRAY_SIZE(c->open_buckets_partial)); spin_lock(&c->freelist_lock); - rcu_read_lock(); - bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++; - rcu_read_unlock(); + scoped_guard(rcu) + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++; ob->on_partial_list = true; c->open_buckets_partial[c->open_buckets_partial_nr++] = @@ -180,11 +178,11 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) } static inline bool may_alloc_bucket(struct bch_fs *c, - struct bpos bucket, - struct bucket_alloc_state *s) + struct alloc_request *req, + struct bpos bucket) { if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) { - s->skipped_open++; + req->counters.skipped_open++; return false; } @@ -193,48 +191,49 @@ static inline bool may_alloc_bucket(struct bch_fs *c, bucket.inode, bucket.offset); if (journal_seq_ready > c->journal.flushed_seq_ondisk) { if (journal_seq_ready > c->journal.flushing_seq) - s->need_journal_commit++; - s->skipped_need_journal_commit++; + req->counters.need_journal_commit++; + req->counters.skipped_need_journal_commit++; return false; } if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) { - s->skipped_nocow++; + req->counters.skipped_nocow++; return false; } return true; } -static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, +static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, + struct alloc_request *req, u64 bucket, u8 gen, - enum bch_watermark watermark, - struct 
bucket_alloc_state *s, struct closure *cl) { + struct bch_dev *ca = req->ca; + if (unlikely(is_superblock_bucket(c, ca, bucket))) return NULL; if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { - s->skipped_nouse++; + req->counters.skipped_nouse++; return NULL; } spin_lock(&c->freelist_lock); - if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) { + if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(req->watermark))) { if (cl) closure_wait(&c->open_buckets_wait, cl); track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true); spin_unlock(&c->freelist_lock); - return ERR_PTR(-BCH_ERR_open_buckets_empty); + return ERR_PTR(bch_err_throw(c, open_buckets_empty)); } /* Recheck under lock: */ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { spin_unlock(&c->freelist_lock); - s->skipped_open++; + req->counters.skipped_open++; return NULL; } @@ -258,16 +257,15 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * return ob; } -static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, - enum bch_watermark watermark, - struct bucket_alloc_state *s, +static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, + struct alloc_request *req, struct btree_iter *freespace_iter, struct closure *cl) { struct bch_fs *c = trans->c; u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); - if (!may_alloc_bucket(c, POS(ca->dev_idx, b), s)) + if (!may_alloc_bucket(c, req, POS(req->ca->dev_idx, b))) return NULL; u8 gen; @@ -277,7 +275,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc if (ret) return NULL; - return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl); + return __try_alloc_bucket(c, req, b, gen, cl); } /* @@ -285,17 +283,16 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc */ static noinline struct open_bucket * bch2_bucket_alloc_early(struct btree_trans *trans, - struct bch_dev *ca, - enum bch_watermark watermark, - struct bucket_alloc_state *s, + struct alloc_request *req, struct closure *cl) { struct bch_fs *c = trans->c; + struct bch_dev *ca = req->ca; struct btree_iter iter, citer; struct bkey_s_c k, ck; struct open_bucket *ob = NULL; u64 first_bucket = ca->mi.first_bucket; - u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; + u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap]; u64 alloc_start = max(first_bucket, *dev_alloc_cursor); u64 alloc_cursor = alloc_start; int ret; @@ -317,10 +314,10 @@ again: if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) break; - if (s->btree_bitmap != BTREE_BITMAP_ANY && - s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, + if (req->btree_bitmap != BTREE_BITMAP_ANY && + req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { - if (s->btree_bitmap == BTREE_BITMAP_YES && + if (req->btree_bitmap == BTREE_BITMAP_YES && bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) break; @@ -328,8 +325,8 @@ again: round_up(bucket_to_sector(ca, bucket) + 1, 1ULL << ca->mi.btree_bitmap_shift)); bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, bucket)); - s->buckets_seen++; - s->skipped_mi_btree_bitmap++; + req->counters.buckets_seen++; + req->counters.skipped_mi_btree_bitmap++; continue; } @@ -348,11 +345,10 @@ again: if (a->data_type != BCH_DATA_free) goto next; - s->buckets_seen++; + req->counters.buckets_seen++; - ob = may_alloc_bucket(c, 
k.k->p, s) - ? __try_alloc_bucket(c, ca, k.k->p.offset, a->gen, - watermark, s, cl) + ob = may_alloc_bucket(c, req, k.k->p) + ? __try_alloc_bucket(c, req, k.k->p.offset, a->gen, cl) : NULL; next: bch2_set_btree_iter_dontneed(trans, &citer); @@ -378,15 +374,14 @@ next: } static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, - struct bch_dev *ca, - enum bch_watermark watermark, - struct bucket_alloc_state *s, - struct closure *cl) + struct alloc_request *req, + struct closure *cl) { + struct bch_dev *ca = req->ca; struct btree_iter iter; struct bkey_s_c k; struct open_bucket *ob = NULL; - u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; + u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap]; u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor)); u64 alloc_cursor = alloc_start; int ret; @@ -402,13 +397,13 @@ again: iter.k.size = iter.k.p.offset - iter.pos.offset; while (iter.k.size) { - s->buckets_seen++; + req->counters.buckets_seen++; u64 bucket = iter.pos.offset & ~(~0ULL << 56); - if (s->btree_bitmap != BTREE_BITMAP_ANY && - s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, + if (req->btree_bitmap != BTREE_BITMAP_ANY && + req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { - if (s->btree_bitmap == BTREE_BITMAP_YES && + if (req->btree_bitmap == BTREE_BITMAP_YES && bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) goto fail; @@ -418,11 +413,11 @@ again: alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56)); bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, alloc_cursor)); - s->skipped_mi_btree_bitmap++; + req->counters.skipped_mi_btree_bitmap++; goto next; } - ob = try_alloc_bucket(trans, ca, watermark, s, &iter, cl); + ob = try_alloc_bucket(trans, req, &iter, cl); if (ob) { if (!IS_ERR(ob)) *dev_alloc_cursor = iter.pos.offset; @@ -453,33 +448,30 @@ fail: return ob; } -static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, - enum bch_watermark watermark, - enum bch_data_type data_type, +static noinline void trace_bucket_alloc2(struct bch_fs *c, + struct alloc_request *req, struct closure *cl, - struct bch_dev_usage *usage, - struct bucket_alloc_state *s, struct open_bucket *ob) { struct printbuf buf = PRINTBUF; printbuf_tabstop_push(&buf, 24); - prt_printf(&buf, "dev\t%s (%u)\n", ca->name, ca->dev_idx); - prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]); - prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]); + prt_printf(&buf, "dev\t%s (%u)\n", req->ca->name, req->ca->dev_idx); + prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[req->watermark]); + prt_printf(&buf, "data type\t%s\n", __bch2_data_types[req->data_type]); prt_printf(&buf, "blocking\t%u\n", cl != NULL); - prt_printf(&buf, "free\t%llu\n", usage->buckets[BCH_DATA_free]); - prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, *usage, watermark)); - prt_printf(&buf, "copygc_wait\t%lu/%lli\n", + prt_printf(&buf, "free\t%llu\n", req->usage.buckets[BCH_DATA_free]); + prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(req->ca, req->usage, req->watermark)); + prt_printf(&buf, "copygc_wait\t%llu/%lli\n", bch2_copygc_wait_amount(c), c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)); - prt_printf(&buf, "seen\t%llu\n", s->buckets_seen); - prt_printf(&buf, "open\t%llu\n", s->skipped_open); - prt_printf(&buf, "need journal commit\t%llu\n", s->skipped_need_journal_commit); - prt_printf(&buf, "nocow\t%llu\n", 
s->skipped_nocow); - prt_printf(&buf, "nouse\t%llu\n", s->skipped_nouse); - prt_printf(&buf, "mi_btree_bitmap\t%llu\n", s->skipped_mi_btree_bitmap); + prt_printf(&buf, "seen\t%llu\n", req->counters.buckets_seen); + prt_printf(&buf, "open\t%llu\n", req->counters.skipped_open); + prt_printf(&buf, "need journal commit\t%llu\n", req->counters.skipped_need_journal_commit); + prt_printf(&buf, "nocow\t%llu\n", req->counters.skipped_nocow); + prt_printf(&buf, "nouse\t%llu\n", req->counters.skipped_nouse); + prt_printf(&buf, "mi_btree_bitmap\t%llu\n", req->counters.skipped_mi_btree_bitmap); if (!IS_ERR(ob)) { prt_printf(&buf, "allocated\t%llu\n", ob->bucket); @@ -495,47 +487,42 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, /** * bch2_bucket_alloc_trans - allocate a single bucket from a specific device * @trans: transaction object - * @ca: device to allocate from - * @watermark: how important is this allocation? - * @data_type: BCH_DATA_journal, btree, user... + * @req: state for the entire allocation * @cl: if not NULL, closure to be used to wait if buckets not available * @nowait: if true, do not wait for buckets to become available - * @usage: for secondarily also returning the current device usage * * Returns: an open_bucket on success, or an ERR_PTR() on failure. */ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, - struct bch_dev *ca, - enum bch_watermark watermark, - enum bch_data_type data_type, - struct closure *cl, - bool nowait, - struct bch_dev_usage *usage) + struct alloc_request *req, + struct closure *cl, + bool nowait) { struct bch_fs *c = trans->c; + struct bch_dev *ca = req->ca; struct open_bucket *ob = NULL; bool freespace = READ_ONCE(ca->mi.freespace_initialized); u64 avail; - struct bucket_alloc_state s = { - .btree_bitmap = data_type == BCH_DATA_btree, - }; bool waiting = nowait; + + req->btree_bitmap = req->data_type == BCH_DATA_btree; + memset(&req->counters, 0, sizeof(req->counters)); again: - bch2_dev_usage_read_fast(ca, usage); - avail = dev_buckets_free(ca, *usage, watermark); + bch2_dev_usage_read_fast(ca, &req->usage); + avail = dev_buckets_free(ca, req->usage, req->watermark); - if (usage->buckets[BCH_DATA_need_discard] > avail) + if (req->usage.buckets[BCH_DATA_need_discard] > avail) bch2_dev_do_discards(ca); - if (usage->buckets[BCH_DATA_need_gc_gens] > avail) + if (req->usage.buckets[BCH_DATA_need_gc_gens] > avail) bch2_gc_gens_async(c); - if (should_invalidate_buckets(ca, *usage)) + if (should_invalidate_buckets(ca, req->usage)) bch2_dev_do_invalidates(ca); if (!avail) { - if (watermark > BCH_WATERMARK_normal && - c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) + if (req->watermark > BCH_WATERMARK_normal && + c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations) goto alloc; if (cl && !waiting) { @@ -546,7 +533,7 @@ again: track_event_change(&c->times[BCH_TIME_blocked_allocate], true); - ob = ERR_PTR(-BCH_ERR_freelist_empty); + ob = ERR_PTR(bch_err_throw(c, freelist_empty)); goto err; } @@ -554,27 +541,27 @@ again: closure_wake_up(&c->freelist_wait); alloc: ob = likely(freespace) - ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl) - : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl); + ? 
bch2_bucket_alloc_freelist(trans, req, cl) + : bch2_bucket_alloc_early(trans, req, cl); - if (s.need_journal_commit * 2 > avail) + if (req->counters.need_journal_commit * 2 > avail) bch2_journal_flush_async(&c->journal, NULL); - if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) { - s.btree_bitmap = BTREE_BITMAP_ANY; + if (!ob && req->btree_bitmap != BTREE_BITMAP_ANY) { + req->btree_bitmap = BTREE_BITMAP_ANY; goto alloc; } - if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { + if (!ob && freespace && c->recovery.pass_done < BCH_RECOVERY_PASS_check_alloc_info) { freespace = false; goto alloc; } err: if (!ob) - ob = ERR_PTR(-BCH_ERR_no_buckets_found); + ob = ERR_PTR(bch_err_throw(c, no_buckets_found)); if (!IS_ERR(ob)) - ob->data_type = data_type; + ob->data_type = req->data_type; if (!IS_ERR(ob)) count_event(c, bucket_alloc); @@ -584,7 +571,7 @@ err: if (!IS_ERR(ob) ? trace_bucket_alloc_enabled() : trace_bucket_alloc_fail_enabled()) - trace_bucket_alloc2(c, ca, watermark, data_type, cl, usage, &s, ob); + trace_bucket_alloc2(c, req, cl, ob); return ob; } @@ -594,12 +581,15 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, enum bch_data_type data_type, struct closure *cl) { - struct bch_dev_usage usage; struct open_bucket *ob; + struct alloc_request req = { + .watermark = watermark, + .data_type = data_type, + .ca = ca, + }; bch2_trans_do(c, - PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark, - data_type, cl, false, &usage))); + PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, cl, false))); return ob; } @@ -611,18 +601,18 @@ static int __dev_stripe_cmp(struct dev_stripe_state *stripe, #define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) -struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, - struct dev_stripe_state *stripe, - struct bch_devs_mask *devs) +void bch2_dev_alloc_list(struct bch_fs *c, + struct dev_stripe_state *stripe, + struct bch_devs_mask *devs, + struct dev_alloc_list *ret) { - struct dev_alloc_list ret = { .nr = 0 }; - unsigned i; + ret->nr = 0; + unsigned i; for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) - ret.data[ret.nr++] = i; + ret->data[ret->nr++] = i; - bubble_sort(ret.data, ret.nr, dev_stripe_cmp); - return ret; + bubble_sort(ret->data, ret->nr, dev_stripe_cmp); } static const u64 stripe_clock_hand_rescale = 1ULL << 62; /* trigger rescale at */ @@ -693,64 +683,53 @@ void bch2_dev_stripe_increment(struct bch_dev *ca, } static int add_new_bucket(struct bch_fs *c, - struct open_buckets *ptrs, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - struct open_bucket *ob) + struct alloc_request *req, + struct open_bucket *ob) { unsigned durability = ob_dev(c, ob)->mi.durability; - BUG_ON(*nr_effective >= nr_replicas); + BUG_ON(req->nr_effective >= req->nr_replicas); - __clear_bit(ob->dev, devs_may_alloc->d); - *nr_effective += durability; - *have_cache |= !durability; + __clear_bit(ob->dev, req->devs_may_alloc.d); + req->nr_effective += durability; + req->have_cache |= !durability; - ob_push(c, ptrs, ob); + ob_push(c, &req->ptrs, ob); - if (*nr_effective >= nr_replicas) + if (req->nr_effective >= req->nr_replicas) return 1; if (ob->ec) return 1; return 0; } -int bch2_bucket_alloc_set_trans(struct btree_trans *trans, - struct open_buckets *ptrs, - struct dev_stripe_state *stripe, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_write_flags flags, 
- enum bch_data_type data_type, - enum bch_watermark watermark, - struct closure *cl) +inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans, + struct alloc_request *req, + struct dev_stripe_state *stripe, + struct closure *cl) { struct bch_fs *c = trans->c; - int ret = -BCH_ERR_insufficient_devices; + int ret = 0; + + BUG_ON(req->nr_effective >= req->nr_replicas); - BUG_ON(*nr_effective >= nr_replicas); + bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc, &req->devs_sorted); - struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); - darray_for_each(devs_sorted, i) { - struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i); - if (!ca) + darray_for_each(req->devs_sorted, i) { + req->ca = bch2_dev_tryget_noerror(c, *i); + if (!req->ca) continue; - if (!ca->mi.durability && *have_cache) { - bch2_dev_put(ca); + if (!req->ca->mi.durability && req->have_cache) { + bch2_dev_put(req->ca); continue; } - struct bch_dev_usage usage; - struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, - cl, flags & BCH_WRITE_alloc_nowait, &usage); + struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, cl, + req->flags & BCH_WRITE_alloc_nowait); if (!IS_ERR(ob)) - bch2_dev_stripe_increment_inlined(ca, stripe, &usage); - bch2_dev_put(ca); + bch2_dev_stripe_increment_inlined(req->ca, stripe, &req->usage); + bch2_dev_put(req->ca); if (IS_ERR(ob)) { ret = PTR_ERR(ob); @@ -759,15 +738,16 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - if (add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob)) { - ret = 0; + ret = add_new_bucket(c, req, ob); + if (ret) break; - } } - return ret; + if (ret == 1) + return 0; + if (ret) + return ret; + return bch_err_throw(c, insufficient_devices); } /* Allocate from stripes: */ @@ -779,35 +759,28 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, */ static int bucket_alloc_from_stripe(struct btree_trans *trans, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - u16 target, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_watermark watermark, - enum bch_write_flags flags, - struct closure *cl) + struct alloc_request *req, + struct closure *cl) { struct bch_fs *c = trans->c; int ret = 0; - if (nr_replicas < 2) + if (req->nr_replicas < 2) return 0; - if (ec_open_bucket(c, ptrs)) + if (ec_open_bucket(c, &req->ptrs)) return 0; struct ec_stripe_head *h = - bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); + bch2_ec_stripe_head_get(trans, req, 0, cl); if (IS_ERR(h)) return PTR_ERR(h); if (!h) return 0; - struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); - darray_for_each(devs_sorted, i) + bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc, &req->devs_sorted); + + darray_for_each(req->devs_sorted, i) for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { if (!h->s->blocks[ec_idx]) continue; @@ -818,9 +791,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, ob->ec = h->s; ec_stripe_new_get(h->s, STRIPE_REF_io); - ret = add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob); + ret = add_new_bucket(c, req, ob); goto out; } } @@ -832,65 +803,49 @@ out: /* Sector allocator */ static bool want_bucket(struct bch_fs *c, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - bool *have_cache, bool ec, + struct alloc_request *req, struct 
open_bucket *ob) { struct bch_dev *ca = ob_dev(c, ob); - if (!test_bit(ob->dev, devs_may_alloc->d)) + if (!test_bit(ob->dev, req->devs_may_alloc.d)) return false; - if (ob->data_type != wp->data_type) + if (ob->data_type != req->wp->data_type) return false; if (!ca->mi.durability && - (wp->data_type == BCH_DATA_btree || ec || *have_cache)) + (req->wp->data_type == BCH_DATA_btree || req->ec || req->have_cache)) return false; - if (ec != (ob->ec != NULL)) + if (req->ec != (ob->ec != NULL)) return false; return true; } static int bucket_alloc_set_writepoint(struct bch_fs *c, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - bool ec) + struct alloc_request *req) { - struct open_buckets ptrs_skip = { .nr = 0 }; struct open_bucket *ob; unsigned i; int ret = 0; - open_bucket_for_each(c, &wp->ptrs, ob, i) { - if (!ret && want_bucket(c, wp, devs_may_alloc, - have_cache, ec, ob)) - ret = add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob); + req->scratch_ptrs.nr = 0; + + open_bucket_for_each(c, &req->wp->ptrs, ob, i) { + if (!ret && want_bucket(c, req, ob)) + ret = add_new_bucket(c, req, ob); else - ob_push(c, &ptrs_skip, ob); + ob_push(c, &req->scratch_ptrs, ob); } - wp->ptrs = ptrs_skip; + req->wp->ptrs = req->scratch_ptrs; return ret; } static int bucket_alloc_set_partial(struct bch_fs *c, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, bool ec, - enum bch_watermark watermark) + struct alloc_request *req) { int i, ret = 0; @@ -905,13 +860,12 @@ static int bucket_alloc_set_partial(struct bch_fs *c, for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) { struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i]; - if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) { + if (want_bucket(c, req, ob)) { struct bch_dev *ca = ob_dev(c, ob); - struct bch_dev_usage usage; u64 avail; - bch2_dev_usage_read_fast(ca, &usage); - avail = dev_buckets_free(ca, usage, watermark) + ca->nr_partial_buckets; + bch2_dev_usage_read_fast(ca, &req->usage); + avail = dev_buckets_free(ca, req->usage, req->watermark) + ca->nr_partial_buckets; if (!avail) continue; @@ -920,13 +874,10 @@ static int bucket_alloc_set_partial(struct bch_fs *c, i); ob->on_partial_list = false; - rcu_read_lock(); - bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; - rcu_read_unlock(); + scoped_guard(rcu) + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; - ret = add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob); + ret = add_new_bucket(c, req, ob); if (ret) break; } @@ -937,61 +888,41 @@ unlock: } static int __open_bucket_add_buckets(struct btree_trans *trans, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_list *devs_have, - u16 target, - bool erasure_code, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_watermark watermark, - enum bch_write_flags flags, - struct closure *_cl) + struct alloc_request *req, + struct closure *_cl) { struct bch_fs *c = trans->c; - struct bch_devs_mask devs; struct open_bucket *ob; struct closure *cl = NULL; unsigned i; int ret; - devs = target_rw_devs(c, wp->data_type, target); + req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target); /* Don't allocate from devices we already have pointers to: */ - darray_for_each(*devs_have, i) 
- __clear_bit(*i, devs.d); + darray_for_each(*req->devs_have, i) + __clear_bit(*i, req->devs_may_alloc.d); - open_bucket_for_each(c, ptrs, ob, i) - __clear_bit(ob->dev, devs.d); + open_bucket_for_each(c, &req->ptrs, ob, i) + __clear_bit(ob->dev, req->devs_may_alloc.d); - ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs, - nr_replicas, nr_effective, - have_cache, erasure_code); + ret = bucket_alloc_set_writepoint(c, req); if (ret) return ret; - ret = bucket_alloc_set_partial(c, ptrs, wp, &devs, - nr_replicas, nr_effective, - have_cache, erasure_code, watermark); + ret = bucket_alloc_set_partial(c, req); if (ret) return ret; - if (erasure_code) { - ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs, - target, - nr_replicas, nr_effective, - have_cache, - watermark, flags, _cl); + if (req->ec) { + ret = bucket_alloc_from_stripe(trans, req, _cl); } else { retry_blocking: /* * Try nonblocking first, so that if one device is full we'll try from * other devices: */ - ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, - nr_replicas, nr_effective, have_cache, - flags, wp->data_type, watermark, cl); + ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, cl); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && @@ -1005,38 +936,27 @@ retry_blocking: } static int open_bucket_add_buckets(struct btree_trans *trans, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_list *devs_have, - u16 target, - unsigned erasure_code, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_watermark watermark, - enum bch_write_flags flags, - struct closure *cl) + struct alloc_request *req, + struct closure *cl) { int ret; - if (erasure_code && !ec_open_bucket(trans->c, ptrs)) { - ret = __open_bucket_add_buckets(trans, ptrs, wp, - devs_have, target, erasure_code, - nr_replicas, nr_effective, have_cache, - watermark, flags, cl); + if (req->ec && !ec_open_bucket(trans->c, &req->ptrs)) { + ret = __open_bucket_add_buckets(trans, req, cl); if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || bch2_err_matches(ret, BCH_ERR_operation_blocked) || bch2_err_matches(ret, BCH_ERR_freelist_empty) || bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) return ret; - if (*nr_effective >= nr_replicas) + if (req->nr_effective >= req->nr_replicas) return 0; } - ret = __open_bucket_add_buckets(trans, ptrs, wp, - devs_have, target, false, - nr_replicas, nr_effective, have_cache, - watermark, flags, cl); + bool ec = false; + swap(ec, req->ec); + ret = __open_bucket_add_buckets(trans, req, cl); + swap(ec, req->ec); + return ret < 0 ? 
ret : 0; } @@ -1137,9 +1057,8 @@ void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca, ob->on_partial_list = false; - rcu_read_lock(); - bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; - rcu_read_unlock(); + scoped_guard(rcu) + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; spin_unlock(&c->freelist_lock); bch2_open_bucket_put(c, ob); @@ -1167,14 +1086,11 @@ static struct write_point *__writepoint_find(struct hlist_head *head, { struct write_point *wp; - rcu_read_lock(); + guard(rcu)(); hlist_for_each_entry_rcu(wp, head, node) if (wp->write_point == write_point) - goto out; - wp = NULL; -out: - rcu_read_unlock(); - return wp; + return wp; + return NULL; } static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) @@ -1185,7 +1101,7 @@ static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) return stranded * factor > free; } -static bool try_increase_writepoints(struct bch_fs *c) +static noinline bool try_increase_writepoints(struct bch_fs *c) { struct write_point *wp; @@ -1198,7 +1114,7 @@ static bool try_increase_writepoints(struct bch_fs *c) return true; } -static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr) +static noinline bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr) { struct bch_fs *c = trans->c; struct write_point *wp; @@ -1289,26 +1205,26 @@ out: static noinline void deallocate_extra_replicas(struct bch_fs *c, - struct open_buckets *ptrs, - struct open_buckets *ptrs_no_use, - unsigned extra_replicas) + struct alloc_request *req) { - struct open_buckets ptrs2 = { 0 }; struct open_bucket *ob; + unsigned extra_replicas = req->nr_effective - req->nr_replicas; unsigned i; - open_bucket_for_each(c, ptrs, ob, i) { + req->scratch_ptrs.nr = 0; + + open_bucket_for_each(c, &req->ptrs, ob, i) { unsigned d = ob_dev(c, ob)->mi.durability; if (d && d <= extra_replicas) { extra_replicas -= d; - ob_push(c, ptrs_no_use, ob); + ob_push(c, &req->wp->ptrs, ob); } else { - ob_push(c, &ptrs2, ob); + ob_push(c, &req->scratch_ptrs, ob); } } - *ptrs = ptrs2; + req->ptrs = req->scratch_ptrs; } /* @@ -1327,51 +1243,53 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, struct write_point **wp_ret) { struct bch_fs *c = trans->c; - struct write_point *wp; struct open_bucket *ob; - struct open_buckets ptrs; - unsigned nr_effective, write_points_nr; - bool have_cache; - int ret; + unsigned write_points_nr; int i; + struct alloc_request *req = bch2_trans_kmalloc_nomemzero(trans, sizeof(*req)); + int ret = PTR_ERR_OR_ZERO(req); + if (unlikely(ret)) + return ret; + if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING)) erasure_code = false; + req->nr_replicas = nr_replicas; + req->target = target; + req->ec = erasure_code; + req->watermark = watermark; + req->flags = flags; + req->devs_have = devs_have; + BUG_ON(!nr_replicas || !nr_replicas_required); retry: - ptrs.nr = 0; - nr_effective = 0; - write_points_nr = c->write_points_nr; - have_cache = false; + req->ptrs.nr = 0; + req->nr_effective = 0; + req->have_cache = false; + write_points_nr = c->write_points_nr; + + *wp_ret = req->wp = writepoint_find(trans, write_point.v); - *wp_ret = wp = writepoint_find(trans, write_point.v); + req->data_type = req->wp->data_type; ret = bch2_trans_relock(trans); if (ret) goto err; /* metadata may not allocate on cache devices: */ - if (wp->data_type != BCH_DATA_user) - have_cache = true; + if (req->data_type != BCH_DATA_user) + req->have_cache = true; if (target && !(flags & BCH_WRITE_only_specified_devs)) { - ret = 
open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - target, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, NULL); + ret = open_bucket_add_buckets(trans, req, NULL); if (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto alloc_done; /* Don't retry from all devices if we're out of open buckets: */ if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) { - int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - target, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, cl); + int ret2 = open_bucket_add_buckets(trans, req, cl); if (!ret2 || bch2_err_matches(ret2, BCH_ERR_transaction_restart) || bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) { @@ -1384,45 +1302,38 @@ retry: * Only try to allocate cache (durability = 0 devices) from the * specified target: */ - have_cache = true; + req->have_cache = true; + req->target = 0; - ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - 0, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, cl); + ret = open_bucket_add_buckets(trans, req, cl); } else { - ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - target, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, cl); + ret = open_bucket_add_buckets(trans, req, cl); } alloc_done: - BUG_ON(!ret && nr_effective < nr_replicas); + BUG_ON(!ret && req->nr_effective < req->nr_replicas); - if (erasure_code && !ec_open_bucket(c, &ptrs)) + if (erasure_code && !ec_open_bucket(c, &req->ptrs)) pr_debug("failed to get ec bucket: ret %u", ret); if (ret == -BCH_ERR_insufficient_devices && - nr_effective >= nr_replicas_required) + req->nr_effective >= nr_replicas_required) ret = 0; if (ret) goto err; - if (nr_effective > nr_replicas) - deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas); + if (req->nr_effective > req->nr_replicas) + deallocate_extra_replicas(c, req); /* Free buckets we didn't use: */ - open_bucket_for_each(c, &wp->ptrs, ob, i) + open_bucket_for_each(c, &req->wp->ptrs, ob, i) open_bucket_free_unused(c, ob); - wp->ptrs = ptrs; + req->wp->ptrs = req->ptrs; - wp->sectors_free = UINT_MAX; + req->wp->sectors_free = UINT_MAX; - open_bucket_for_each(c, &wp->ptrs, ob, i) { + open_bucket_for_each(c, &req->wp->ptrs, ob, i) { /* * Ensure proper write alignment - either due to misaligned * bucket sizes (from buggy bcachefs-tools), or writes that mix @@ -1436,58 +1347,44 @@ alloc_done: ob->sectors_free = max_t(int, 0, ob->sectors_free - align); - wp->sectors_free = min(wp->sectors_free, ob->sectors_free); + req->wp->sectors_free = min(req->wp->sectors_free, ob->sectors_free); } - wp->sectors_free = rounddown(wp->sectors_free, block_sectors(c)); + req->wp->sectors_free = rounddown(req->wp->sectors_free, block_sectors(c)); /* Did alignment use up space in an open_bucket? 
*/ - if (unlikely(!wp->sectors_free)) { - bch2_alloc_sectors_done(c, wp); + if (unlikely(!req->wp->sectors_free)) { + bch2_alloc_sectors_done(c, req->wp); goto retry; } - BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); + BUG_ON(!req->wp->sectors_free || req->wp->sectors_free == UINT_MAX); return 0; err: - open_bucket_for_each(c, &wp->ptrs, ob, i) - if (ptrs.nr < ARRAY_SIZE(ptrs.v)) - ob_push(c, &ptrs, ob); + open_bucket_for_each(c, &req->wp->ptrs, ob, i) + if (req->ptrs.nr < ARRAY_SIZE(req->ptrs.v)) + ob_push(c, &req->ptrs, ob); else open_bucket_free_unused(c, ob); - wp->ptrs = ptrs; + req->wp->ptrs = req->ptrs; - mutex_unlock(&wp->lock); + mutex_unlock(&req->wp->lock); if (bch2_err_matches(ret, BCH_ERR_freelist_empty) && try_decrease_writepoints(trans, write_points_nr)) goto retry; if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) - ret = -BCH_ERR_bucket_alloc_blocked; + ret = bch_err_throw(c, bucket_alloc_blocked); if (cl && !(flags & BCH_WRITE_alloc_nowait) && bch2_err_matches(ret, BCH_ERR_freelist_empty)) - ret = -BCH_ERR_bucket_alloc_blocked; + ret = bch_err_throw(c, bucket_alloc_blocked); return ret; } -struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) -{ - struct bch_dev *ca = ob_dev(c, ob); - - return (struct bch_extent_ptr) { - .type = 1 << BCH_EXTENT_ENTRY_ptr, - .gen = ob->gen, - .dev = ob->dev, - .offset = bucket_to_sector(ca, ob->bucket) + - ca->mi.bucket_size - - ob->sectors_free, - }; -} - void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, struct bkey_i *k, unsigned sectors, bool cached) @@ -1617,6 +1514,8 @@ static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob; unsigned i; + mutex_lock(&wp->lock); + prt_printf(out, "%lu: ", wp->write_point); prt_human_readable_u64(out, wp->sectors_allocated << 9); @@ -1634,6 +1533,8 @@ static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c, open_bucket_for_each(c, &wp->ptrs, ob, i) bch2_open_bucket_to_text(out, c, ob); printbuf_indent_sub(out, 2); + + mutex_unlock(&wp->lock); } void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) @@ -1731,13 +1632,18 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c) printbuf_indent_sub(&buf, 2); prt_newline(&buf); - for_each_online_member(c, ca) { - prt_printf(&buf, "Dev %u:\n", ca->dev_idx); - printbuf_indent_add(&buf, 2); - bch2_dev_alloc_debug_to_text(&buf, ca); - printbuf_indent_sub(&buf, 2); - prt_newline(&buf); - } + bch2_printbuf_make_room(&buf, 4096); + + buf.atomic++; + scoped_guard(rcu) + for_each_online_member_rcu(c, ca) { + prt_printf(&buf, "Dev %u:\n", ca->dev_idx); + printbuf_indent_add(&buf, 2); + bch2_dev_alloc_debug_to_text(&buf, ca); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + } + --buf.atomic; prt_printf(&buf, "Copygc debug:\n"); printbuf_indent_add(&buf, 2); @@ -1750,7 +1656,7 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c) bch2_journal_debug_to_text(&buf, &c->journal); printbuf_indent_sub(&buf, 2); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); } diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 4c1e33cf57c0..1b3fc8460096 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -3,8 +3,10 @@ #define _BCACHEFS_ALLOC_FOREGROUND_H #include "bcachefs.h" +#include "buckets.h" #include "alloc_types.h" #include "extents.h" +#include "io_write_types.h" #include 
"sb-members.h" #include <linux/hash.h> @@ -23,9 +25,57 @@ struct dev_alloc_list { u8 data[BCH_SB_MEMBERS_MAX]; }; -struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, - struct dev_stripe_state *, - struct bch_devs_mask *); +struct alloc_request { + unsigned nr_replicas; + unsigned target; + bool ec; + enum bch_watermark watermark; + enum bch_write_flags flags; + enum bch_data_type data_type; + struct bch_devs_list *devs_have; + struct write_point *wp; + + /* These fields are used primarily by open_bucket_add_buckets */ + struct open_buckets ptrs; + unsigned nr_effective; /* sum of @ptrs durability */ + bool have_cache; /* have we allocated from a 0 durability dev */ + struct bch_devs_mask devs_may_alloc; + + /* bch2_bucket_alloc_set_trans(): */ + struct dev_alloc_list devs_sorted; + struct bch_dev_usage usage; + + /* bch2_bucket_alloc_trans(): */ + struct bch_dev *ca; + + enum { + BTREE_BITMAP_NO, + BTREE_BITMAP_YES, + BTREE_BITMAP_ANY, + } btree_bitmap; + + struct { + u64 buckets_seen; + u64 skipped_open; + u64 skipped_need_journal_commit; + u64 need_journal_commit; + u64 skipped_nocow; + u64 skipped_nouse; + u64 skipped_mi_btree_bitmap; + } counters; + + unsigned scratch_nr_replicas; + unsigned scratch_nr_effective; + bool scratch_have_cache; + enum bch_data_type scratch_data_type; + struct open_buckets scratch_ptrs; + struct bch_devs_mask scratch_devs_may_alloc; +}; + +void bch2_dev_alloc_list(struct bch_fs *, + struct dev_stripe_state *, + struct bch_devs_mask *, + struct dev_alloc_list *); void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) @@ -173,11 +223,8 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 } enum bch_write_flags; -int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *, - struct dev_stripe_state *, struct bch_devs_mask *, - unsigned, unsigned *, bool *, enum bch_write_flags, - enum bch_data_type, enum bch_watermark, - struct closure *); +int bch2_bucket_alloc_set_trans(struct btree_trans *, struct alloc_request *, + struct dev_stripe_state *, struct closure *); int bch2_alloc_sectors_start_trans(struct btree_trans *, unsigned, unsigned, @@ -189,7 +236,19 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *, struct closure *, struct write_point **); -struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); +static inline struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) +{ + struct bch_dev *ca = ob_dev(c, ob); + + return (struct bch_extent_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_ptr, + .gen = ob->gen, + .dev = ob->dev, + .offset = bucket_to_sector(ca, ob->bucket) + + ca->mi.bucket_size - + ob->sectors_free, + }; +} /* * Append pointers to the space we just allocated to @k, and mark @sectors space diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 8f79f46c2a78..e7becdf22cba 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -8,22 +8,6 @@ #include "clock_types.h" #include "fifo.h" -struct bucket_alloc_state { - enum { - BTREE_BITMAP_NO, - BTREE_BITMAP_YES, - BTREE_BITMAP_ANY, - } btree_bitmap; - - u64 buckets_seen; - u64 skipped_open; - u64 skipped_need_journal_commit; - u64 need_journal_commit; - u64 skipped_nocow; - u64 skipped_nouse; - u64 skipped_mi_btree_bitmap; -}; - #define BCH_WATERMARKS() \ x(stripe) \ x(normal) \ diff --git a/fs/bcachefs/async_objs.c b/fs/bcachefs/async_objs.c new file mode 100644 index 
000000000000..a7cd1f0f0964 --- /dev/null +++ b/fs/bcachefs/async_objs.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Async obj debugging: keep asynchronous objects on (very fast) lists, make + * them visible in debugfs: + */ + +#include "bcachefs.h" +#include "async_objs.h" +#include "btree_io.h" +#include "debug.h" +#include "io_read.h" +#include "io_write.h" + +#include <linux/debugfs.h> + +static void promote_obj_to_text(struct printbuf *out, void *obj) +{ + bch2_promote_op_to_text(out, obj); +} + +static void rbio_obj_to_text(struct printbuf *out, void *obj) +{ + bch2_read_bio_to_text(out, obj); +} + +static void write_op_obj_to_text(struct printbuf *out, void *obj) +{ + bch2_write_op_to_text(out, obj); +} + +static void btree_read_bio_obj_to_text(struct printbuf *out, void *obj) +{ + struct btree_read_bio *rbio = obj; + bch2_btree_read_bio_to_text(out, rbio); +} + +static void btree_write_bio_obj_to_text(struct printbuf *out, void *obj) +{ + struct btree_write_bio *wbio = obj; + bch2_bio_to_text(out, &wbio->wbio.bio); +} + +static int bch2_async_obj_list_open(struct inode *inode, struct file *file) +{ + struct async_obj_list *list = inode->i_private; + struct dump_iter *i; + + i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); + if (!i) + return -ENOMEM; + + file->private_data = i; + i->from = POS_MIN; + i->iter = 0; + i->c = container_of(list, struct bch_fs, async_objs[list->idx]); + i->list = list; + i->buf = PRINTBUF; + return 0; +} + +static ssize_t bch2_async_obj_list_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct async_obj_list *list = i->list; + ssize_t ret = 0; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + struct genradix_iter iter; + void *obj; + fast_list_for_each_from(&list->list, iter, obj, i->iter) { + ret = bch2_debugfs_flush_buf(i); + if (ret) + return ret; + + if (!i->size) + break; + + list->obj_to_text(&i->buf, obj); + } + + if (i->buf.allocation_failure) + ret = -ENOMEM; + else + i->iter = iter.pos; + + if (!ret) + ret = bch2_debugfs_flush_buf(i); + + return ret ?: i->ret; +} + +static const struct file_operations async_obj_ops = { + .owner = THIS_MODULE, + .open = bch2_async_obj_list_open, + .release = bch2_dump_release, + .read = bch2_async_obj_list_read, +}; + +void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) +{ + c->async_obj_dir = debugfs_create_dir("async_objs", c->fs_debug_dir); + +#define x(n) debugfs_create_file(#n, 0400, c->async_obj_dir, \ + &c->async_objs[BCH_ASYNC_OBJ_LIST_##n], &async_obj_ops); + BCH_ASYNC_OBJ_LISTS() +#undef x +} + +void bch2_fs_async_obj_exit(struct bch_fs *c) +{ + for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) + fast_list_exit(&c->async_objs[i].list); +} + +int bch2_fs_async_obj_init(struct bch_fs *c) +{ + for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) { + if (fast_list_init(&c->async_objs[i].list)) + return -BCH_ERR_ENOMEM_async_obj_init; + c->async_objs[i].idx = i; + } + +#define x(n) c->async_objs[BCH_ASYNC_OBJ_LIST_##n].obj_to_text = n##_obj_to_text; + BCH_ASYNC_OBJ_LISTS() +#undef x + + return 0; +} diff --git a/fs/bcachefs/async_objs.h b/fs/bcachefs/async_objs.h new file mode 100644 index 000000000000..cd6489b8cf76 --- /dev/null +++ b/fs/bcachefs/async_objs.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ASYNC_OBJS_H +#define _BCACHEFS_ASYNC_OBJS_H + +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS +static inline void __async_object_list_del(struct fast_list *head,
unsigned idx) +{ + fast_list_remove(head, idx); +} + +static inline int __async_object_list_add(struct fast_list *head, void *obj, unsigned *idx) +{ + int ret = fast_list_add(head, obj); + *idx = ret > 0 ? ret : 0; + return ret < 0 ? ret : 0; +} + +#define async_object_list_del(_c, _list, idx) \ + __async_object_list_del(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, idx) + +#define async_object_list_add(_c, _list, obj, idx) \ + __async_object_list_add(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, obj, idx) + +void bch2_fs_async_obj_debugfs_init(struct bch_fs *); +void bch2_fs_async_obj_exit(struct bch_fs *); +int bch2_fs_async_obj_init(struct bch_fs *); + +#else /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */ + +#define async_object_list_del(_c, _n, idx) do {} while (0) + +static inline int __async_object_list_add(void) +{ + return 0; +} +#define async_object_list_add(_c, _n, obj, idx) __async_object_list_add() + +static inline void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) {} +static inline void bch2_fs_async_obj_exit(struct bch_fs *c) {} +static inline int bch2_fs_async_obj_init(struct bch_fs *c) { return 0; } + +#endif /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */ + +#endif /* _BCACHEFS_ASYNC_OBJS_H */ diff --git a/fs/bcachefs/async_objs_types.h b/fs/bcachefs/async_objs_types.h new file mode 100644 index 000000000000..8d713c0f5841 --- /dev/null +++ b/fs/bcachefs/async_objs_types.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ASYNC_OBJS_TYPES_H +#define _BCACHEFS_ASYNC_OBJS_TYPES_H + +#define BCH_ASYNC_OBJ_LISTS() \ + x(promote) \ + x(rbio) \ + x(write_op) \ + x(btree_read_bio) \ + x(btree_write_bio) + +enum bch_async_obj_lists { +#define x(n) BCH_ASYNC_OBJ_LIST_##n, + BCH_ASYNC_OBJ_LISTS() +#undef x + BCH_ASYNC_OBJ_NR +}; + +struct async_obj_list { + struct fast_list list; + void (*obj_to_text)(struct printbuf *, void *); + unsigned idx; +}; + +#endif /* _BCACHEFS_ASYNC_OBJS_TYPES_H */ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 5f195d2280a4..e76809e71858 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -12,9 +12,20 @@ #include "disk_accounting.h" #include "error.h" #include "progress.h" +#include "recovery_passes.h" #include <linux/mm.h> +static int bch2_bucket_bitmap_set(struct bch_dev *, struct bucket_bitmap *, u64); + +static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) +{ + return (struct bbpos) { + .btree = bp.btree_id, + .pos = bp.pos, + }; +} + int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { @@ -37,17 +48,19 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); - if (ca) { - u32 bucket_offset; - struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); - rcu_read_unlock(); + struct bch_dev *ca; + u32 bucket_offset; + struct bpos bucket; + scoped_guard(rcu) { + ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); + if (ca) + bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); + } + + if (ca) prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset); - } else { - rcu_read_unlock(); + else prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); - } bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); prt_str(out, " data_type="); @@ -96,6 
+109,8 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; + bool will_check = c->recovery.passes_to_run & + BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); int ret = 0; if (insert) { @@ -110,9 +125,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, prt_printf(&buf, "for "); bch2_bkey_val_to_text(&buf, c, orig_k); - - bch_err(c, "%s", buf.buf); - } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { + } else if (!will_check) { prt_printf(&buf, "backpointer not found when deleting\n"); printbuf_indent_add(&buf, 2); @@ -128,9 +141,8 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, orig_k); } - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers && - __bch2_inconsistent_error(c, &buf)) - ret = -BCH_ERR_erofs_unfixed_errors; + if (!will_check && __bch2_inconsistent_error(c, &buf)) + ret = bch_err_throw(c, erofs_unfixed_errors); bch_err(c, "%s", buf.buf); printbuf_exit(&buf); @@ -174,7 +186,7 @@ err: static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) { - return (likely(!bch2_backpointers_no_use_write_buffer) + return (!static_branch_unlikely(&bch2_backpointers_no_use_write_buffer) ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos) : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); @@ -184,7 +196,7 @@ static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, struct bkey_s_c visiting_k, struct bkey_buf *last_flushed) { - return likely(!bch2_backpointers_no_use_write_buffer) + return !static_branch_unlikely(&bch2_backpointers_no_use_write_buffer) ? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed) : 0; } @@ -283,7 +295,7 @@ static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans, return b; if (btree_node_will_make_reachable(b)) { - b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); + b = ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node)); } else { int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed, commit); @@ -341,7 +353,7 @@ static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans, return ret ? 
bkey_s_c_err(ret) : bkey_s_c_null; } else { struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit); - if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node)) + if (b == ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node))) return bkey_s_c_null; if (IS_ERR_OR_NULL(b)) return ((struct bkey_s_c) { .k = ERR_CAST(b) }); @@ -478,7 +490,8 @@ found: bytes = p.crc.compressed_size << 9; - struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ); + struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ, + BCH_DEV_READ_REF_check_extent_checksums); if (!ca) return false; @@ -515,7 +528,8 @@ err: if (bio) bio_put(bio); kvfree(data_buf); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_check_extent_checksums); printbuf_exit(&buf); return ret; } @@ -579,6 +593,7 @@ check_existing_bp: bkey_for_each_ptr(other_extent_ptrs, ptr) if (ptr->dev == bp->k.p.inode && dev_ptr_stale_rcu(ca, ptr)) { + rcu_read_unlock(); ret = drop_dev_and_update(trans, other_bp.v->btree_id, other_extent, bp->k.p.inode); if (ret) @@ -636,7 +651,7 @@ check_existing_bp: prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, other_extent); bch_err(c, "%s", buf.buf); - ret = -BCH_ERR_fsck_repair_unimplemented; + ret = bch_err_throw(c, fsck_repair_unimplemented); goto err; missing: printbuf_reset(&buf); @@ -667,24 +682,32 @@ static int check_extent_to_backpointers(struct btree_trans *trans, if (p.ptr.dev == BCH_SB_MEMBER_INVALID) continue; - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); - bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches); - bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty); + bool empty; + { + /* scoped_guard() is a loop, so it breaks continue */ + guard(rcu)(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); + if (!ca) + continue; - bool stale = p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)); - rcu_read_unlock(); + if (p.ptr.cached && dev_ptr_stale_rcu(ca, &p.ptr)) + continue; - if ((check || empty) && !stale) { - struct bkey_i_backpointer bp; - bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); + u64 b = PTR_BUCKET_NR(ca, &p.ptr); + if (!bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b)) + continue; - int ret = check - ? check_bp_exists(trans, s, &bp, k) - : bch2_bucket_backpointer_mod(trans, k, &bp, true); - if (ret) - return ret; + empty = bch2_bucket_bitmap_test(&ca->bucket_backpointer_empty, b); } + + struct bkey_i_backpointer bp; + bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); + + int ret = !empty + ? check_bp_exists(trans, s, &bp, k) + : bch2_bucket_backpointer_mod(trans, k, &bp, true); + if (ret) + return ret; } return 0; @@ -722,14 +745,6 @@ err: return ret; } -static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) -{ - return (struct bbpos) { - .btree = bp.btree_id, - .pos = bp.pos, - }; -} - static u64 mem_may_pin_bytes(struct bch_fs *c) { struct sysinfo i; @@ -788,6 +803,13 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, return ret; } +static inline int bch2_fs_going_ro(struct bch_fs *c) +{ + return test_bit(BCH_FS_going_ro, &c->flags) + ? 
-EROFS + : 0; +} + static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, struct extents_to_bp_state *s) { @@ -815,6 +837,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, ret = for_each_btree_key_continue(trans, iter, 0, k, ({ bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); + bch2_fs_going_ro(c) ?: check_extent_to_backpointers(trans, s, btree_id, level, k) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); })); @@ -854,6 +877,7 @@ static int data_type_to_alloc_counter(enum bch_data_type t) static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos); static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k, + bool *had_mismatch, struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; @@ -861,6 +885,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); bool need_commit = false; + *had_mismatch = false; + if (a->data_type == BCH_DATA_sb || a->data_type == BCH_DATA_journal || a->data_type == BCH_DATA_parity) @@ -927,16 +953,22 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b sectors[ALLOC_cached] > a->cached_sectors || sectors[ALLOC_stripe] > a->stripe_sectors) { ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: - -BCH_ERR_transaction_restart_nested; + bch_err_throw(c, transaction_restart_nested); goto err; } - if (!sectors[ALLOC_dirty] && - !sectors[ALLOC_stripe] && - !sectors[ALLOC_cached]) - __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty); - else - __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches); + bool empty = (sectors[ALLOC_dirty] + + sectors[ALLOC_stripe] + + sectors[ALLOC_cached]) == 0; + + ret = bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_mismatch, + alloc_k.k->p.offset) ?: + (empty + ? 
bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_empty, + alloc_k.k->p.offset) + : 0); + + *had_mismatch = true; } err: bch2_dev_put(ca); @@ -949,7 +981,7 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) case KEY_TYPE_btree_ptr_v2: { bool ret = false; - rcu_read_lock(); + guard(rcu)(); struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key; while (pos.inode <= k.k->p.inode) { if (pos.inode >= c->sb.nr_devices) @@ -960,8 +992,14 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) goto next; struct bpos bucket = bp_pos_to_bucket(ca, pos); - bucket.offset = find_next_bit(ca->bucket_backpointer_mismatches, - ca->mi.nbuckets, bucket.offset); + u64 next = ca->mi.nbuckets; + + unsigned long *bitmap = READ_ONCE(ca->bucket_backpointer_mismatch.buckets); + if (bitmap) + next = min_t(u64, next, + find_next_bit(bitmap, ca->mi.nbuckets, bucket.offset)); + + bucket.offset = next; if (bucket.offset == ca->mi.nbuckets) goto next; @@ -971,7 +1009,6 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) next: pos = SPOS(pos.inode + 1, 0, 0); } - rcu_read_unlock(); return ret; } @@ -1070,28 +1107,6 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) { int ret = 0; - /* - * Can't allow devices to come/go/resize while we have bucket bitmaps - * allocated - */ - down_read(&c->state_lock); - - for_each_member_device(c, ca) { - BUG_ON(ca->bucket_backpointer_mismatches); - ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), - sizeof(unsigned long), - GFP_KERNEL); - ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), - sizeof(unsigned long), - GFP_KERNEL); - if (!ca->bucket_backpointer_mismatches || - !ca->bucket_backpointer_empty) { - bch2_dev_put(ca); - ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; - goto err_free_bitmaps; - } - } - struct btree_trans *trans = bch2_trans_get(c); struct extents_to_bp_state s = { .bp_start = POS_MIN }; @@ -1100,23 +1115,24 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_prefetch, k, ({ - check_bucket_backpointer_mismatch(trans, k, &s.last_flushed); + bool had_mismatch; + bch2_fs_going_ro(c) ?: + check_bucket_backpointer_mismatch(trans, k, &had_mismatch, &s.last_flushed); })); if (ret) goto err; - u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0; + u64 nr_buckets = 0, nr_mismatches = 0; for_each_member_device(c, ca) { nr_buckets += ca->mi.nbuckets; - nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets); - nr_empty += bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets); + nr_mismatches += ca->bucket_backpointer_mismatch.nr; } - if (!nr_mismatches && !nr_empty) + if (!nr_mismatches) goto err; bch_info(c, "scanning for missing backpointers in %llu/%llu buckets", - nr_mismatches + nr_empty, nr_buckets); + nr_mismatches, nr_buckets); while (1) { ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end); @@ -1147,23 +1163,71 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) s.bp_start = bpos_successor(s.bp_end); } + + for_each_member_device(c, ca) { + bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch); + bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty); + } err: bch2_trans_put(trans); bch2_bkey_buf_exit(&s.last_flushed, c); bch2_btree_cache_unpin(c); -err_free_bitmaps: - for_each_member_device(c, ca) { - kvfree(ca->bucket_backpointer_empty); - ca->bucket_backpointer_empty = 
NULL; - kvfree(ca->bucket_backpointer_mismatches); - ca->bucket_backpointer_mismatches = NULL; - } - up_read(&c->state_lock); bch_err_fn(c, ret); return ret; } +static int check_bucket_backpointer_pos_mismatch(struct btree_trans *trans, + struct bpos bucket, + bool *had_mismatch, + struct bkey_buf *last_flushed) +{ + struct btree_iter alloc_iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &alloc_iter, + BTREE_ID_alloc, bucket, + BTREE_ITER_cached); + int ret = bkey_err(k); + if (ret) + return ret; + + ret = check_bucket_backpointer_mismatch(trans, k, had_mismatch, last_flushed); + bch2_trans_iter_exit(trans, &alloc_iter); + return ret; +} + +int bch2_check_bucket_backpointer_mismatch(struct btree_trans *trans, + struct bch_dev *ca, u64 bucket, + bool copygc, + struct bkey_buf *last_flushed) +{ + struct bch_fs *c = trans->c; + bool had_mismatch; + int ret = lockrestart_do(trans, + check_bucket_backpointer_pos_mismatch(trans, POS(ca->dev_idx, bucket), + &had_mismatch, last_flushed)); + if (ret || !had_mismatch) + return ret; + + u64 nr = ca->bucket_backpointer_mismatch.nr; + u64 allowed = copygc ? ca->mi.nbuckets >> 7 : 0; + + struct printbuf buf = PRINTBUF; + __bch2_log_msg_start(ca->name, &buf); + + prt_printf(&buf, "Detected missing backpointers in bucket %llu, now have %llu/%llu with missing\n", + bucket, nr, ca->mi.nbuckets); + + bch2_run_explicit_recovery_pass(c, &buf, + BCH_RECOVERY_PASS_check_extents_to_backpointers, + nr < allowed ? RUN_RECOVERY_PASS_ratelimit : 0); + + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + return 0; +} + +/* backpointers -> extents */ + static int check_one_backpointer(struct btree_trans *trans, struct bbpos start, struct bbpos end, @@ -1279,3 +1343,49 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) bch_err_fn(c, ret); return ret; } + +static int bch2_bucket_bitmap_set(struct bch_dev *ca, struct bucket_bitmap *b, u64 bit) +{ + scoped_guard(mutex, &b->lock) { + if (!b->buckets) { + b->buckets = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), + sizeof(unsigned long), GFP_KERNEL); + if (!b->buckets) + return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap); + } + + b->nr += !__test_and_set_bit(bit, b->buckets); + } + + return 0; +} + +int bch2_bucket_bitmap_resize(struct bch_dev *ca, struct bucket_bitmap *b, + u64 old_size, u64 new_size) +{ + scoped_guard(mutex, &b->lock) { + if (!b->buckets) + return 0; + + unsigned long *n = kvcalloc(BITS_TO_LONGS(new_size), + sizeof(unsigned long), GFP_KERNEL); + if (!n) + return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap); + + memcpy(n, b->buckets, + BITS_TO_LONGS(min(old_size, new_size)) * sizeof(unsigned long)); + kvfree(b->buckets); + b->buckets = n; + } + + return 0; +} + +void bch2_bucket_bitmap_free(struct bucket_bitmap *b) +{ + mutex_lock(&b->lock); + kvfree(b->buckets); + b->buckets = NULL; + b->nr = 0; + mutex_unlock(&b->lock); +} diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 16575dbc5736..7e71afee1ac0 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -53,11 +53,10 @@ static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca, static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) { - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode); if (ca) *bucket = bp_pos_to_bucket(ca, bp_pos); - rcu_read_unlock(); return ca != NULL; } @@ -102,7 +101,7 @@ static inline int 
bch2_bucket_backpointer_mod(struct btree_trans *trans, struct bkey_i_backpointer *bp, bool insert) { - if (unlikely(bch2_backpointers_no_use_write_buffer)) + if (static_branch_unlikely(&bch2_backpointers_no_use_write_buffer)) return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert); if (!insert) { @@ -182,8 +181,20 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_b struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer, struct btree_iter *, struct bkey_buf *); +int bch2_check_bucket_backpointer_mismatch(struct btree_trans *, struct bch_dev *, u64, + bool, struct bkey_buf *); + int bch2_check_btree_backpointers(struct bch_fs *); int bch2_check_extents_to_backpointers(struct bch_fs *); int bch2_check_backpointers_to_extents(struct bch_fs *); +static inline bool bch2_bucket_bitmap_test(struct bucket_bitmap *b, u64 i) +{ + unsigned long *bitmap = READ_ONCE(b->buckets); + return bitmap && test_bit(i, bitmap); +} + +int bch2_bucket_bitmap_resize(struct bch_dev *, struct bucket_bitmap *, u64, u64); +void bch2_bucket_bitmap_free(struct bucket_bitmap *); + #endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 75f7408da173..3651a296d506 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -183,6 +183,16 @@ #define pr_fmt(fmt) "%s() " fmt "\n", __func__ #endif +#ifdef CONFIG_BCACHEFS_DEBUG +#define ENUMERATED_REF_DEBUG +#endif + +#ifndef dynamic_fault +#define dynamic_fault(...) 0 +#endif + +#define race_fault(...) dynamic_fault("bcachefs:race") + #include <linux/backing-dev-defs.h> #include <linux/bug.h> #include <linux/bio.h> @@ -209,24 +219,40 @@ #include "btree_journal_iter_types.h" #include "disk_accounting_types.h" #include "errcode.h" +#include "fast_list.h" #include "fifo.h" #include "nocow_locking_types.h" #include "opts.h" -#include "recovery_passes_types.h" #include "sb-errors_types.h" #include "seqmutex.h" +#include "snapshot_types.h" #include "time_stats.h" #include "util.h" -#ifdef CONFIG_BCACHEFS_DEBUG -#define BCH_WRITE_REF_DEBUG -#endif - -#ifndef dynamic_fault -#define dynamic_fault(...) 0 -#endif +#include "alloc_types.h" +#include "async_objs_types.h" +#include "btree_gc_types.h" +#include "btree_types.h" +#include "btree_node_scan_types.h" +#include "btree_write_buffer_types.h" +#include "buckets_types.h" +#include "buckets_waiting_for_journal_types.h" +#include "clock_types.h" +#include "disk_groups_types.h" +#include "ec_types.h" +#include "enumerated_ref_types.h" +#include "journal_types.h" +#include "keylist_types.h" +#include "quota_types.h" +#include "rebalance_types.h" +#include "recovery_passes_types.h" +#include "replicas_types.h" +#include "sb-members_types.h" +#include "subvolume_types.h" +#include "super_types.h" +#include "thread_with_file_types.h" -#define race_fault(...) dynamic_fault("bcachefs:race") +#include "trace.h" #define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]) @@ -269,7 +295,8 @@ do { \ #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") -void bch2_print_str(struct bch_fs *, const char *); +void bch2_print_str(struct bch_fs *, const char *, const char *); +void bch2_print_str_nonblocking(struct bch_fs *, const char *, const char *); __printf(2, 3) void bch2_print_opts(struct bch_opts *, const char *, ...); @@ -293,6 +320,16 @@ do { \ bch2_print(_c, __VA_ARGS__); \ } while (0) +#define bch2_print_str_ratelimited(_c, ...) 
\ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + \ + if (__ratelimit(&_rs)) \ + bch2_print_str(_c, __VA_ARGS__); \ +} while (0) + #define bch_info(c, fmt, ...) \ bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_info_ratelimited(c, fmt, ...) \ @@ -368,6 +405,14 @@ do { \ pr_info(fmt, ##__VA_ARGS__); \ } while (0) +static inline int __bch2_err_trace(struct bch_fs *c, int err) +{ + trace_error_throw(c, err, _THIS_IP_); + return err; +} + +#define bch_err_throw(_c, _err) __bch2_err_trace(_c, -BCH_ERR_##_err) + /* Parameters that are useful for debugging, but should always be compiled in: */ #define BCH_DEBUG_PARAMS_ALWAYS() \ BCH_DEBUG_PARAM(key_merging_disabled, \ @@ -390,17 +435,20 @@ do { \ "compare them") \ BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \ "Don't use the write buffer for backpointers, enabling "\ - "extra runtime checks") - -/* Parameters that should only be compiled in debug mode: */ -#define BCH_DEBUG_PARAMS_DEBUG() \ - BCH_DEBUG_PARAM(expensive_debug_checks, \ - "Enables various runtime debugging checks that " \ - "significantly affect performance") \ + "extra runtime checks") \ + BCH_DEBUG_PARAM(debug_check_btree_locking, \ + "Enable additional asserts for btree locking") \ BCH_DEBUG_PARAM(debug_check_iterators, \ "Enables extra verification for btree iterators") \ + BCH_DEBUG_PARAM(debug_check_bset_lookups, \ + "Enables extra verification for bset lookups") \ BCH_DEBUG_PARAM(debug_check_btree_accounting, \ "Verify btree accounting for keys within a node") \ + BCH_DEBUG_PARAM(debug_check_bkey_unpack, \ + "Enables extra verification for bkey unpack") + +/* Parameters that should only be compiled in debug mode: */ +#define BCH_DEBUG_PARAMS_DEBUG() \ BCH_DEBUG_PARAM(journal_seq_verify, \ "Store the journal sequence number in the version " \ "number of every btree key, and verify that btree " \ @@ -427,15 +475,9 @@ do { \ #define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() #endif -#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; -BCH_DEBUG_PARAMS() -#undef BCH_DEBUG_PARAM - -#ifndef CONFIG_BCACHEFS_DEBUG -#define BCH_DEBUG_PARAM(name, description) static const __maybe_unused bool bch2_##name; -BCH_DEBUG_PARAMS_DEBUG() +#define BCH_DEBUG_PARAM(name, description) extern struct static_key_false bch2_##name; +BCH_DEBUG_PARAMS_ALL() #undef BCH_DEBUG_PARAM -#endif #define BCH_TIME_STATS() \ x(btree_node_mem_alloc) \ @@ -443,6 +485,7 @@ BCH_DEBUG_PARAMS_DEBUG() x(btree_node_compact) \ x(btree_node_merge) \ x(btree_node_sort) \ + x(btree_node_get) \ x(btree_node_read) \ x(btree_node_read_done) \ x(btree_node_write) \ @@ -450,6 +493,10 @@ BCH_DEBUG_PARAMS_DEBUG() x(btree_interior_update_total) \ x(btree_gc) \ x(data_write) \ + x(data_write_to_submit) \ + x(data_write_to_queue) \ + x(data_write_to_btree_update) \ + x(data_write_btree_update) \ x(data_read) \ x(data_promote) \ x(journal_flush_write) \ @@ -472,26 +519,6 @@ enum bch_time_stats { BCH_TIME_STAT_NR }; -#include "alloc_types.h" -#include "btree_gc_types.h" -#include "btree_types.h" -#include "btree_node_scan_types.h" -#include "btree_write_buffer_types.h" -#include "buckets_types.h" -#include "buckets_waiting_for_journal_types.h" -#include "clock_types.h" -#include "disk_groups_types.h" -#include "ec_types.h" -#include "journal_types.h" -#include "keylist_types.h" -#include "quota_types.h" -#include "rebalance_types.h" -#include "replicas_types.h" -#include "sb-members_types.h" -#include "subvolume_types.h" -#include 
"super_types.h" -#include "thread_with_file_types.h" - /* Number of nodes btree coalesce will try to coalesce at once */ #define GC_MERGE_NODES 4U @@ -514,6 +541,57 @@ struct discard_in_flight { u64 bucket:63; }; +#define BCH_DEV_READ_REFS() \ + x(bch2_online_devs) \ + x(trans_mark_dev_sbs) \ + x(read_fua_test) \ + x(sb_field_resize) \ + x(write_super) \ + x(journal_read) \ + x(fs_journal_alloc) \ + x(fs_resize_on_mount) \ + x(btree_node_read) \ + x(btree_node_read_all_replicas) \ + x(btree_node_scrub) \ + x(btree_node_write) \ + x(btree_node_scan) \ + x(btree_verify_replicas) \ + x(btree_node_ondisk_to_text) \ + x(io_read) \ + x(check_extent_checksums) \ + x(ec_block) + +enum bch_dev_read_ref { +#define x(n) BCH_DEV_READ_REF_##n, + BCH_DEV_READ_REFS() +#undef x + BCH_DEV_READ_REF_NR, +}; + +#define BCH_DEV_WRITE_REFS() \ + x(journal_write) \ + x(journal_do_discards) \ + x(dev_do_discards) \ + x(discard_one_bucket_fast) \ + x(do_invalidates) \ + x(nocow_flush) \ + x(io_write) \ + x(ec_block) \ + x(ec_bucket_zero) + +enum bch_dev_write_ref { +#define x(n) BCH_DEV_WRITE_REF_##n, + BCH_DEV_WRITE_REFS() +#undef x + BCH_DEV_WRITE_REF_NR, +}; + +struct bucket_bitmap { + unsigned long *buckets; + u64 nr; + struct mutex lock; +}; + struct bch_dev { struct kobject kobj; #ifdef CONFIG_BCACHEFS_DEBUG @@ -524,8 +602,7 @@ struct bch_dev { struct percpu_ref ref; #endif struct completion ref_completion; - struct percpu_ref io_ref[2]; - struct completion io_ref_completion[2]; + struct enumerated_ref io_ref[2]; struct bch_fs *fs; @@ -559,8 +636,8 @@ struct bch_dev { u8 *oldest_gen; unsigned long *buckets_nouse; - unsigned long *bucket_backpointer_mismatches; - unsigned long *bucket_backpointer_empty; + struct bucket_bitmap bucket_backpointer_mismatch; + struct bucket_bitmap bucket_backpointer_empty; struct bch_dev_usage_full __percpu *usage; @@ -572,10 +649,6 @@ struct bch_dev { unsigned nr_partial_buckets; unsigned nr_btree_reserve; - size_t inc_gen_needs_gc; - size_t inc_gen_really_needs_gc; - size_t buckets_waiting_on_journal; - struct work_struct invalidate_work; struct work_struct discard_work; struct mutex discard_buckets_in_flight_lock; @@ -614,14 +687,15 @@ struct bch_dev { x(accounting_replay_done) \ x(may_go_rw) \ x(rw) \ + x(rw_init_done) \ x(was_rw) \ x(stopping) \ x(emergency_ro) \ x(going_ro) \ x(write_disable_complete) \ x(clean_shutdown) \ - x(recovery_running) \ - x(fsck_running) \ + x(in_recovery) \ + x(in_fsck) \ x(initial_gc_unfixed) \ x(need_delete_dead_snapshots) \ x(error) \ @@ -648,8 +722,10 @@ struct btree_transaction_stats { struct bch2_time_stats lock_hold_times; struct mutex lock; unsigned nr_max_paths; - unsigned journal_entries_size; unsigned max_mem; +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_trans_kmalloc_trace trans_kmalloc_trace; +#endif char *max_paths_text; }; @@ -670,9 +746,6 @@ struct btree_trans_buf { struct btree_trans *trans; }; -#define BCACHEFS_ROOT_SUBVOL_INUM \ - ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) - #define BCH_WRITE_REFS() \ x(journal) \ x(trans) \ @@ -694,7 +767,8 @@ struct btree_trans_buf { x(snapshot_delete_pagecache) \ x(sysfs) \ x(btree_write_buffer) \ - x(btree_node_scrub) + x(btree_node_scrub) \ + x(async_recovery_passes) enum bch_write_ref { #define x(n) BCH_WRITE_REF_##n, @@ -728,11 +802,7 @@ struct bch_fs { struct rw_semaphore state_lock; /* Counts outstanding writes, for clean transition to read-only */ -#ifdef BCH_WRITE_REF_DEBUG - atomic_long_t writes[BCH_WRITE_REF_NR]; -#else - struct percpu_ref writes; -#endif + 
struct enumerated_ref writes; /* * Certain operations are only allowed in single threaded mode, during * recovery, and we want to assert that this is the case: @@ -776,6 +846,7 @@ struct bch_fs { u8 nr_devices; u8 clean; + bool multi_device; /* true if we've ever had more than one device */ u8 encryption_type; @@ -785,6 +856,7 @@ struct bch_fs { unsigned nsec_per_time_unit; u64 features; u64 compat; + u64 recovery_passes_required; unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)]; u64 btrees_lost_data; } sb; @@ -809,7 +881,7 @@ struct bch_fs { struct mutex snapshot_table_lock; struct rw_semaphore snapshot_create_lock; - struct work_struct snapshot_delete_work; + struct snapshot_delete snapshot_delete; struct work_struct snapshot_wait_for_pagecache_and_delete_work; snapshot_id_list snapshots_unlinked; struct mutex snapshots_unlinked_lock; @@ -874,7 +946,7 @@ struct bch_fs { struct btree_write_buffer btree_write_buffer; struct workqueue_struct *btree_update_wq; - struct workqueue_struct *btree_io_complete_wq; + struct workqueue_struct *btree_write_complete_wq; /* copygc needs its own workqueue for index updates.. */ struct workqueue_struct *copygc_wq; /* @@ -885,6 +957,7 @@ struct bch_fs { struct workqueue_struct *write_ref_wq; /* ALLOCATION */ + struct bch_devs_mask online_devs; struct bch_devs_mask rw_devs[BCH_DATA_NR]; unsigned long rw_devs_change_count; @@ -979,6 +1052,10 @@ struct bch_fs { nocow_locks; struct rhashtable promote_table; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + struct async_obj_list async_objs[BCH_ASYNC_OBJ_NR]; +#endif + mempool_t compression_bounce[2]; mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR]; size_t zstd_workspace_size; @@ -1048,25 +1125,12 @@ struct bch_fs { /* RECOVERY */ u64 journal_replay_seq_start; u64 journal_replay_seq_end; - /* - * Two different uses: - * "Has this fsck pass?" - i.e. should this type of error be an - * emergency read-only - * And, in certain situations fsck will rewind to an earlier pass: used - * for signaling to the toplevel code which pass we want to run now. 
- */ - enum bch_recovery_pass curr_recovery_pass; - enum bch_recovery_pass next_recovery_pass; - /* bitmask of recovery passes that we actually ran */ - u64 recovery_passes_complete; - /* never rewinds version of curr_recovery_pass */ - enum bch_recovery_pass recovery_pass_done; - spinlock_t recovery_pass_lock; - struct semaphore online_fsck_mutex; + struct bch_fs_recovery recovery; /* DEBUG JUNK */ struct dentry *fs_debug_dir; struct dentry *btree_debug_dir; + struct dentry *async_obj_dir; struct btree_debug btree_debug[BTREE_ID_NR]; struct btree *verify_data; struct btree_node *verify_ondisk; @@ -1108,54 +1172,6 @@ struct bch_fs { extern struct wait_queue_head bch2_read_only_wait; -static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref) -{ -#ifdef BCH_WRITE_REF_DEBUG - atomic_long_inc(&c->writes[ref]); -#else - percpu_ref_get(&c->writes); -#endif -} - -static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) -{ -#ifdef BCH_WRITE_REF_DEBUG - return !test_bit(BCH_FS_going_ro, &c->flags) && - atomic_long_inc_not_zero(&c->writes[ref]); -#else - return percpu_ref_tryget(&c->writes); -#endif -} - -static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) -{ -#ifdef BCH_WRITE_REF_DEBUG - return !test_bit(BCH_FS_going_ro, &c->flags) && - atomic_long_inc_not_zero(&c->writes[ref]); -#else - return percpu_ref_tryget_live(&c->writes); -#endif -} - -static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref) -{ -#ifdef BCH_WRITE_REF_DEBUG - long v = atomic_long_dec_return(&c->writes[ref]); - - BUG_ON(v < 0); - if (v) - return; - for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) - if (atomic_long_read(&c->writes[i])) - return; - - set_bit(BCH_FS_write_disable_complete, &c->flags); - wake_up(&bch2_read_only_wait); -#else - percpu_ref_put(&c->writes); -#endif -} - static inline bool bch2_ro_ref_tryget(struct bch_fs *c) { if (test_bit(BCH_FS_stopping, &c->flags)) @@ -1256,4 +1272,17 @@ static inline unsigned data_replicas_required(struct bch_fs *c) #define BKEY_PADDED_ONSTACK(key, pad) \ struct { struct bkey_i key; __u64 key ## _pad[pad]; } +/* + * This is needed because discard is both a filesystem option and a device + * option, and mount options are supposed to apply to that mount and not be + * persisted, i.e. if it's set as a mount option we can't propagate it to the + * device. + */ +static inline bool bch2_discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca) +{ + return test_bit(BCH_FS_discard_mount_opt_set, &c->flags) + ? 
c->opts.discard + : ca->mi.discard; +} + #endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index d6e4a496f02b..b4a04df5ea95 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -497,7 +497,8 @@ struct bch_sb_field { x(members_v2, 11) \ x(errors, 12) \ x(ext, 13) \ - x(downgrade, 14) + x(downgrade, 14) \ + x(recovery_passes, 15) #include "alloc_background_format.h" #include "dirent_format.h" @@ -510,6 +511,7 @@ struct bch_sb_field { #include "logged_ops_format.h" #include "lru_format.h" #include "quota_format.h" +#include "recovery_passes_format.h" #include "reflink_format.h" #include "replicas_format.h" #include "snapshot_format.h" @@ -695,7 +697,10 @@ struct bch_sb_field_ext { x(stripe_backpointers, BCH_VERSION(1, 22)) \ x(stripe_lru, BCH_VERSION(1, 23)) \ x(casefolding, BCH_VERSION(1, 24)) \ - x(extent_flags, BCH_VERSION(1, 25)) + x(extent_flags, BCH_VERSION(1, 25)) \ + x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \ + x(fast_device_removal, BCH_VERSION(1, 27)) \ + x(inode_has_case_insensitive, BCH_VERSION(1, 28)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -846,7 +851,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); -/* one free bit */ +LE64_BITMASK(BCH_SB_MULTI_DEVICE, struct bch_sb, flags[3], 63, 64); LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); @@ -867,7 +872,9 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14); LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20); +LE64_BITMASK(BCH_SB_DEGRADED_ACTION, struct bch_sb, flags[6], 20, 22); LE64_BITMASK(BCH_SB_CASEFOLD, struct bch_sb, flags[6], 22, 23); +LE64_BITMASK(BCH_SB_REBALANCE_AC_ONLY, struct bch_sb, flags[6], 23, 24); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { @@ -922,7 +929,9 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u x(alloc_v2, 17) \ x(extents_across_btree_nodes, 18) \ x(incompat_version_field, 19) \ - x(casefolding, 20) + x(casefolding, 20) \ + x(no_alloc_info, 21) \ + x(small_image, 22) #define BCH_SB_FEATURES_ALWAYS \ (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \ @@ -989,6 +998,19 @@ enum bch_error_actions { BCH_ON_ERROR_NR }; +#define BCH_DEGRADED_ACTIONS() \ + x(ask, 0) \ + x(yes, 1) \ + x(very, 2) \ + x(no, 3) + +enum bch_degraded_actions { +#define x(t, n) BCH_DEGRADED_##t = n, + BCH_DEGRADED_ACTIONS() +#undef x + BCH_DEGRADED_ACTIONS_NR +}; + #define BCH_STR_HASH_TYPES() \ x(crc32c, 0) \ x(crc64, 1) \ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 995ba32e9b6e..ee823c640642 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -47,11 +47,9 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *out, } } -#ifdef CONFIG_BCACHEFS_DEBUG - -static void bch2_bkey_pack_verify(const struct bkey_packed *packed, - const struct bkey *unpacked, - const struct bkey_format *format) +static void __bch2_bkey_pack_verify(const struct bkey_packed *packed, + const struct bkey 
*unpacked, + const struct bkey_format *format) { struct bkey tmp; @@ -95,11 +93,13 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed, } } -#else static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed, - const struct bkey *unpacked, - const struct bkey_format *format) {} -#endif + const struct bkey *unpacked, + const struct bkey_format *format) +{ + if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) + __bch2_bkey_pack_verify(packed, unpacked, format); +} struct pack_state { const struct bkey_format *format; @@ -398,7 +398,6 @@ static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) return ret; } -#ifdef CONFIG_BCACHEFS_DEBUG static bool bkey_packed_successor(struct bkey_packed *out, const struct btree *b, struct bkey_packed k) @@ -455,7 +454,6 @@ static bool bkey_format_has_too_big_fields(const struct bkey_format *f) return false; } -#endif /* * Returns a packed key that compares <= in @@ -472,9 +470,7 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, const struct bkey_format *f = &b->format; struct pack_state state = pack_state_init(f, out); u64 *w = out->_data; -#ifdef CONFIG_BCACHEFS_DEBUG struct bpos orig = in; -#endif bool exact = true; unsigned i; @@ -527,18 +523,18 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, out->format = KEY_FORMAT_LOCAL_BTREE; out->type = KEY_TYPE_deleted; -#ifdef CONFIG_BCACHEFS_DEBUG - if (exact) { - BUG_ON(bkey_cmp_left_packed(b, out, &orig)); - } else { - struct bkey_packed successor; + if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { + if (exact) { + BUG_ON(bkey_cmp_left_packed(b, out, &orig)); + } else { + struct bkey_packed successor; - BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); - BUG_ON(bkey_packed_successor(&successor, b, *out) && - bkey_cmp_left_packed(b, &successor, &orig) < 0 && - !bkey_format_has_too_big_fields(f)); + BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); + BUG_ON(bkey_packed_successor(&successor, b, *out) && + bkey_cmp_left_packed(b, &successor, &orig) < 0 && + !bkey_format_has_too_big_fields(f)); + } } -#endif return exact ? 
BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; } @@ -627,14 +623,13 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) } } -#ifdef CONFIG_BCACHEFS_DEBUG - { + if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { struct printbuf buf = PRINTBUF; BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf)); printbuf_exit(&buf); } -#endif + return ret; } diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 054e2d5e8448..3ccd521c190a 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -191,6 +191,7 @@ static inline struct bpos bkey_max(struct bpos l, struct bpos r) static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) { return bpos_eq(l.k->p, r.k->p) && + l.k->size == r.k->size && bkey_bytes(l.k) == bkey_bytes(r.k) && !memcmp(l.v, r.v, bkey_val_bytes(l.k)); } @@ -397,8 +398,7 @@ __bkey_unpack_key_format_checked(const struct btree *b, compiled_unpack_fn unpack_fn = b->aux_data; unpack_fn(dst, src); - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - bch2_expensive_debug_checks) { + if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 00d05ccfaf73..fcd8c82cba4f 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -356,7 +356,7 @@ bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) return ops->key_merge && bch2_bkey_maybe_mergable(l.k, r.k) && (u64) l.k->size + r.k->size <= KEY_SIZE_MAX && - !bch2_key_merging_disabled && + !static_branch_unlikely(&bch2_key_merging_disabled) && ops->key_merge(c, l, r); } diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 9a4a83d6fd2d..32841f762eb2 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -144,8 +144,6 @@ struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b) return nr; } -#ifdef CONFIG_BCACHEFS_DEBUG - void __bch2_verify_btree_nr_keys(struct btree *b) { struct btree_nr_keys nr = bch2_btree_node_count_keys(b); @@ -153,7 +151,7 @@ void __bch2_verify_btree_nr_keys(struct btree *b) BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); } -static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, +static void __bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, struct btree *b) { struct btree_node_iter iter = *_iter; @@ -190,8 +188,8 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, } } -void bch2_btree_node_iter_verify(struct btree_node_iter *iter, - struct btree *b) +void __bch2_btree_node_iter_verify(struct btree_node_iter *iter, + struct btree *b) { struct btree_node_iter_set *set, *s2; struct bkey_packed *k, *p; @@ -237,8 +235,8 @@ found: } } -void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, - struct bkey_packed *insert, unsigned clobber_u64s) +static void __bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, + struct bkey_packed *insert, unsigned clobber_u64s) { struct bset_tree *t = bch2_bkey_to_bset(b, where); struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); @@ -285,12 +283,15 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, #endif } -#else - -static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, - struct btree *b) {} +static inline void bch2_verify_insert_pos(struct btree *b, + struct bkey_packed *where, + struct bkey_packed *insert, + unsigned clobber_u64s) +{ + if 
(static_branch_unlikely(&bch2_debug_check_bset_lookups)) + __bch2_verify_insert_pos(b, where, insert, clobber_u64s); +} -#endif /* Auxiliary search trees */ @@ -361,9 +362,8 @@ static struct bkey_float *bkey_float(const struct btree *b, return ro_aux_tree_base(b, t)->f + idx; } -static void bset_aux_tree_verify(struct btree *b) +static void __bset_aux_tree_verify(struct btree *b) { -#ifdef CONFIG_BCACHEFS_DEBUG for_each_bset(b, t) { if (t->aux_data_offset == U16_MAX) continue; @@ -375,7 +375,12 @@ static void bset_aux_tree_verify(struct btree *b) BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); } -#endif +} + +static inline void bset_aux_tree_verify(struct btree *b) +{ + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) + __bset_aux_tree_verify(b); } void bch2_btree_keys_init(struct btree *b) @@ -495,15 +500,11 @@ static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, }; } -static void bch2_bset_verify_rw_aux_tree(struct btree *b, - struct bset_tree *t) +static void __bch2_bset_verify_rw_aux_tree(struct btree *b, struct bset_tree *t) { struct bkey_packed *k = btree_bkey_first(b, t); unsigned j = 0; - if (!bch2_expensive_debug_checks) - return; - BUG_ON(bset_has_ro_aux_tree(t)); if (!bset_has_rw_aux_tree(t)) @@ -530,6 +531,13 @@ start: } } +static inline void bch2_bset_verify_rw_aux_tree(struct btree *b, + struct bset_tree *t) +{ + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) + __bch2_bset_verify_rw_aux_tree(b, t); +} + /* returns idx of first entry >= offset: */ static unsigned rw_aux_tree_bsearch(struct btree *b, struct bset_tree *t, @@ -869,7 +877,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, k = p; } - if (bch2_expensive_debug_checks) { + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { BUG_ON(ret >= orig_k); for (i = ret @@ -1195,7 +1203,7 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, bkey_iter_pos_cmp(b, m, search) < 0) m = bkey_p_next(m); - if (bch2_expensive_debug_checks) { + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); BUG_ON(prev && @@ -1435,9 +1443,9 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, void bch2_btree_node_iter_advance(struct btree_node_iter *iter, struct btree *b) { - if (bch2_expensive_debug_checks) { - bch2_btree_node_iter_verify(iter, b); - bch2_btree_node_iter_next_check(iter, b); + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { + __bch2_btree_node_iter_verify(iter, b); + __bch2_btree_node_iter_next_check(iter, b); } __bch2_btree_node_iter_advance(iter, b); @@ -1453,8 +1461,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct btree_node_iter_set *set; unsigned end = 0; - if (bch2_expensive_debug_checks) - bch2_btree_node_iter_verify(iter, b); + bch2_btree_node_iter_verify(iter, b); for_each_bset(b, t) { k = bch2_bkey_prev_all(b, t, @@ -1489,8 +1496,7 @@ found: iter->data[0].k = __btree_node_key_to_offset(b, prev); iter->data[0].end = end; - if (bch2_expensive_debug_checks) - bch2_btree_node_iter_verify(iter, b); + bch2_btree_node_iter_verify(iter, b); return prev; } diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 6953d55b72cc..a15ecf9d006e 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -517,27 +517,19 @@ void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned); void bch2_dump_btree_node(struct bch_fs *, 
struct btree *); void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); -#ifdef CONFIG_BCACHEFS_DEBUG - void __bch2_verify_btree_nr_keys(struct btree *); -void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); -void bch2_verify_insert_pos(struct btree *, struct bkey_packed *, - struct bkey_packed *, unsigned); - -#else +void __bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); -static inline void __bch2_verify_btree_nr_keys(struct btree *b) {} static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, - struct btree *b) {} -static inline void bch2_verify_insert_pos(struct btree *b, - struct bkey_packed *where, - struct bkey_packed *insert, - unsigned clobber_u64s) {} -#endif + struct btree *b) +{ + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) + __bch2_btree_node_iter_verify(iter, b); +} static inline void bch2_verify_btree_nr_keys(struct btree *b) { - if (bch2_debug_check_btree_accounting) + if (static_branch_unlikely(&bch2_debug_check_btree_accounting)) __bch2_verify_btree_nr_keys(b); } diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 899891295797..91e0aa796e6b 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -17,12 +17,6 @@ #include <linux/sched/mm.h> #include <linux/swap.h> -#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \ -do { \ - if (shrinker_counter) \ - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \ -} while (0) - const char * const bch2_btree_node_flags[] = { "typebit", "typebit", @@ -155,7 +149,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) b->data = kvmalloc(btree_buf_bytes(b), gfp); if (!b->data) - return -BCH_ERR_ENOMEM_btree_node_mem_alloc; + return bch_err_throw(c, ENOMEM_btree_node_mem_alloc); #ifdef __KERNEL__ b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp); #else @@ -168,7 +162,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) if (!b->aux_data) { kvfree(b->data); b->data = NULL; - return -BCH_ERR_ENOMEM_btree_node_mem_alloc; + return bch_err_throw(c, ENOMEM_btree_node_mem_alloc); } return 0; @@ -350,115 +344,118 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc, return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); } -/* - * this version is for btree nodes that have already been freed (we're not - * reaping a real btree node) - */ -static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter) +static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b, + bool flush, bool locked) { struct btree_cache *bc = &c->btree_cache; - int ret = 0; lockdep_assert_held(&bc->lock); -wait_on_io: - if (b->flags & ((1U << BTREE_NODE_dirty)| - (1U << BTREE_NODE_read_in_flight)| + + if (btree_node_noevict(b)) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); + } + if (btree_node_write_blocked(b)) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); + } + if (btree_node_will_make_reachable(b)) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); + } + + if (btree_node_dirty(b)) { + if (!flush) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); + } + + if (locked) { + /* + * Using the underscore version because we don't want to 
compact + * bsets after the write, since this node is about to be evicted + * - unless btree verify mode is enabled, since it runs out of + * the post write cleanup: + */ + if (static_branch_unlikely(&bch2_verify_btree_ondisk)) + bch2_btree_node_write(c, b, SIX_LOCK_intent, + BTREE_WRITE_cache_reclaim); + else + __bch2_btree_node_write(c, b, + BTREE_WRITE_cache_reclaim); + } + } + + if (b->flags & ((1U << BTREE_NODE_read_in_flight)| (1U << BTREE_NODE_write_in_flight))) { if (!flush) { - if (btree_node_dirty(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(dirty); - else if (btree_node_read_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); + if (btree_node_read_in_flight(b)) + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++; else if (btree_node_write_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); - return -BCH_ERR_ENOMEM_btree_node_reclaim; + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); } + if (locked) + return -EINTR; + /* XXX: waiting on IO with btree cache lock held */ bch2_btree_node_wait_on_read(b); bch2_btree_node_wait_on_write(b); } + return 0; +} + +/* + * this version is for btree nodes that have already been freed (we're not + * reaping a real btree node) + */ +static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) +{ + struct btree_cache *bc = &c->btree_cache; + int ret = 0; + + lockdep_assert_held(&bc->lock); +retry_unlocked: + ret = __btree_node_reclaim_checks(c, b, flush, false); + if (ret) + return ret; + if (!six_trylock_intent(&b->c.lock)) { - BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent); - return -BCH_ERR_ENOMEM_btree_node_reclaim; + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); } if (!six_trylock_write(&b->c.lock)) { - BTREE_CACHE_NOT_FREED_INCREMENT(lock_write); - goto out_unlock_intent; + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++; + six_unlock_intent(&b->c.lock); + return bch_err_throw(c, ENOMEM_btree_node_reclaim); } /* recheck under lock */ - if (b->flags & ((1U << BTREE_NODE_read_in_flight)| - (1U << BTREE_NODE_write_in_flight))) { - if (!flush) { - if (btree_node_read_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); - else if (btree_node_write_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); - goto out_unlock; - } + ret = __btree_node_reclaim_checks(c, b, flush, true); + if (ret) { six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); - goto wait_on_io; - } - - if (btree_node_noevict(b)) { - BTREE_CACHE_NOT_FREED_INCREMENT(noevict); - goto out_unlock; - } - if (btree_node_write_blocked(b)) { - BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked); - goto out_unlock; - } - if (btree_node_will_make_reachable(b)) { - BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable); - goto out_unlock; + if (ret == -EINTR) + goto retry_unlocked; + return ret; } - if (btree_node_dirty(b)) { - if (!flush) { - BTREE_CACHE_NOT_FREED_INCREMENT(dirty); - goto out_unlock; - } - /* - * Using the underscore version because we don't want to compact - * bsets after the write, since this node is about to be evicted - * - unless btree verify mode is enabled, since it runs out of - * the post write cleanup: - */ - if (bch2_verify_btree_ondisk) - bch2_btree_node_write(c, b, SIX_LOCK_intent, - BTREE_WRITE_cache_reclaim); - else - __bch2_btree_node_write(c, b, - BTREE_WRITE_cache_reclaim); - - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - goto 
wait_on_io; - } -out: if (b->hash_val && !ret) trace_and_count(c, btree_cache_reap, c, b); - return ret; -out_unlock: - six_unlock_write(&b->c.lock); -out_unlock_intent: - six_unlock_intent(&b->c.lock); - ret = -BCH_ERR_ENOMEM_btree_node_reclaim; - goto out; + return 0; } -static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter) +static int btree_node_reclaim(struct bch_fs *c, struct btree *b) { - return __btree_node_reclaim(c, b, false, shrinker_counter); + return __btree_node_reclaim(c, b, false); } static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) { - return __btree_node_reclaim(c, b, true, false); + return __btree_node_reclaim(c, b, true); } static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, @@ -476,7 +473,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, unsigned long ret = SHRINK_STOP; bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4; - if (bch2_btree_shrinker_disabled) + if (static_branch_unlikely(&bch2_btree_shrinker_disabled)) return SHRINK_STOP; mutex_lock(&bc->lock); @@ -490,7 +487,10 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, * IO can always make forward progress: */ can_free = btree_cache_can_free(list); - nr = min_t(unsigned long, nr, can_free); + if (nr > can_free) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_cache_reserve] += nr - can_free; + nr = can_free; + } i = 0; list_for_each_entry_safe(b, t, &bc->freeable, list) { @@ -506,7 +506,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, if (touched >= nr) goto out; - if (!btree_node_reclaim(c, b, true)) { + if (!btree_node_reclaim(c, b)) { btree_node_data_free(bc, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -522,7 +522,7 @@ restart: clear_btree_node_accessed(b); bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++; --touched;; - } else if (!btree_node_reclaim(c, b, true)) { + } else if (!btree_node_reclaim(c, b)) { __bch2_btree_node_hash_remove(bc, b); __btree_node_data_free(bc, b); @@ -569,7 +569,7 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink, { struct btree_cache_list *list = shrink->private_data; - if (bch2_btree_shrinker_disabled) + if (static_branch_unlikely(&bch2_btree_shrinker_disabled)) return 0; return btree_cache_can_free(list); @@ -682,7 +682,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) return 0; err: - return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return bch_err_throw(c, ENOMEM_fs_btree_cache_init); } void bch2_fs_btree_cache_init_early(struct btree_cache *bc) @@ -727,7 +727,7 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure if (!cl) { trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); - return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock; + return bch_err_throw(c, ENOMEM_btree_cache_cannibalize_lock); } closure_wait(&bc->alloc_wait, cl); @@ -741,7 +741,7 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure } trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); - return -BCH_ERR_btree_cache_cannibalize_lock_blocked; + return bch_err_throw(c, btree_cache_cannibalize_lock_blocked); success: trace_and_count(c, btree_cache_cannibalize_lock, trans); @@ -755,7 +755,7 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) list_for_each_entry_reverse(b, &bc->live[i].list, list) - if (!btree_node_reclaim(c, b, false)) + if (!btree_node_reclaim(c, b)) 
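/*
 * Editor's note, not part of the patch: the hunks above split the old
 * __btree_node_reclaim() into __btree_node_reclaim_checks(c, b, flush, locked)
 * plus a retry loop.  The checks run once without the node locked and again
 * after six_trylock_intent()/six_trylock_write(); when the locked pass finds
 * IO still in flight it returns -EINTR, the caller drops both node locks and
 * jumps back to retry_unlocked so the wait happens without holding the node
 * lock.  The same hunks also replace bare "return -BCH_ERR_*" statements with
 * bch_err_throw(c, *); its definition is outside this section, but as used
 * throughout the diff it still yields the -BCH_ERR_* errcode while letting
 * the error be attributed to the filesystem instance c.
 */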
return b; while (1) { @@ -790,7 +790,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea * disk node. Check the freed list before allocating a new one: */ list_for_each_entry(b, freed, list) - if (!btree_node_reclaim(c, b, false)) { + if (!btree_node_reclaim(c, b)) { list_del_init(&b->list); goto got_node; } @@ -817,7 +817,7 @@ got_node: * the list. Check if there's any freed nodes there: */ list_for_each_entry(b2, &bc->freeable, list) - if (!btree_node_reclaim(c, b2, false)) { + if (!btree_node_reclaim(c, b2)) { swap(b->data, b2->data); swap(b->aux_data, b2->aux_data); @@ -977,7 +977,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, /* Unlock before doing IO: */ six_unlock_intent(&b->c.lock); - bch2_trans_unlock_noassert(trans); + bch2_trans_unlock(trans); bch2_btree_node_read(trans, b, sync); @@ -1003,7 +1003,7 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) { struct printbuf buf = PRINTBUF; - if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) + if (c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations) return; prt_printf(&buf, @@ -1492,9 +1492,10 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc prt_btree_cache_line(out, c, "live:", bc->live[0].nr); prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr); - prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable); + prt_btree_cache_line(out, c, "reserve:", bc->nr_reserve); + prt_btree_cache_line(out, c, "freed:", bc->nr_freeable); prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty)); - prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); + prt_printf(out, "cannibalize lock:\t%s\n", bc->alloc_lock ? "held" : "not held"); prt_newline(out); for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) { @@ -1505,6 +1506,7 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc } prt_newline(out); + prt_printf(out, "counters since mount:\n"); prt_printf(out, "freed:\t%zu\n", bc->nr_freed); prt_printf(out, "not freed:\n"); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 37b69d89341f..9ddcbe1bda78 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -22,6 +22,7 @@ #include "debug.h" #include "disk_accounting.h" #include "ec.h" +#include "enumerated_ref.h" #include "error.h" #include "extents.h" #include "journal.h" @@ -149,7 +150,7 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); if (!new) - return -BCH_ERR_ENOMEM_gc_repair_key; + return bch_err_throw(c, ENOMEM_gc_repair_key); btree_ptr_to_v2(b, new); b->data->min_key = new_min; @@ -189,7 +190,7 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); if (!new) - return -BCH_ERR_ENOMEM_gc_repair_key; + return bch_err_throw(c, ENOMEM_gc_repair_key); btree_ptr_to_v2(b, new); b->data->max_key = new_max; @@ -370,20 +371,13 @@ again: prt_char(&buf, ' '); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); - if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), - trans, btree_node_read_error, - "Topology repair: unreadable btree node at\n%s", - buf.buf)) { + if (bch2_err_matches(ret, EIO)) { bch2_btree_node_evict(trans, cur_k.k); cur = NULL; ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); if (ret) break; - - ret = bch2_btree_lost_data(c, 
b->c.btree_id); - if (ret) - break; continue; } @@ -545,9 +539,6 @@ int bch2_check_topology(struct bch_fs *c) bch2_btree_id_to_text(&buf, i); if (r->error) { - ret = bch2_btree_lost_data(c, i); - if (ret) - break; reconstruct_root: bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); @@ -628,7 +619,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, deleted.p = k.k->p; if (initial) { - BUG_ON(bch2_journal_seq_verify && + BUG_ON(static_branch_unlikely(&bch2_journal_seq_verify) && k.k->bversion.lo > atomic64_read(&c->journal.seq)); if (fsck_err_on(btree_id != BTREE_ID_accounting && @@ -944,7 +935,7 @@ static int bch2_gc_alloc_start(struct bch_fs *c) ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL); if (ret) { bch2_dev_put(ca); - ret = -BCH_ERR_ENOMEM_gc_alloc_start; + ret = bch_err_throw(c, ENOMEM_gc_alloc_start); break; } } @@ -1088,6 +1079,10 @@ out: * allocator thread - issue wakeup in case they blocked on gc_lock: */ closure_wake_up(&c->freelist_wait); + + if (!ret && !test_bit(BCH_FS_errors_not_fixed, &c->flags)) + bch2_sb_members_clean_deleted(c); + bch_err_fn(c, ret); return ret; } @@ -1098,42 +1093,41 @@ static int gc_btree_gens_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bkey_i *u; - int ret; if (unlikely(test_bit(BCH_FS_going_ro, &c->flags))) return -EROFS; - rcu_read_lock(); - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (!ca) - continue; + bool too_stale = false; + scoped_guard(rcu) { + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca) + continue; - if (dev_ptr_stale(ca, ptr) > 16) { - rcu_read_unlock(); - goto update; + too_stale |= dev_ptr_stale(ca, ptr) > 16; } + + if (!too_stale) + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca) + continue; + + u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; + if (gen_after(*gen, ptr->gen)) + *gen = ptr->gen; + } } - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (!ca) - continue; + if (too_stale) { + struct bkey_i *u = bch2_bkey_make_mut(trans, iter, &k, 0); + int ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; - u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; - if (gen_after(*gen, ptr->gen)) - *gen = ptr->gen; + bch2_extent_normalize(c, bkey_i_to_s(u)); } - rcu_read_unlock(); - return 0; -update: - u = bch2_bkey_make_mut(trans, iter, &k, 0); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; - bch2_extent_normalize(c, bkey_i_to_s(u)); return 0; } @@ -1186,7 +1180,7 @@ int bch2_gc_gens(struct bch_fs *c) ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL); if (!ca->oldest_gen) { bch2_dev_put(ca); - ret = -BCH_ERR_ENOMEM_gc_gens; + ret = bch_err_throw(c, ENOMEM_gc_gens); goto err; } @@ -1256,26 +1250,21 @@ static void bch2_gc_gens_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work); bch2_gc_gens(c); - bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens); } void bch2_gc_gens_async(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_gc_gens) && + if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_gc_gens) && !queue_work(c->write_ref_wq, &c->gc_gens_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens); } -void bch2_fs_btree_gc_exit(struct bch_fs *c) -{ -} - 
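/*
 * Editor's note, not part of the patch: gc_btree_gens_key() above is one of
 * several conversions in this diff from open-coded rcu_read_lock()/
 * rcu_read_unlock() pairs to the scope-based guards from <linux/cleanup.h>
 * (the rcu guard class is declared in <linux/rcupdate.h>).  Below is a
 * minimal sketch of the before/after shape; the function names are
 * hypothetical, while bch2_dev_rcu(), dev_ptr_stale() and bkey_for_each_ptr()
 * are the bcachefs helpers already used in the hunk above and are assumed to
 * come from the surrounding headers.
 */
#include <linux/rcupdate.h>
#include <linux/cleanup.h>

static bool any_ptr_too_stale_old(struct bch_fs *c, struct bkey_ptrs_c ptrs)
{
	bool too_stale = false;

	rcu_read_lock();
	bkey_for_each_ptr(ptrs, ptr) {
		struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
		if (ca)
			too_stale |= dev_ptr_stale(ca, ptr) > 16;
	}
	rcu_read_unlock();		/* must not be forgotten on any path */

	return too_stale;
}

static bool any_ptr_too_stale_new(struct bch_fs *c, struct bkey_ptrs_c ptrs)
{
	bool too_stale = false;

	/* rcu_read_unlock() is issued automatically when the scope is left */
	scoped_guard(rcu)
		bkey_for_each_ptr(ptrs, ptr) {
			struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
			if (ca)
				too_stale |= dev_ptr_stale(ca, ptr) > 16;
		}

	return too_stale;
}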
-int bch2_fs_btree_gc_init(struct bch_fs *c) +void bch2_fs_btree_gc_init_early(struct bch_fs *c) { seqcount_init(&c->gc_pos_lock); INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work); init_rwsem(&c->gc_lock); mutex_init(&c->gc_gens_lock); - return 0; } diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 9693a90a48a2..ec77662369a2 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -83,7 +83,6 @@ void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *); int bch2_gc_gens(struct bch_fs *); void bch2_gc_gens_async(struct bch_fs *); -void bch2_fs_btree_gc_exit(struct bch_fs *); -int bch2_fs_btree_gc_init(struct bch_fs *); +void bch2_fs_btree_gc_init_early(struct bch_fs *); #endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 60782f3e5aec..57eff3012a7b 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "async_objs.h" #include "bkey_buf.h" #include "bkey_methods.h" #include "bkey_sort.h" @@ -13,6 +14,7 @@ #include "buckets.h" #include "checksum.h" #include "debug.h" +#include "enumerated_ref.h" #include "error.h" #include "extents.h" #include "io_write.h" @@ -514,19 +516,23 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct bch_dev *ca, + bool print_pos, struct btree *b, struct bset *i, struct bkey_packed *k, - unsigned offset, int write) + unsigned offset, int rw) { - prt_printf(out, bch2_log_msg(c, "%s"), - write == READ - ? "error validating btree node " - : "corrupt btree node before write "); + if (print_pos) { + prt_str(out, rw == READ + ? "error validating btree node " + : "corrupt btree node before write "); + prt_printf(out, "at btree "); + bch2_btree_pos_to_text(out, c, b); + prt_newline(out); + } + if (ca) - prt_printf(out, "on %s ", ca->name); - prt_printf(out, "at btree "); - bch2_btree_pos_to_text(out, c, b); + prt_printf(out, "%s ", ca->name); - prt_printf(out, "\nnode offset %u/%u", + prt_printf(out, "node offset %u/%u", b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key))); if (i) prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); @@ -537,75 +543,110 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, prt_str(out, ": "); } -__printf(10, 11) +__printf(11, 12) static int __btree_err(int ret, struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, struct bkey_packed *k, - int write, - bool have_retry, + int rw, enum bch_sb_error_id err_type, + struct bch_io_failures *failed, + struct printbuf *err_msg, const char *fmt, ...) 
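/*
 * Editor's note, not part of the patch: __btree_err() loses the have_retry/
 * saw_error parameters in favour of a struct bch_io_failures *failed and a
 * struct printbuf *err_msg.  As the body below shows, have_retry is now
 * derived internally: the failing device is recorded in *failed and
 * bch2_bkey_pick_read_device() is asked whether another replica remains
 * readable.  Read-side error text is accumulated in *err_msg so the caller
 * (btree_node_read_work) can print one combined message per node instead of
 * one line per bset error.
 */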
{ - bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes; + if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) + return bch_err_throw(c, fsck_fix); + + bool have_retry = false; + int ret2; + + if (ca) { + bch2_mark_btree_validate_failure(failed, ca->dev_idx); + + struct extent_ptr_decoded pick; + have_retry = !bch2_bkey_pick_read_device(c, + bkey_i_to_s_c(&b->key), + failed, &pick, -1); + } if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry) - ret = -BCH_ERR_btree_node_read_err_fixable; + ret = bch_err_throw(c, btree_node_read_err_fixable); if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) - ret = -BCH_ERR_btree_node_read_err_bad_node; + ret = bch_err_throw(c, btree_node_read_err_bad_node); + + bch2_sb_error_count(c, err_type); - if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable) - bch2_sb_error_count(c, err_type); + bool print_deferred = err_msg && + rw == READ && + !(test_bit(BCH_FS_in_fsck, &c->flags) && + c->opts.fix_errors == FSCK_FIX_ask); struct printbuf out = PRINTBUF; - if (write != WRITE && ret != -BCH_ERR_btree_node_read_err_fixable) { - printbuf_indent_add_nextline(&out, 2); -#ifdef BCACHEFS_LOG_PREFIX - prt_printf(&out, bch2_log_msg(c, "")); -#endif - } + bch2_log_msg_start(c, &out); + + if (!print_deferred) + err_msg = &out; - btree_err_msg(&out, c, ca, b, i, k, b->written, write); + btree_err_msg(err_msg, c, ca, !print_deferred, b, i, k, b->written, rw); va_list args; va_start(args, fmt); - prt_vprintf(&out, fmt, args); + prt_vprintf(err_msg, fmt, args); va_end(args); - if (write == WRITE) { + if (print_deferred) { + prt_newline(err_msg); + + switch (ret) { + case -BCH_ERR_btree_node_read_err_fixable: + ret2 = bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type); + if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) && + !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) { + ret = ret2; + goto fsck_err; + } + + if (!have_retry) + ret = bch_err_throw(c, fsck_fix); + goto out; + case -BCH_ERR_btree_node_read_err_bad_node: + prt_str(&out, ", "); + ret = __bch2_topology_error(c, &out); + break; + } + + goto out; + } + + if (rw == WRITE) { prt_str(&out, ", "); ret = __bch2_inconsistent_error(c, &out) ? -BCH_ERR_fsck_errors_not_fixed : 0; - silent = false; + goto print; } switch (ret) { case -BCH_ERR_btree_node_read_err_fixable: - ret = !silent - ? __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf) - : -BCH_ERR_fsck_fix; - if (ret != -BCH_ERR_fsck_fix && - ret != -BCH_ERR_fsck_ignore) + ret2 = __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf); + if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) && + !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) { + ret = ret2; goto fsck_err; - ret = -BCH_ERR_fsck_fix; + } + + if (!have_retry) + ret = bch_err_throw(c, fsck_fix); goto out; case -BCH_ERR_btree_node_read_err_bad_node: prt_str(&out, ", "); ret = __bch2_topology_error(c, &out); - if (ret) - silent = false; - break; - case -BCH_ERR_btree_node_read_err_incompatible: - ret = -BCH_ERR_fsck_errors_not_fixed; - silent = false; break; } - - if (!silent) - bch2_print_string_as_lines(KERN_ERR, out.buf); +print: + bch2_print_str(c, KERN_ERR, out.buf); out: fsck_err: printbuf_exit(&out); @@ -614,16 +655,17 @@ fsck_err: #define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) 
\ ({ \ - int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry, \ + int _ret = __btree_err(type, c, ca, b, i, k, write, \ BCH_FSCK_ERR_##_err_type, \ + failed, err_msg, \ msg, ##__VA_ARGS__); \ \ - if (_ret != -BCH_ERR_fsck_fix) { \ + if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix)) { \ ret = _ret; \ goto fsck_err; \ } \ \ - *saw_error = true; \ + true; \ }) #define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false) @@ -681,8 +723,9 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) static int validate_bset(struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, - unsigned offset, unsigned sectors, - int write, bool have_retry, bool *saw_error) + unsigned offset, unsigned sectors, int write, + struct bch_io_failures *failed, + struct printbuf *err_msg) { unsigned version = le16_to_cpu(i->version); unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); @@ -895,7 +938,8 @@ static inline int btree_node_read_bkey_cmp(const struct btree *b, static int validate_bset_keys(struct bch_fs *c, struct btree *b, struct bset *i, int write, - bool have_retry, bool *saw_error) + struct bch_io_failures *failed, + struct printbuf *err_msg) { unsigned version = le16_to_cpu(i->version); struct bkey_packed *k, *prev = NULL; @@ -1008,7 +1052,9 @@ fsck_err: } int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, - struct btree *b, bool have_retry, bool *saw_error) + struct btree *b, + struct bch_io_failures *failed, + struct printbuf *err_msg) { struct btree_node_entry *bne; struct sort_iter *iter; @@ -1018,11 +1064,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bool used_mempool, blacklisted; bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); - unsigned u64s; unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); u64 max_journal_seq = 0; struct printbuf buf = PRINTBUF; - int ret = 0, retry_read = 0, write = READ; + int ret = 0, write = READ; u64 start_time = local_clock(); b->version_ondisk = U16_MAX; @@ -1156,15 +1201,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, b->version_ondisk = min(b->version_ondisk, le16_to_cpu(i->version)); - ret = validate_bset(c, ca, b, i, b->written, sectors, - READ, have_retry, saw_error); + ret = validate_bset(c, ca, b, i, b->written, sectors, READ, failed, err_msg); if (ret) goto fsck_err; if (!b->written) btree_node_set_format(b, b->data->format); - ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error); + ret = validate_bset_keys(c, b, i, READ, failed, err_msg); if (ret) goto fsck_err; @@ -1225,23 +1269,20 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool); sorted->keys.u64s = 0; - set_btree_bset(b, b->set, &b->data->keys); - b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter); memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0, btree_buf_bytes(b) - sizeof(struct btree_node) - b->nr.live_u64s * sizeof(u64)); - u64s = le16_to_cpu(sorted->keys.u64s); + b->data->keys.u64s = sorted->keys.u64s; *sorted = *b->data; - sorted->keys.u64s = cpu_to_le16(u64s); swap(sorted, b->data); set_btree_bset(b, b->set, &b->data->keys); b->nsets = 1; b->data->keys.journal_seq = cpu_to_le64(max_journal_seq); - BUG_ON(b->nr.live_u64s != u64s); + BUG_ON(b->nr.live_u64s != le16_to_cpu(b->data->keys.u64s)); btree_bounce_free(c, btree_buf_bytes(b), used_mempool, 
sorted); @@ -1255,7 +1296,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, ret = btree_node_bkey_val_validate(c, b, u.s_c, READ); if (ret == -BCH_ERR_fsck_delete_bkey || - (bch2_inject_invalid_keys && + (static_branch_unlikely(&bch2_inject_invalid_keys) && !bversion_cmp(u.k->bversion, MAX_VERSION))) { btree_keys_account_key_drop(&b->nr, 0, k); @@ -1284,31 +1325,21 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_node_reset_sib_u64s(b); - rcu_read_lock(); - bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { - struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev); + scoped_guard(rcu) + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { + struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev); - if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) - set_btree_node_need_rewrite(b); - } - rcu_read_unlock(); + if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) + set_btree_node_need_rewrite(b); + } if (!ptr_written) set_btree_node_need_rewrite(b); -out: +fsck_err: mempool_free(iter, &c->fill_iter); printbuf_exit(&buf); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time); - return retry_read; -fsck_err: - if (ret == -BCH_ERR_btree_node_read_err_want_retry || - ret == -BCH_ERR_btree_node_read_err_must_retry) { - retry_read = 1; - } else { - set_btree_node_read_error(b); - bch2_btree_lost_data(c, b->c.btree_id); - } - goto out; + return ret; } static void btree_node_read_work(struct work_struct *work) @@ -1320,16 +1351,26 @@ static void btree_node_read_work(struct work_struct *work) struct btree *b = rb->b; struct bio *bio = &rb->bio; struct bch_io_failures failed = { .nr = 0 }; + int ret = 0; + struct printbuf buf = PRINTBUF; - bool saw_error = false; - bool retry = false; - bool can_retry; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "btree node read error at btree "); + bch2_btree_pos_to_text(&buf, c, b); + prt_newline(&buf); goto start; while (1) { - retry = true; - bch_info(c, "retrying read"); - ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ); + ret = bch2_bkey_pick_read_device(c, + bkey_i_to_s_c(&b->key), + &failed, &rb->pick, -1); + if (ret) { + set_btree_node_read_error(b); + break; + } + + ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read); rb->have_ioref = ca != NULL; rb->start_time = local_clock(); bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); @@ -1346,59 +1387,59 @@ static void btree_node_read_work(struct work_struct *work) bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, rb->start_time, !bio->bi_status); start: - printbuf_reset(&buf); - bch2_btree_pos_to_text(&buf, c, b); - - if (ca && bio->bi_status) - bch_err_dev_ratelimited(ca, - "btree read error %s for %s", - bch2_blk_status_to_str(bio->bi_status), buf.buf); if (rb->have_ioref) - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_read); rb->have_ioref = false; - bch2_mark_io_failure(&failed, &rb->pick, false); - - can_retry = bch2_bkey_pick_read_device(c, - bkey_i_to_s_c(&b->key), - &failed, &rb->pick, -1) > 0; - - if (!bio->bi_status && - !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) { - if (retry) - bch_info(c, "retry success"); - break; + if (bio->bi_status) { + bch2_mark_io_failure(&failed, &rb->pick, false); + continue; } - saw_error = true; + ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf); + if (ret == -BCH_ERR_btree_node_read_err_want_retry || + ret == -BCH_ERR_btree_node_read_err_must_retry) + continue; - if 
(!can_retry) { + if (ret) set_btree_node_read_error(b); - bch2_btree_lost_data(c, b->c.btree_id); - break; - } + + break; } - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], - rb->start_time); - bio_put(&rb->bio); + bch2_io_failures_to_text(&buf, c, &failed); + + if (btree_node_read_error(b)) + bch2_btree_lost_data(c, &buf, b->c.btree_id); + + /* + * only print retry success if we read from a replica with no errors + */ + if (btree_node_read_error(b)) + prt_printf(&buf, "ret %s", bch2_err_str(ret)); + else if (failed.nr) { + if (!bch2_dev_io_failures(&failed, rb->pick.ptr.dev)) + prt_printf(&buf, "retry success"); + else + prt_printf(&buf, "repair success"); + } - if ((saw_error || + if ((failed.nr || btree_node_need_rewrite(b)) && !btree_node_read_error(b) && - c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { - if (saw_error) { - printbuf_reset(&buf); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, " "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s", - __func__, buf.buf); - } - + c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { + prt_printf(&buf, " (rewriting node)"); bch2_btree_node_rewrite_async(c, b); } + prt_newline(&buf); + + if (failed.nr) + bch2_print_str_ratelimited(c, KERN_ERR, buf.buf); + async_object_list_del(c, btree_read_bio, rb->list_idx); + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], + rb->start_time); + bio_put(&rb->bio); printbuf_exit(&buf); clear_btree_node_read_in_flight(b); smp_mb__after_atomic(); @@ -1419,6 +1460,11 @@ static void btree_node_read_endio(struct bio *bio) queue_work(c->btree_read_complete_wq, &rb->work); } +void bch2_btree_read_bio_to_text(struct printbuf *out, struct btree_read_bio *rbio) +{ + bch2_bio_to_text(out, &rbio->bio); +} + struct btree_node_read_all { struct closure cl; struct bch_fs *c; @@ -1478,12 +1524,13 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done) struct btree *b = ra->b; struct printbuf buf = PRINTBUF; bool dump_bset_maps = false; - bool have_retry = false; int ret = 0, best = -1, write = READ; unsigned i, written = 0, written2 = 0; __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 ? 
bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; bool _saw_error = false, *saw_error = &_saw_error; + struct printbuf *err_msg = NULL; + struct bch_io_failures *failed = NULL; for (i = 0; i < ra->nr; i++) { struct btree_node *bn = ra->buf[i]; @@ -1576,14 +1623,19 @@ fsck_err: if (best >= 0) { memcpy(b->data, ra->buf[best], btree_buf_bytes(b)); - ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error); + ret = bch2_btree_node_read_done(c, NULL, b, NULL, NULL); } else { ret = -1; } if (ret) { set_btree_node_read_error(b); - bch2_btree_lost_data(c, b->c.btree_id); + + struct printbuf buf = PRINTBUF; + bch2_btree_lost_data(c, &buf, b->c.btree_id); + if (buf.pos) + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); } else if (*saw_error) bch2_btree_node_rewrite_async(c, b); @@ -1612,7 +1664,8 @@ static void btree_node_read_all_replicas_endio(struct bio *bio) struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); bch2_latency_acct(ca, rb->start_time, READ); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_btree_node_read_all_replicas); } ra->err[rb->idx] = bio->bi_status; @@ -1634,7 +1687,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool ra = kzalloc(sizeof(*ra), GFP_NOFS); if (!ra) - return -BCH_ERR_ENOMEM_btree_node_read_all_replicas; + return bch_err_throw(c, ENOMEM_btree_node_read_all_replicas); closure_init(&ra->cl, NULL); ra->c = c; @@ -1652,7 +1705,8 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool i = 0; bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) { - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, + BCH_DEV_READ_REF_btree_node_read_all_replicas); struct btree_read_bio *rb = container_of(ra->bio[i], struct btree_read_bio, bio); rb->c = c; @@ -1703,7 +1757,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, trace_and_count(c, btree_node_read, trans, b); - if (bch2_verify_all_btree_replicas && + if (static_branch_unlikely(&bch2_verify_all_btree_replicas) && !btree_node_read_all_replicas(c, b, sync)) return; @@ -1711,26 +1765,34 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, NULL, &pick, -1); if (ret <= 0) { + bool ratelimit = true; struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); prt_str(&buf, "btree node read error: no device to read from\n at "); bch2_btree_pos_to_text(&buf, c, b); - bch_err_ratelimited(c, "%s", buf.buf); - - if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) - bch2_fatal_error(c); + prt_newline(&buf); + bch2_btree_lost_data(c, &buf, b->c.btree_id); + + if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && + bch2_fs_emergency_read_only2(c, &buf)) + ratelimit = false; + + static DEFINE_RATELIMIT_STATE(rs, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + if (!ratelimit || __ratelimit(&rs)) + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); set_btree_node_read_error(b); - bch2_btree_lost_data(c, b->c.btree_id); clear_btree_node_read_in_flight(b); smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); - printbuf_exit(&buf); return; } - ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read); bio = bio_alloc_bioset(NULL, buf_pages(b->data, btree_buf_bytes(b)), @@ -1749,6 +1811,8 @@ void 
bch2_btree_node_read(struct btree_trans *trans, struct btree *b, bio->bi_end_io = btree_node_read_endio; bch2_bio_map(bio, b->data, btree_buf_bytes(b)); + async_object_list_add(c, btree_read_bio, rb, &rb->list_idx); + if (rb->have_ioref) { this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], bio_sectors(bio)); @@ -1805,7 +1869,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, bch2_btree_node_hash_remove(&c->btree_cache, b); mutex_unlock(&c->btree_cache.lock); - ret = -BCH_ERR_btree_node_read_error; + ret = bch_err_throw(c, btree_node_read_error); goto err; } @@ -1922,7 +1986,7 @@ static void btree_node_scrub_work(struct work_struct *work) bch_err(c, "error validating btree node during scrub on %s at btree %s", scrub->ca->name, err.buf); - ret = bch2_btree_node_rewrite(trans, &iter, b, 0); + ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0); } err: bch2_trans_iter_exit(trans, &iter); @@ -1933,9 +1997,9 @@ err: printbuf_exit(&err); bch2_bkey_buf_exit(&scrub->key, c);; btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf); - percpu_ref_put(&scrub->ca->io_ref[READ]); + enumerated_ref_put(&scrub->ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub); kfree(scrub); - bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub); } static void btree_node_scrub_endio(struct bio *bio) @@ -1954,17 +2018,18 @@ int bch2_btree_node_scrub(struct btree_trans *trans, struct bch_fs *c = trans->c; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_node_scrub)) - return -BCH_ERR_erofs_no_writes; + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_node_scrub)) + return bch_err_throw(c, erofs_no_writes); struct extent_ptr_decoded pick; int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev); if (ret <= 0) goto err; - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, + BCH_DEV_READ_REF_btree_node_scrub); if (!ca) { - ret = -BCH_ERR_device_offline; + ret = bch_err_throw(c, device_offline); goto err; } @@ -2002,9 +2067,9 @@ int bch2_btree_node_scrub(struct btree_trans *trans, return 0; err_free: btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub); err: - bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub); return ret; } @@ -2101,7 +2166,7 @@ static void btree_node_write_work(struct work_struct *work) bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) { - ret = -BCH_ERR_btree_node_write_all_failed; + ret = bch_err_throw(c, btree_node_write_all_failed); goto err; } @@ -2121,6 +2186,7 @@ static void btree_node_write_work(struct work_struct *work) goto err; } out: + async_object_list_del(c, btree_write_bio, wbio->list_idx); bio_put(&wbio->wbio.bio); btree_node_write_done(c, b, start_time); return; @@ -2172,7 +2238,8 @@ static void btree_node_write_endio(struct bio *bio) * btree writes yet (due to device removal/ro): */ if (wbio->have_ioref) - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_btree_node_write); if (parent) { bio_put(bio); @@ -2184,14 +2251,12 @@ static void btree_node_write_endio(struct bio *bio) smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner); INIT_WORK(&wb->work, 
btree_node_write_work); - queue_work(c->btree_io_complete_wq, &wb->work); + queue_work(c->btree_write_complete_wq, &wb->work); } static int validate_bset_for_write(struct bch_fs *c, struct btree *b, struct bset *i, unsigned sectors) { - bool saw_error; - int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key), (struct bkey_validate_context) { .from = BKEY_VALIDATE_btree_node, @@ -2204,8 +2269,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, return ret; } - ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?: - validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error); + ret = validate_bset_keys(c, b, i, WRITE, NULL, NULL) ?: + validate_bset(c, NULL, b, i, b->written, sectors, WRITE, NULL, NULL); if (ret) { bch2_inconsistent_error(c); dump_stack(); @@ -2472,6 +2537,8 @@ do_write: atomic64_inc(&c->btree_write_stats[type].nr); atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes); + async_object_list_add(c, btree_write_bio, wbio, &wbio->list_idx); + INIT_WORK(&wbio->work, btree_write_submit); queue_work(c->btree_write_submit_wq, &wbio->work); return; diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index dbf76d22c660..30a5180532c8 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -41,6 +41,9 @@ struct btree_read_bio { u64 start_time; unsigned have_ioref:1; unsigned idx:7; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + unsigned list_idx; +#endif struct extent_ptr_decoded pick; struct work_struct work; struct bio bio; @@ -53,6 +56,9 @@ struct btree_write_bio { unsigned data_bytes; unsigned sector_offset; u64 start_time; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + unsigned list_idx; +#endif struct bch_write_bio wbio; }; @@ -128,11 +134,15 @@ void bch2_btree_build_aux_trees(struct btree *); void bch2_btree_init_next(struct btree_trans *, struct btree *); int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, - struct btree *, bool, bool *); + struct btree *, + struct bch_io_failures *, + struct printbuf *); void bch2_btree_node_read(struct btree_trans *, struct btree *, bool); int bch2_btree_root_read(struct bch_fs *, enum btree_id, const struct bkey_i *, unsigned); +void bch2_btree_read_bio_to_text(struct printbuf *, struct btree_read_bio *); + int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, unsigned); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ac5f2046550d..b78403376c07 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -16,6 +16,7 @@ #include "journal_io.h" #include "replicas.h" #include "snapshot.h" +#include "super.h" #include "trace.h" #include <linux/random.h> @@ -114,11 +115,9 @@ static inline bool btree_path_pos_in_node(struct btree_path *path, !btree_path_pos_after_node(path, b); } -/* Btree iterator: */ +/* Debug: */ -#ifdef CONFIG_BCACHEFS_DEBUG - -static void bch2_btree_path_verify_cached(struct btree_trans *trans, +static void __bch2_btree_path_verify_cached(struct btree_trans *trans, struct btree_path *path) { struct bkey_cached *ck; @@ -135,7 +134,7 @@ static void bch2_btree_path_verify_cached(struct btree_trans *trans, btree_node_unlock(trans, path, 0); } -static void bch2_btree_path_verify_level(struct btree_trans *trans, +static void __bch2_btree_path_verify_level(struct btree_trans *trans, struct btree_path *path, unsigned level) { struct btree_path_level *l; @@ -147,16 +146,13 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, struct printbuf buf3 = PRINTBUF; const 
char *msg; - if (!bch2_debug_check_iterators) - return; - l = &path->l[level]; tmp = l->iter; locked = btree_node_locked(path, level); if (path->cached) { if (!level) - bch2_btree_path_verify_cached(trans, path); + __bch2_btree_path_verify_cached(trans, path); return; } @@ -217,7 +213,7 @@ err: msg, level, buf1.buf, buf2.buf, buf3.buf); } -static void bch2_btree_path_verify(struct btree_trans *trans, +static void __bch2_btree_path_verify(struct btree_trans *trans, struct btree_path *path) { struct bch_fs *c = trans->c; @@ -229,22 +225,22 @@ static void bch2_btree_path_verify(struct btree_trans *trans, break; } - bch2_btree_path_verify_level(trans, path, i); + __bch2_btree_path_verify_level(trans, path, i); } - bch2_btree_path_verify_locks(path); + bch2_btree_path_verify_locks(trans, path); } -void bch2_trans_verify_paths(struct btree_trans *trans) +void __bch2_trans_verify_paths(struct btree_trans *trans) { struct btree_path *path; unsigned iter; trans_for_each_path(trans, path, iter) - bch2_btree_path_verify(trans, path); + __bch2_btree_path_verify(trans, path); } -static void bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter) +static void __bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter) { BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached); @@ -256,11 +252,11 @@ static void bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter !btree_type_has_snapshot_field(iter->btree_id)); if (iter->update_path) - bch2_btree_path_verify(trans, &trans->paths[iter->update_path]); - bch2_btree_path_verify(trans, btree_iter_path(trans, iter)); + __bch2_btree_path_verify(trans, &trans->paths[iter->update_path]); + __bch2_btree_path_verify(trans, btree_iter_path(trans, iter)); } -static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) +static void __bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) { BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && !iter->pos.snapshot); @@ -274,16 +270,13 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) bkey_gt(iter->pos, iter->k.p))); } -static int bch2_btree_iter_verify_ret(struct btree_trans *trans, - struct btree_iter *iter, struct bkey_s_c k) +static int __bch2_btree_iter_verify_ret(struct btree_trans *trans, + struct btree_iter *iter, struct bkey_s_c k) { struct btree_iter copy; struct bkey_s_c prev; int ret = 0; - if (!bch2_debug_check_iterators) - return 0; - if (!(iter->flags & BTREE_ITER_filter_snapshots)) return 0; @@ -324,7 +317,7 @@ out: return ret; } -void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, +void __bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, struct bpos pos) { bch2_trans_verify_not_unlocked_or_in_restart(trans); @@ -357,19 +350,40 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf); } -#else - static inline void bch2_btree_path_verify_level(struct btree_trans *trans, - struct btree_path *path, unsigned l) {} + struct btree_path *path, unsigned l) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + __bch2_btree_path_verify_level(trans, path, l); +} + static inline void bch2_btree_path_verify(struct btree_trans *trans, - struct btree_path *path) {} + struct btree_path *path) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + __bch2_btree_path_verify(trans, path); +} + static inline void bch2_btree_iter_verify(struct btree_trans *trans, - struct 
btree_iter *iter) {} -static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} -static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k) { return 0; } + struct btree_iter *iter) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + __bch2_btree_iter_verify(trans, iter); +} -#endif +static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + __bch2_btree_iter_verify_entry_exit(iter); +} + +static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) +{ + return static_branch_unlikely(&bch2_debug_check_iterators) + ? __bch2_btree_iter_verify_ret(trans, iter, k) + : 0; +} /* Btree path: fixups after btree updates */ @@ -523,7 +537,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, __bch2_btree_node_iter_fix(path, b, node_iter, t, where, clobber_u64s, new_u64s); - if (bch2_debug_check_iterators) + if (static_branch_unlikely(&bch2_debug_check_iterators)) bch2_btree_node_iter_verify(node_iter, b); } @@ -876,8 +890,7 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, struct btree_path *path, - unsigned flags, - struct bkey_buf *out) + unsigned flags) { struct bch_fs *c = trans->c; struct btree_path_level *l = path_l(path); @@ -901,7 +914,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, goto err; } - bch2_bkey_buf_reassemble(out, c, k); + bkey_reassemble(&trans->btree_path_down, k); if ((flags & BTREE_ITER_prefetch) && c->opts.btree_node_prefetch) @@ -912,6 +925,22 @@ err: return ret; } +static noinline_for_stack int btree_node_missing_err(struct btree_trans *trans, + struct btree_path *path) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "node not found at pos "); + bch2_bpos_to_text(&buf, path->pos); + prt_str(&buf, " within parent node "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&path_l(path)->b->key)); + + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); + return bch_err_throw(c, btree_need_topology_repair); +} + static __always_inline int btree_path_down(struct btree_trans *trans, struct btree_path *path, unsigned flags, @@ -922,51 +951,38 @@ static __always_inline int btree_path_down(struct btree_trans *trans, struct btree *b; unsigned level = path->level - 1; enum six_lock_type lock_type = __btree_lock_want(path, level); - struct bkey_buf tmp; int ret; EBUG_ON(!btree_node_locked(path, path->level)); - bch2_bkey_buf_init(&tmp); - if (unlikely(trans->journal_replay_not_finished)) { - ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp); + ret = btree_node_iter_and_journal_peek(trans, path, flags); if (ret) - goto err; + return ret; } else { struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b); - if (!k) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "node not found at pos "); - bch2_bpos_to_text(&buf, path->pos); - prt_str(&buf, " within parent node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key)); - - bch2_fs_fatal_error(c, "%s", buf.buf); - printbuf_exit(&buf); - ret = -BCH_ERR_btree_need_topology_repair; - goto err; - } + if (unlikely(!k)) + return btree_node_missing_err(trans, path); - bch2_bkey_buf_unpack(&tmp, c, l->b, k); + bch2_bkey_unpack(l->b, &trans->btree_path_down, k); - if ((flags & 
BTREE_ITER_prefetch) && + if (unlikely((flags & BTREE_ITER_prefetch)) && c->opts.btree_node_prefetch) { ret = btree_path_prefetch(trans, path); if (ret) - goto err; + return ret; } } - b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip); + b = bch2_btree_node_get(trans, path, &trans->btree_path_down, + level, lock_type, trace_ip); ret = PTR_ERR_OR_ZERO(b); if (unlikely(ret)) - goto err; + return ret; - if (likely(!trans->journal_replay_not_finished && - tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && - unlikely(b != btree_node_mem_ptr(tmp.k))) + if (unlikely(b != btree_node_mem_ptr(&trans->btree_path_down)) && + likely(!trans->journal_replay_not_finished && + trans->btree_path_down.k.type == KEY_TYPE_btree_ptr_v2)) btree_node_mem_ptr_set(trans, path, level + 1, b); if (btree_node_read_locked(path, level + 1)) @@ -977,10 +993,8 @@ static __always_inline int btree_path_down(struct btree_trans *trans, path->level = level; bch2_btree_path_level_init(trans, path, b); - bch2_btree_path_verify_locks(path); -err: - bch2_bkey_buf_exit(&tmp, c); - return ret; + bch2_btree_path_verify_locks(trans, path); + return 0; } static int bch2_btree_path_traverse_all(struct btree_trans *trans) @@ -992,7 +1006,7 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans) int ret = 0; if (trans->in_traverse_all) - return -BCH_ERR_transaction_restart_in_traverse_all; + return bch_err_throw(c, transaction_restart_in_traverse_all); trans->in_traverse_all = true; retry_all: @@ -1089,7 +1103,7 @@ static void btree_path_set_level_down(struct btree_trans *trans, if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) btree_node_unlock(trans, path, l); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); bch2_btree_path_verify(trans, path); } @@ -1287,7 +1301,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, if (unlikely(path->cached)) { btree_node_unlock(trans, path, 0); path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); goto out; } @@ -1316,7 +1330,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, } if (unlikely(level != path->level)) { - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); __bch2_btree_path_unlock(trans, path); } out: @@ -1385,45 +1399,45 @@ static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_p void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent) { - struct btree_path *path = trans->paths + path_idx, *dup; + struct btree_path *path = trans->paths + path_idx, *dup = NULL; if (!__btree_path_put(trans, path, intent)) return; + if (!path->preserve && !path->should_be_locked) + goto free; + dup = path->preserve ? have_path_at_pos(trans, path) : have_node_at_pos(trans, path); - - trace_btree_path_free(trans, path_idx, dup); - - if (!dup && !(!path->preserve && !is_btree_node(path, path->level))) + if (!dup) return; - if (path->should_be_locked && !trans->restarted) { - if (!dup) - return; - + /* + * If we need this path locked, the duplicate also has te be locked + * before we free this one: + */ + if (path->should_be_locked && + !dup->should_be_locked && + !trans->restarted) { if (!(trans->locked ? 
bch2_btree_path_relock_norestart(trans, dup) : bch2_btree_path_can_relock(trans, dup))) return; - } - if (dup) { - dup->preserve |= path->preserve; - dup->should_be_locked |= path->should_be_locked; + dup->should_be_locked = true; } - __bch2_path_free(trans, path_idx); -} - -static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path, - bool intent) -{ - if (!__btree_path_put(trans, trans->paths + path, intent)) - return; + BUG_ON(path->should_be_locked && + !trans->restarted && + trans->locked && + !btree_node_locked(dup, dup->level)); - __bch2_path_free(trans, path); + path->should_be_locked = false; + dup->preserve |= path->preserve; +free: + trace_btree_path_free(trans, path_idx, dup); + __bch2_path_free(trans, path_idx); } void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count) @@ -1485,7 +1499,7 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) prt_newline(buf); } - for (struct jset_entry *e = trans->journal_entries; + for (struct jset_entry *e = btree_trans_journal_entries_start(trans); e != btree_trans_journal_entries_top(trans); e = vstruct_next(e)) { bch2_journal_entry_to_text(buf, trans->c, e); @@ -1591,7 +1605,7 @@ void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort) __bch2_trans_paths_to_text(&buf, trans, nosort); bch2_trans_updates_to_text(&buf, trans); - bch2_print_str(trans->c, buf.buf); + bch2_print_str(trans->c, KERN_ERR, buf.buf); printbuf_exit(&buf); } @@ -1735,6 +1749,10 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, btree_trans_sort_paths(trans); + if (intent) + locks_want = max(locks_want, level + 1); + locks_want = min(locks_want, BTREE_MAX_DEPTH); + trans_for_each_path_inorder(trans, path, iter) { if (__btree_path_cmp(path, btree_id, @@ -1749,7 +1767,8 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, if (path_pos && trans->paths[path_pos].cached == cached && trans->paths[path_pos].btree_id == btree_id && - trans->paths[path_pos].level == level) { + trans->paths[path_pos].level == level && + bch2_btree_path_upgrade_norestart(trans, trans->paths + path_pos, locks_want)) { trace_btree_path_get(trans, trans->paths + path_pos, &pos); __btree_path_get(trans, trans->paths + path_pos, intent); @@ -1781,9 +1800,6 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, if (!(flags & BTREE_ITER_nopreserve)) path->preserve = true; - if (path->intent_ref) - locks_want = max(locks_want, level + 1); - /* * If the path has locks_want greater than requested, we don't downgrade * it here - on transaction restart because btree node split needs to @@ -1792,10 +1808,6 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, * a successful transaction commit. */ - locks_want = min(locks_want, BTREE_MAX_DEPTH); - if (locks_want > path->locks_want) - bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL); - return path_idx; } @@ -1967,6 +1979,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_ /* got to end? 
*/ if (!btree_path_node(path, path->level + 1)) { + path->should_be_locked = false; btree_path_set_level_up(trans, path); return NULL; } @@ -1978,12 +1991,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_ bch2_btree_path_downgrade(trans, path); if (!bch2_btree_node_relock(trans, path, path->level + 1)) { + trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); __bch2_btree_path_unlock(trans, path); path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); goto err; } @@ -2344,8 +2357,7 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree } if (iter->update_path) { - bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_intent); + bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent); iter->update_path = 0; } @@ -2374,8 +2386,8 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree if (iter->update_path && !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { - bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_intent); + bch2_path_put(trans, iter->update_path, + iter->flags & BTREE_ITER_intent); iter->update_path = 0; } @@ -2634,7 +2646,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct * the last possible snapshot overwrite, return * it: */ - bch2_path_put_nokeep(trans, iter->path, + bch2_path_put(trans, iter->path, iter->flags & BTREE_ITER_intent); iter->path = saved_path; saved_path = 0; @@ -2664,8 +2676,8 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct * our previous saved candidate: */ if (saved_path) { - bch2_path_put_nokeep(trans, saved_path, - iter->flags & BTREE_ITER_intent); + bch2_path_put(trans, saved_path, + iter->flags & BTREE_ITER_intent); saved_path = 0; } @@ -2708,7 +2720,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct iter->pos.snapshot = iter->snapshot; out_no_locked: if (saved_path) - bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent); + bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_intent); bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(trans, iter); @@ -2929,7 +2941,7 @@ static void btree_trans_verify_sorted(struct btree_trans *trans) struct btree_path *path, *prev = NULL; struct trans_for_each_path_inorder_iter iter; - if (!bch2_debug_check_iterators) + if (!static_branch_unlikely(&bch2_debug_check_iterators)) return; trans_for_each_path_inorder(trans, path, iter) { @@ -3031,7 +3043,7 @@ static inline void btree_path_list_add(struct btree_trans *trans, void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) { if (iter->update_path) - bch2_path_put_nokeep(trans, iter->update_path, + bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent); if (iter->path) bch2_path_put(trans, iter->path, @@ -3095,7 +3107,19 @@ void bch2_trans_copy_iter(struct btree_trans *trans, dst->key_cache_path = 0; } -void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +#ifdef 
CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE +void bch2_trans_kmalloc_trace_to_text(struct printbuf *out, + darray_trans_kmalloc_trace *trace) +{ + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 60); + + darray_for_each(*trace, i) + prt_printf(out, "%pS\t%zu\n", (void *) i->ip, i->bytes); +} +#endif + +void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long ip) { struct bch_fs *c = trans->c; unsigned new_top = trans->mem_top + size; @@ -3105,14 +3129,35 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) void *new_mem; void *p; - WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); + if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) { +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + struct printbuf buf = PRINTBUF; + bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace); + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); +#endif + } ret = trans_maybe_inject_restart(trans, _RET_IP_); if (ret) return ERR_PTR(ret); struct btree_transaction_stats *s = btree_trans_stats(trans); - s->max_mem = max(s->max_mem, new_bytes); + if (new_bytes > s->max_mem) { + mutex_lock(&s->lock); +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_resize(&s->trans_kmalloc_trace, trans->trans_kmalloc_trace.nr); + s->trans_kmalloc_trace.nr = min(s->trans_kmalloc_trace.size, + trans->trans_kmalloc_trace.nr); + + memcpy(s->trans_kmalloc_trace.data, + trans->trans_kmalloc_trace.data, + sizeof(s->trans_kmalloc_trace.data[0]) * + s->trans_kmalloc_trace.nr); +#endif + s->max_mem = new_bytes; + mutex_unlock(&s->lock); + } if (trans->used_mempool) { if (trans->mem_bytes >= new_bytes) @@ -3172,6 +3217,8 @@ out_new_mem: BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); } out_change_top: + bch2_trans_kmalloc_trace(trans, size, ip); + p = trans->mem + trans->mem_top; trans->mem_top += size; memset(p, 0, size); @@ -3231,7 +3278,6 @@ u32 bch2_trans_begin(struct btree_trans *trans) trans->restart_count++; trans->mem_top = 0; - trans->journal_entries = NULL; trans_for_each_path(trans, path, i) { path->should_be_locked = false; @@ -3285,6 +3331,10 @@ u32 bch2_trans_begin(struct btree_trans *trans) } #endif +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + trans->trans_kmalloc_trace.nr = 0; +#endif + trans_set_locked(trans, false); if (trans->restarted) { @@ -3385,7 +3435,6 @@ got_trans: } trans->nr_paths_max = s->nr_max_paths; - trans->journal_entries_size = s->journal_entries_size; } trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); @@ -3397,29 +3446,45 @@ got_trans: return trans; } -static void check_btree_paths_leaked(struct btree_trans *trans) -{ #ifdef CONFIG_BCACHEFS_DEBUG - struct bch_fs *c = trans->c; + +static bool btree_paths_leaked(struct btree_trans *trans) +{ struct btree_path *path; unsigned i; trans_for_each_path(trans, path, i) if (path->ref) - goto leaked; - return; -leaked: - bch_err(c, "btree paths leaked from %s!", trans->fn); - trans_for_each_path(trans, path, i) - if (path->ref) - printk(KERN_ERR " btree %s %pS\n", - bch2_btree_id_str(path->btree_id), - (void *) path->ip_allocated); - /* Be noisy about this: */ - bch2_fatal_error(c); -#endif + return true; + return false; } +static void check_btree_paths_leaked(struct btree_trans *trans) +{ + if (btree_paths_leaked(trans)) { + struct bch_fs *c = trans->c; + struct btree_path *path; + unsigned i; + + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "btree paths leaked from %s!\n", trans->fn); + trans_for_each_path(trans, path, i) + if (path->ref) + 
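/*
 * Editor's note, not part of the patch: check_btree_paths_leaked() now builds
 * its report in a printbuf -- one "btree <id> <ip_allocated>" line per path
 * that still holds a reference -- then calls bch2_fs_emergency_read_only2()
 * and prints the buffer in one go, replacing the old printk-per-path plus
 * bch2_fatal_error().  The CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE hunks above
 * follow the same pattern: each bch2_trans_kmalloc() records (caller ip,
 * bytes) in trans->trans_kmalloc_trace, and the trace is only formatted via
 * bch2_trans_kmalloc_trace_to_text() when an allocation would exceed
 * BTREE_TRANS_MEM_MAX, or copied into the per-transaction stats when a new
 * max_mem high-water mark is set.
 */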
prt_printf(&buf, "btree %s %pS\n", + bch2_btree_id_str(path->btree_id), + (void *) path->ip_allocated); + + bch2_fs_emergency_read_only2(c, &buf); + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + } +} +#else +static inline void check_btree_paths_leaked(struct btree_trans *trans) {} +#endif + void bch2_trans_put(struct btree_trans *trans) __releases(&c->btree_trans_barrier) { @@ -3454,6 +3519,9 @@ void bch2_trans_put(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG darray_exit(&trans->last_restarted_trace); #endif +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_exit(&trans->trans_kmalloc_trace); +#endif unsigned long *paths_allocated = trans->paths_allocated; trans->paths_allocated = NULL; @@ -3500,13 +3568,12 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out, struct btree_bkey_cached_common *b) { struct six_lock_count c = six_lock_counts(&b->lock); - struct task_struct *owner; pid_t pid; - rcu_read_lock(); - owner = READ_ONCE(b->lock.owner); - pid = owner ? owner->pid : 0; - rcu_read_unlock(); + scoped_guard(rcu) { + struct task_struct *owner = READ_ONCE(b->lock.owner); + pid = owner ? owner->pid : 0; + } prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b'); bch2_btree_id_to_text(out, b->btree_id); @@ -3535,7 +3602,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn); /* trans->paths is rcu protected vs. freeing */ - rcu_read_lock(); + guard(rcu)(); out->atomic++; struct btree_path *paths = rcu_dereference(trans->paths); @@ -3578,7 +3645,6 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) } out: --out->atomic; - rcu_read_unlock(); } void bch2_fs_btree_iter_exit(struct bch_fs *c) @@ -3608,6 +3674,9 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); s++) { +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_exit(&s->trans_kmalloc_trace); +#endif kfree(s->max_paths_text); bch2_time_stats_exit(&s->lock_hold_times); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 9d2cccf5d21a..09dd3e52622e 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -46,9 +46,11 @@ static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path return --path->ref == 0; } -static inline void btree_path_set_dirty(struct btree_path *path, +static inline void btree_path_set_dirty(struct btree_trans *trans, + struct btree_path *path, enum btree_path_uptodate u) { + BUG_ON(path->should_be_locked && trans->locked && !trans->restarted); path->uptodate = max_t(unsigned, path->uptodate, u); } @@ -285,14 +287,23 @@ static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex : __bch2_trans_mutex_lock(trans, lock); } -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_trans_verify_paths(struct btree_trans *); -void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos); -#else -static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} -static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, - struct bpos pos) {} -#endif +/* Debug: */ + +void __bch2_trans_verify_paths(struct btree_trans *); +void __bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos); + +static inline void bch2_trans_verify_paths(struct btree_trans *trans) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + 
__bch2_trans_verify_paths(trans); +} + +static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id btree, + struct bpos pos) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + __bch2_assert_pos_locked(trans, btree, pos); +} void bch2_btree_path_fix_key_modified(struct btree_trans *trans, struct btree *, struct bkey_packed *); @@ -543,43 +554,73 @@ void bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *, struct btre void bch2_set_btree_iter_dontneed(struct btree_trans *, struct btree_iter *); -void *__bch2_trans_kmalloc(struct btree_trans *, size_t); +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE +void bch2_trans_kmalloc_trace_to_text(struct printbuf *, + darray_trans_kmalloc_trace *); +#endif -/** - * bch2_trans_kmalloc - allocate memory for use by the current transaction - * - * Must be called after bch2_trans_begin, which on second and further calls - * frees all memory allocated in this transaction - */ -static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +void *__bch2_trans_kmalloc(struct btree_trans *, size_t, unsigned long); + +static inline void bch2_trans_kmalloc_trace(struct btree_trans *trans, size_t size, + unsigned long ip) +{ +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_push(&trans->trans_kmalloc_trace, + ((struct trans_kmalloc_trace) { .ip = ip, .bytes = size })); +#endif +} + +static __always_inline void *bch2_trans_kmalloc_nomemzero_ip(struct btree_trans *trans, size_t size, + unsigned long ip) { size = roundup(size, 8); + bch2_trans_kmalloc_trace(trans, size, ip); + if (likely(trans->mem_top + size <= trans->mem_bytes)) { void *p = trans->mem + trans->mem_top; trans->mem_top += size; - memset(p, 0, size); return p; } else { - return __bch2_trans_kmalloc(trans, size); + return __bch2_trans_kmalloc(trans, size, ip); } } -static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) +static __always_inline void *bch2_trans_kmalloc_ip(struct btree_trans *trans, size_t size, + unsigned long ip) { - size = round_up(size, 8); + size = roundup(size, 8); + + bch2_trans_kmalloc_trace(trans, size, ip); if (likely(trans->mem_top + size <= trans->mem_bytes)) { void *p = trans->mem + trans->mem_top; trans->mem_top += size; + memset(p, 0, size); return p; } else { - return __bch2_trans_kmalloc(trans, size); + return __bch2_trans_kmalloc(trans, size, ip); } } +/** + * bch2_trans_kmalloc - allocate memory for use by the current transaction + * + * Must be called after bch2_trans_begin, which on second and further calls + * frees all memory allocated in this transaction + */ +static __always_inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +{ + return bch2_trans_kmalloc_ip(trans, size, _THIS_IP_); +} + +static __always_inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) +{ + return bch2_trans_kmalloc_nomemzero_ip(trans, size, _THIS_IP_); +} + static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, @@ -922,16 +963,6 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *, _p; \ }) -#define bch2_trans_run(_c, _do) \ -({ \ - struct btree_trans *trans = bch2_trans_get(_c); \ - int _ret = (_do); \ - bch2_trans_put(trans); \ - _ret; \ -}) - -#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do)) - struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned); void bch2_trans_put(struct btree_trans *); @@ 
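The bch2_debug_check_iterators test wrapped around these verify helpers is a static branch rather than a plain global bool, so production builds pay only a patched-out jump at each call site. The pattern is the usual jump-label one sketched below; the declaration site and the enable helper here are assumptions for illustration, not code from this patch.

#include <linux/jump_label.h>

/* assumed declaration; the real knob lives with bcachefs' other debug params */
DEFINE_STATIC_KEY_FALSE(bch2_debug_check_iterators);

/*
 * Call sites branch on the key exactly as in the hunks above: the check
 * costs nothing until someone flips the key at runtime.
 */
void bch2_enable_iterator_checks(void)	/* hypothetical helper */
{
	static_branch_enable(&bch2_debug_check_iterators);
}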
-949,6 +980,27 @@ unsigned bch2_trans_get_fn_idx(const char *); __bch2_trans_get(_c, trans_fn_idx); \ }) +/* + * We don't use DEFINE_CLASS() because using a function for the constructor + * breaks bch2_trans_get()'s use of __func__ + */ +typedef struct btree_trans * class_btree_trans_t; +static inline void class_btree_trans_destructor(struct btree_trans **p) +{ + struct btree_trans *trans = *p; + bch2_trans_put(trans); +} + +#define class_btree_trans_constructor(_c) bch2_trans_get(_c) + +#define bch2_trans_run(_c, _do) \ +({ \ + CLASS(btree_trans, trans)(_c); \ + (_do); \ +}) + +#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do)) + void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); void bch2_fs_btree_iter_exit(struct bch_fs *); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index ade3b5addd75..cf7398751644 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -292,7 +292,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, if (!new_keys.data) { bch_err(c, "%s: error allocating new key array (size %zu)", __func__, new_keys.size); - return -BCH_ERR_ENOMEM_journal_key_insert; + return bch_err_throw(c, ENOMEM_journal_key_insert); } /* Since @keys was full, there was no gap: */ @@ -331,7 +331,7 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); if (!n) - return -BCH_ERR_ENOMEM_journal_key_insert; + return bch_err_throw(c, ENOMEM_journal_key_insert); bkey_copy(n, k); ret = bch2_journal_key_insert_take(c, id, level, n); @@ -457,11 +457,9 @@ static void bch2_journal_iter_advance(struct journal_iter *iter) static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) { - struct bkey_s_c ret = bkey_s_c_null; - journal_iter_verify(iter); - rcu_read_lock(); + guard(rcu)(); while (iter->idx < iter->keys->size) { struct journal_key *k = iter->keys->data + iter->idx; @@ -470,19 +468,16 @@ static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) break; BUG_ON(cmp); - if (!k->overwritten) { - ret = bkey_i_to_s_c(k->k); - break; - } + if (!k->overwritten) + return bkey_i_to_s_c(k->k); if (k->overwritten_range) iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end); else bch2_journal_iter_advance(iter); } - rcu_read_unlock(); - return ret; + return bkey_s_c_null; } static void bch2_journal_iter_exit(struct journal_iter *iter) @@ -741,7 +736,7 @@ int bch2_journal_keys_sort(struct bch_fs *c) if (keys->nr * 8 > keys->size * 7) { bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu", keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq)); - return -BCH_ERR_ENOMEM_journal_keys_sort; + return bch_err_throw(c, ENOMEM_journal_keys_sort); } BUG_ON(darray_push(keys, n)); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 669825f89cdd..d96188b92db2 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -101,8 +101,8 @@ static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu kmem_cache_free(bch2_key_cache, ck); } -static void bkey_cached_free(struct btree_key_cache *bc, - struct bkey_cached *ck) +static inline void bkey_cached_free_noassert(struct btree_key_cache *bc, + struct bkey_cached *ck) { kfree(ck->k); ck->k = NULL; @@ -116,6 +116,19 @@ static void bkey_cached_free(struct btree_key_cache *bc, this_cpu_inc(*bc->nr_pending); } +static 
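The class_btree_trans_t typedef and constructor/destructor pair above plug into the kernel's cleanup.h CLASS() machinery, so CLASS(btree_trans, trans)(_c) in the new bch2_trans_run() gets bch2_trans_put() for free on every return path; keeping the constructor a macro (as the comment in the hunk notes) is what preserves __func__ as the transaction name. A standalone sketch of the same scope-bound-cleanup idea, built directly on __attribute__((cleanup)) with stand-in names:

#include <stdio.h>
#include <stdlib.h>

struct btree_trans { const char *fn; };

static struct btree_trans *trans_get(const char *fn)
{
	struct btree_trans *trans = malloc(sizeof(*trans));
	trans->fn = fn;
	printf("get %s\n", fn);
	return trans;
}

static void trans_put(struct btree_trans **p)
{
	printf("put %s\n", (*p)->fn);
	free(*p);
}

/*
 * Poor man's CLASS(): declare the variable, run the constructor, and register
 * the destructor in one statement, so every exit from the scope releases it.
 */
#define TRANS_SCOPE(name) \
	struct btree_trans *name __attribute__((cleanup(trans_put))) = trans_get(__func__)

static int do_work(void)
{
	TRANS_SCOPE(trans);
	/* ... use trans ... */
	return 0;	/* trans_put() runs here automatically */
}

int main(void)
{
	return do_work();
}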
void bkey_cached_free(struct btree_trans *trans, + struct btree_key_cache *bc, + struct bkey_cached *ck) +{ + /* + * we'll hit strange issues in the SRCU code if we aren't holding an + * SRCU read lock... + */ + EBUG_ON(!trans->srcu_held); + + bkey_cached_free_noassert(bc, ck); +} + static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) { gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE; @@ -174,27 +187,23 @@ lock: static struct bkey_cached * bkey_cached_reuse(struct btree_key_cache *c) { - struct bucket_table *tbl; + + guard(rcu)(); + struct bucket_table *tbl = rht_dereference_rcu(c->table.tbl, &c->table); struct rhash_head *pos; struct bkey_cached *ck; - unsigned i; - rcu_read_lock(); - tbl = rht_dereference_rcu(c->table.tbl, &c->table); - for (i = 0; i < tbl->size; i++) + for (unsigned i = 0; i < tbl->size; i++) rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bkey_cached_lock_for_evict(ck)) { if (bkey_cached_evict(c, ck)) - goto out; + return ck; six_unlock_write(&ck->c.lock); six_unlock_intent(&ck->c.lock); } } - ck = NULL; -out: - rcu_read_unlock(); - return ck; + return NULL; } static int btree_key_cache_create(struct btree_trans *trans, @@ -229,7 +238,7 @@ static int btree_key_cache_create(struct btree_trans *trans, if (unlikely(!ck)) { bch_err(c, "error allocating memory for key cache item, btree %s", bch2_btree_id_str(ck_path->btree_id)); - return -BCH_ERR_ENOMEM_btree_key_cache_create; + return bch_err_throw(c, ENOMEM_btree_key_cache_create); } } @@ -247,7 +256,7 @@ static int btree_key_cache_create(struct btree_trans *trans, if (unlikely(!new_k)) { bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", bch2_btree_id_str(ck->key.btree_id), key_u64s); - ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; + ret = bch_err_throw(c, ENOMEM_btree_key_cache_fill); } else if (ret) { kfree(new_k); goto err; @@ -281,7 +290,7 @@ static int btree_key_cache_create(struct btree_trans *trans, ck_path->uptodate = BTREE_ITER_UPTODATE; return 0; err: - bkey_cached_free(bc, ck); + bkey_cached_free(trans, bc, ck); mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); return ret; @@ -511,7 +520,7 @@ evict: mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); if (bkey_cached_evict(&c->btree_key_cache, ck)) { - bkey_cached_free(&c->btree_key_cache, ck); + bkey_cached_free(trans, &c->btree_key_cache, ck); } else { six_unlock_write(&ck->c.lock); six_unlock_intent(&ck->c.lock); @@ -625,7 +634,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, } bkey_cached_evict(bc, ck); - bkey_cached_free(bc, ck); + bkey_cached_free(trans, bc, ck); mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); @@ -633,10 +642,17 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, unsigned i; trans_for_each_path(trans, path2, i) if (path2->l[0].b == (void *) ck) { + /* + * It's safe to clear should_be_locked here because + * we're evicting from the key cache, and we still have + * the underlying btree locked: filling into the key + * cache would require taking a write lock on the btree + * node + */ + path2->should_be_locked = false; __bch2_btree_path_unlock(trans, path2); path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop); - path2->should_be_locked = false; - btree_path_set_dirty(path2, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path2, BTREE_ITER_NEED_TRAVERSE); } bch2_trans_verify_locks(trans); @@ -693,7 +709,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, } 
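Several error returns in these files switch from a bare `return -BCH_ERR_...` to bch_err_throw(c, ...). Its definition is not part of these hunks; the sketch below only guesses at the general shape, an error "throw" that records the occurrence against the filesystem instance before handing back the negative code. Every identifier here is a stand-in.

#include <stdio.h>

enum { ERR_ENOMEM_journal_key_insert = 1, ERR_NR };	/* stand-ins for BCH_ERR_* */

struct fs { unsigned long err_seen[ERR_NR]; };

/* hypothetical shape: count it against @c, then evaluate to the negative code */
#define fs_err_throw(c, err) \
	({ (c)->err_seen[ERR_##err]++; -ERR_##err; })

static int journal_key_insert(struct fs *c, void *mem)
{
	if (!mem)
		return fs_err_throw(c, ENOMEM_journal_key_insert);
	return 0;
}

int main(void)
{
	struct fs c = {};

	journal_key_insert(&c, NULL);
	printf("ENOMEM_journal_key_insert hit %lu time(s)\n",
	       c.err_seen[ERR_ENOMEM_journal_key_insert]);
	return 0;
}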
else if (!bkey_cached_lock_for_evict(ck)) { bc->skipped_lock_fail++; } else if (bkey_cached_evict(bc, ck)) { - bkey_cached_free(bc, ck); + bkey_cached_free_noassert(bc, ck); bc->freed++; freed++; } else { @@ -806,20 +822,20 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) bc->nr_pending = alloc_percpu(size_t); if (!bc->nr_pending) - return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return bch_err_throw(c, ENOMEM_fs_btree_cache_init); if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) || rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free)) - return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return bch_err_throw(c, ENOMEM_fs_btree_cache_init); if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params)) - return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return bch_err_throw(c, ENOMEM_fs_btree_cache_init); bc->table_init_done = true; shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name); if (!shrink) - return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return bch_err_throw(c, ENOMEM_fs_btree_cache_init); bc->shrink = shrink; shrink->count_objects = bch2_btree_key_cache_count; shrink->scan_objects = bch2_btree_key_cache_scan; diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 94eb2b73a843..47035aae232e 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_cache.h" #include "btree_locking.h" #include "btree_types.h" @@ -193,6 +194,30 @@ static int btree_trans_abort_preference(struct btree_trans *trans) return 3; } +static noinline __noreturn void break_cycle_fail(struct lock_graph *g) +{ + struct printbuf buf = PRINTBUF; + buf.atomic++; + + prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks")); + + for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++) { + struct btree_trans *trans = i->trans; + + bch2_btree_trans_to_text(&buf, trans); + + prt_printf(&buf, "backtrace:\n"); + printbuf_indent_add(&buf, 2); + bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + } + + bch2_print_str_nonblocking(g->g->trans->c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + BUG(); +} + static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle, struct trans_waiting_for_lock *from) { @@ -218,28 +243,8 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle, } } - if (unlikely(!best)) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - - prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks")); - - for (i = g->g; i < g->g + g->nr; i++) { - struct btree_trans *trans = i->trans; - - bch2_btree_trans_to_text(&buf, trans); - - prt_printf(&buf, "backtrace:\n"); - printbuf_indent_add(&buf, 2); - bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT); - printbuf_indent_sub(&buf, 2); - prt_newline(&buf); - } - - bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf); - printbuf_exit(&buf); - BUG(); - } + if (unlikely(!best)) + break_cycle_fail(g); ret = abort_lock(g, abort); out: @@ -254,15 +259,14 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, struct printbuf *cycle) { struct btree_trans *orig_trans = g->g->trans; - struct trans_waiting_for_lock *i; - for (i = g->g; i < g->g + g->nr; i++) + for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++) if (i->trans == trans) { closure_put(&trans->ref); return 
break_cycle(g, cycle, i); } - if (g->nr == ARRAY_SIZE(g->g)) { + if (unlikely(g->nr == ARRAY_SIZE(g->g))) { closure_put(&trans->ref); if (orig_trans->lock_may_not_fail) @@ -307,7 +311,7 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) lock_graph_down(&g, trans); /* trans->paths is rcu protected vs. freeing */ - rcu_read_lock(); + guard(rcu)(); if (cycle) cycle->atomic++; next: @@ -405,7 +409,6 @@ up: out: if (cycle) --cycle->atomic; - rcu_read_unlock(); return ret; } @@ -450,13 +453,13 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, /* relock */ -static inline bool btree_path_get_locks(struct btree_trans *trans, - struct btree_path *path, - bool upgrade, - struct get_locks_fail *f) +static int btree_path_get_locks(struct btree_trans *trans, + struct btree_path *path, + bool upgrade, + struct get_locks_fail *f, + int restart_err) { unsigned l = path->level; - int fail_idx = -1; do { if (!btree_path_node(path, l)) @@ -464,39 +467,49 @@ static inline bool btree_path_get_locks(struct btree_trans *trans, if (!(upgrade ? bch2_btree_node_upgrade(trans, path, l) - : bch2_btree_node_relock(trans, path, l))) { - fail_idx = l; - - if (f) { - f->l = l; - f->b = path->l[l].b; - } - } + : bch2_btree_node_relock(trans, path, l))) + goto err; l++; } while (l < path->locks_want); + if (path->uptodate == BTREE_ITER_NEED_RELOCK) + path->uptodate = BTREE_ITER_UPTODATE; + + return path->uptodate < BTREE_ITER_NEED_RELOCK ? 0 : -1; +err: + if (f) { + f->l = l; + f->b = path->l[l].b; + } + + /* + * Do transaction restart before unlocking, so we don't pop + * should_be_locked asserts + */ + if (restart_err) { + btree_trans_restart(trans, restart_err); + } else if (path->should_be_locked && !trans->restarted) { + if (upgrade) + path->locks_want = l; + return -1; + } + + __bch2_btree_path_unlock(trans, path); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); + /* * When we fail to get a lock, we have to ensure that any child nodes * can't be relocked so bch2_btree_path_traverse has to walk back up to * the node that we failed to relock: */ - if (fail_idx >= 0) { - __bch2_btree_path_unlock(trans, path); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - - do { - path->l[fail_idx].b = upgrade - ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade) - : ERR_PTR(-BCH_ERR_no_btree_node_relock); - --fail_idx; - } while (fail_idx >= 0); - } - - if (path->uptodate == BTREE_ITER_NEED_RELOCK) - path->uptodate = BTREE_ITER_UPTODATE; + do { + path->l[l].b = upgrade + ? 
ERR_PTR(-BCH_ERR_no_btree_node_upgrade) + : ERR_PTR(-BCH_ERR_no_btree_node_relock); + } while (l--); - return path->uptodate < BTREE_ITER_NEED_RELOCK; + return -restart_err ?: -1; } bool __bch2_btree_node_relock(struct btree_trans *trans, @@ -583,7 +596,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans, l++) { if (!bch2_btree_node_relock(trans, path, l)) { __bch2_btree_path_unlock(trans, path); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path); return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent); } @@ -595,9 +608,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans, __flatten bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path) { - struct get_locks_fail f; - - bool ret = btree_path_get_locks(trans, path, false, &f); + bool ret = !btree_path_get_locks(trans, path, false, NULL, 0); bch2_trans_verify_locks(trans); return ret; } @@ -613,27 +624,37 @@ int __bch2_btree_path_relock(struct btree_trans *trans, return 0; } -bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want, - struct get_locks_fail *f) +bool __bch2_btree_path_upgrade_norestart(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) { - EBUG_ON(path->locks_want >= new_locks_want); - path->locks_want = new_locks_want; - bool ret = btree_path_get_locks(trans, path, true, f); - bch2_trans_verify_locks(trans); + /* + * If we need it locked, we can't touch it. Otherwise, we can return + * success - bch2_path_get() will use this path, and it'll just be + * retraversed: + */ + bool ret = !btree_path_get_locks(trans, path, true, NULL, 0) || + !path->should_be_locked; + + bch2_btree_path_verify_locks(trans, path); return ret; } -bool __bch2_btree_path_upgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want, - struct get_locks_fail *f) +int __bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) { - bool ret = bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f); - if (ret) + unsigned old_locks = path->nodes_locked; + unsigned old_locks_want = path->locks_want; + + path->locks_want = max_t(unsigned, path->locks_want, new_locks_want); + + struct get_locks_fail f = {}; + int ret = btree_path_get_locks(trans, path, true, &f, + BCH_ERR_transaction_restart_upgrade); + if (!ret) goto out; /* @@ -665,9 +686,30 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, linked->btree_id == path->btree_id && linked->locks_want < new_locks_want) { linked->locks_want = new_locks_want; - btree_path_get_locks(trans, linked, true, NULL); + btree_path_get_locks(trans, linked, true, NULL, 0); } } + + count_event(trans->c, trans_restart_upgrade); + if (trace_trans_restart_upgrade_enabled()) { + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "%s %pS\n", trans->fn, (void *) _RET_IP_); + prt_printf(&buf, "btree %s pos\n", bch2_btree_id_str(path->btree_id)); + bch2_bpos_to_text(&buf, path->pos); + prt_printf(&buf, "locks want %u -> %u level %u\n", + old_locks_want, new_locks_want, f.l); + prt_printf(&buf, "nodes_locked %x -> %x\n", + old_locks, path->nodes_locked); + prt_printf(&buf, "node %s ", IS_ERR(f.b) ? bch2_err_str(PTR_ERR(f.b)) : + !f.b ? 
"(null)" : "(node)"); + prt_printf(&buf, "path seq %u node seq %u\n", + IS_ERR_OR_NULL(f.b) ? 0 : f.b->c.lock.seq, + path->l[f.l].lock_seq); + + trace_trans_restart_upgrade(trans->c, buf.buf); + printbuf_exit(&buf); + } out: bch2_trans_verify_locks(trans); return ret; @@ -699,7 +741,7 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans, } } - bch2_btree_path_verify_locks(path); + bch2_btree_path_verify_locks(trans, path); trace_path_downgrade(trans, _RET_IP_, path, old_locks_want); } @@ -728,7 +770,7 @@ static inline void __bch2_trans_unlock(struct btree_trans *trans) __bch2_btree_path_unlock(trans, path); } -static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path, +static noinline __cold void bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path, struct get_locks_fail *f, bool trace) { if (!trace) @@ -738,7 +780,9 @@ static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, str struct printbuf buf = PRINTBUF; bch2_bpos_to_text(&buf, path->pos); - prt_printf(&buf, " l=%u seq=%u node seq=", f->l, path->l[f->l].lock_seq); + prt_printf(&buf, " %s l=%u seq=%u node seq=", + bch2_btree_id_str(path->btree_id), + f->l, path->l[f->l].lock_seq); if (IS_ERR_OR_NULL(f->b)) { prt_str(&buf, bch2_err_str(PTR_ERR(f->b))); } else { @@ -760,7 +804,6 @@ static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, str out: __bch2_trans_unlock(trans); bch2_trans_verify_locks(trans); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); } static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) @@ -777,10 +820,14 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) trans_for_each_path(trans, path, i) { struct get_locks_fail f; + int ret; if (path->should_be_locked && - !btree_path_get_locks(trans, path, false, &f)) - return bch2_trans_relock_fail(trans, path, &f, trace); + (ret = btree_path_get_locks(trans, path, false, &f, + BCH_ERR_transaction_restart_relock))) { + bch2_trans_relock_fail(trans, path, &f, trace); + return ret; + } } trans_set_locked(trans, true); @@ -799,18 +846,11 @@ int bch2_trans_relock_notrace(struct btree_trans *trans) return __bch2_trans_relock(trans, false); } -void bch2_trans_unlock_noassert(struct btree_trans *trans) +void bch2_trans_unlock(struct btree_trans *trans) { - __bch2_trans_unlock(trans); - trans_set_unlocked(trans); -} -void bch2_trans_unlock(struct btree_trans *trans) -{ __bch2_trans_unlock(trans); - - trans_set_unlocked(trans); } void bch2_trans_unlock_long(struct btree_trans *trans) @@ -842,32 +882,28 @@ int __bch2_trans_mutex_lock(struct btree_trans *trans, /* Debug */ -#ifdef CONFIG_BCACHEFS_DEBUG - -void bch2_btree_path_verify_locks(struct btree_path *path) +void __bch2_btree_path_verify_locks(struct btree_trans *trans, struct btree_path *path) { - /* - * A path may be uptodate and yet have nothing locked if and only if - * there is no node at path->level, which generally means we were - * iterating over all nodes and got to the end of the btree - */ - BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && - btree_path_node(path, path->level) && - !path->nodes_locked); + if (!path->nodes_locked && btree_path_node(path, path->level)) { + /* + * A path may be uptodate and yet have nothing locked if and only if + * there is no node at path->level, which generally means we were + * iterating over all nodes and got to the end of the btree + */ + BUG_ON(path->uptodate == BTREE_ITER_UPTODATE); + 
BUG_ON(path->should_be_locked && trans->locked && !trans->restarted); + } if (!path->nodes_locked) return; for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) { int want = btree_lock_want(path, l); - int have = btree_node_locked_type(path, l); + int have = btree_node_locked_type_nowrite(path, l); BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED); - BUG_ON(is_btree_node(path, l) && - (want == BTREE_NODE_UNLOCKED || - have != BTREE_NODE_WRITE_LOCKED) && - want != have); + BUG_ON(is_btree_node(path, l) && want != have); BUG_ON(btree_node_locked(path, l) && path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock)); @@ -885,7 +921,7 @@ static bool bch2_trans_locked(struct btree_trans *trans) return false; } -void bch2_trans_verify_locks(struct btree_trans *trans) +void __bch2_trans_verify_locks(struct btree_trans *trans) { if (!trans->locked) { BUG_ON(bch2_trans_locked(trans)); @@ -896,7 +932,5 @@ void bch2_trans_verify_locks(struct btree_trans *trans) unsigned i; trans_for_each_path(trans, path, i) - bch2_btree_path_verify_locks(path); + __bch2_btree_path_verify_locks(trans, path); } - -#endif diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index b33ab7af8440..9adca77e2580 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -15,7 +15,6 @@ void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp); -void bch2_trans_unlock_noassert(struct btree_trans *); void bch2_trans_unlock_write(struct btree_trans *); static inline bool is_btree_node(struct btree_path *path, unsigned l) @@ -44,6 +43,15 @@ static inline int btree_node_locked_type(struct btree_path *path, return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3); } +static inline int btree_node_locked_type_nowrite(struct btree_path *path, + unsigned level) +{ + int have = btree_node_locked_type(path, level); + return have == BTREE_NODE_WRITE_LOCKED + ? 
BTREE_NODE_INTENT_LOCKED + : have; +} + static inline bool btree_node_write_locked(struct btree_path *path, unsigned l) { return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED; @@ -152,7 +160,7 @@ static inline int btree_path_highest_level_locked(struct btree_path *path) static inline void __bch2_btree_path_unlock(struct btree_trans *trans, struct btree_path *path) { - btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_RELOCK); while (path->nodes_locked) btree_node_unlock(trans, path, btree_path_lowest_level_locked(path)); @@ -367,8 +375,8 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, struct btree_path *path, unsigned level) { EBUG_ON(btree_node_locked(path, level) && - !btree_node_write_locked(path, level) && - btree_node_locked_type(path, level) != __btree_lock_want(path, level)); + btree_node_locked_type_nowrite(path, level) != + __btree_lock_want(path, level)); return likely(btree_node_locked(path, level)) || (!IS_ERR_OR_NULL(path->l[level].b) && @@ -377,31 +385,29 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, /* upgrade */ -bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, - struct btree_path *, unsigned, - struct get_locks_fail *); +bool __bch2_btree_path_upgrade_norestart(struct btree_trans *, struct btree_path *, unsigned); -bool __bch2_btree_path_upgrade(struct btree_trans *, - struct btree_path *, unsigned, - struct get_locks_fail *); +static inline bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) +{ + return new_locks_want > path->locks_want + ? __bch2_btree_path_upgrade_norestart(trans, path, new_locks_want) + : true; +} + +int __bch2_btree_path_upgrade(struct btree_trans *, + struct btree_path *, unsigned); static inline int bch2_btree_path_upgrade(struct btree_trans *trans, struct btree_path *path, unsigned new_locks_want) { - struct get_locks_fail f = {}; - unsigned old_locks_want = path->locks_want; - new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); - if (path->locks_want < new_locks_want - ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f) - : path->nodes_locked) - return 0; - - trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path, - old_locks_want, new_locks_want, &f); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); + return likely(path->locks_want >= new_locks_want && path->nodes_locked) + ? 
0 + : __bch2_btree_path_upgrade(trans, path, new_locks_want); } /* misc: */ @@ -427,7 +433,7 @@ static inline void btree_path_set_level_up(struct btree_trans *trans, struct btree_path *path) { __btree_path_set_level_up(trans, path, path->level++); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); } /* debug */ @@ -439,12 +445,20 @@ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *, int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *); -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_btree_path_verify_locks(struct btree_path *); -void bch2_trans_verify_locks(struct btree_trans *); -#else -static inline void bch2_btree_path_verify_locks(struct btree_path *path) {} -static inline void bch2_trans_verify_locks(struct btree_trans *trans) {} -#endif +void __bch2_btree_path_verify_locks(struct btree_trans *, struct btree_path *); +void __bch2_trans_verify_locks(struct btree_trans *); + +static inline void bch2_btree_path_verify_locks(struct btree_trans *trans, + struct btree_path *path) +{ + if (static_branch_unlikely(&bch2_debug_check_btree_locking)) + __bch2_btree_path_verify_locks(trans, path); +} + +static inline void bch2_trans_verify_locks(struct btree_trans *trans) +{ + if (static_branch_unlikely(&bch2_debug_check_btree_locking)) + __bch2_trans_verify_locks(trans); +} #endif /* _BCACHEFS_BTREE_LOCKING_H */ diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 86acf037590c..a35847734a60 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -271,7 +271,7 @@ static int read_btree_nodes_worker(void *p) err: bio_put(bio); free_page((unsigned long) buf); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); closure_put(w->cl); kfree(w); return 0; @@ -285,13 +285,13 @@ static int read_btree_nodes(struct find_btree_nodes *f) closure_init_stack(&cl); - for_each_online_member(c, ca) { + for_each_online_member(c, ca, BCH_DEV_READ_REF_btree_node_scan) { if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree))) continue; struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); if (!w) { - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); ret = -ENOMEM; goto err; } @@ -303,14 +303,14 @@ static int read_btree_nodes(struct find_btree_nodes *f) struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); ret = PTR_ERR_OR_ZERO(t); if (ret) { - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); kfree(w); bch_err_msg(c, ret, "starting kthread"); break; } closure_get(&cl); - percpu_ref_get(&ca->io_ref[READ]); + enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); wake_up_process(t); } err: @@ -363,6 +363,8 @@ static int handle_overwrites(struct bch_fs *c, min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); } } + + cond_resched(); } return 0; @@ -395,7 +397,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) printbuf_reset(&buf); prt_printf(&buf, "%s: nodes found:\n", __func__); found_btree_nodes_to_text(&buf, c, f->nodes); - bch2_print_string_as_lines(KERN_INFO, buf.buf); + bch2_print_str(c, KERN_INFO, buf.buf); } sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL); @@ -424,7 +426,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) printbuf_reset(&buf); 
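In btree_node_scan.c the percpu_ref get/put on ca->io_ref becomes enumerated_ref_get()/enumerated_ref_put() with an explicit reason (BCH_DEV_READ_REF_btree_node_scan). The enumerated_ref implementation isn't shown in these hunks; the following is only a plausible minimal model, assuming it keeps a per-reason count alongside the real reference so a ref that never drops can be attributed to its holder. All names are stand-ins.

#include <stdio.h>

/* stand-ins for the BCH_DEV_READ_REF_* reasons used with ca->io_ref above */
enum dev_read_ref {
	DEV_READ_REF_btree_node_scan,
	DEV_READ_REF_NR,
};

struct enumerated_ref_model {
	unsigned long by_reason[DEV_READ_REF_NR];
};

static void ref_get(struct enumerated_ref_model *r, enum dev_read_ref why)
{
	r->by_reason[why]++;
}

static void ref_put(struct enumerated_ref_model *r, enum dev_read_ref why)
{
	r->by_reason[why]--;
}

/* the payoff: a ref that never goes to zero can name its holder */
static void ref_report(struct enumerated_ref_model *r)
{
	for (int i = 0; i < DEV_READ_REF_NR; i++)
		if (r->by_reason[i])
			printf("reason %d still holds %lu ref(s)\n", i, r->by_reason[i]);
}

int main(void)
{
	struct enumerated_ref_model r = { };

	ref_get(&r, DEV_READ_REF_btree_node_scan);
	ref_report(&r);
	ref_put(&r, DEV_READ_REF_btree_node_scan);
	return 0;
}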
prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__); found_btree_nodes_to_text(&buf, c, f->nodes); - bch2_print_string_as_lines(KERN_INFO, buf.buf); + bch2_print_str(c, KERN_INFO, buf.buf); } swap(nodes_heap, f->nodes); @@ -470,7 +472,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) printbuf_reset(&buf); prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); found_btree_nodes_to_text(&buf, c, f->nodes); - bch2_print_string_as_lines(KERN_INFO, buf.buf); + bch2_print_str(c, KERN_INFO, buf.buf); } else { bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr); } @@ -541,7 +543,7 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, struct find_btree_nodes *f = &c->found_btree_nodes; - int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); if (ret) return ret; diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 7d7e52ddde02..d9710801e3ee 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -11,6 +11,7 @@ #include "btree_write_buffer.h" #include "buckets.h" #include "disk_accounting.h" +#include "enumerated_ref.h" #include "errcode.h" #include "error.h" #include "journal.h" @@ -20,6 +21,7 @@ #include "snapshot.h" #include <linux/prefetch.h> +#include <linux/string_helpers.h> static const char * const trans_commit_flags_strs[] = { #define x(n, ...) #n, @@ -366,14 +368,15 @@ static noinline void journal_transaction_name(struct btree_trans *trans) struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); - strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); + memcpy_and_pad(l->d, JSET_ENTRY_LOG_U64s * sizeof(u64), + trans->fn, strlen(trans->fn), 0); } static inline int btree_key_can_insert(struct btree_trans *trans, struct btree *b, unsigned u64s) { if (!bch2_btree_node_insert_fits(b, u64s)) - return -BCH_ERR_btree_insert_btree_node_full; + return bch_err_throw(trans->c, btree_insert_btree_node_full); return 0; } @@ -391,9 +394,10 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); if (!new_k) { - bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", + struct bch_fs *c = trans->c; + bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", bch2_btree_id_str(path->btree_id), new_u64s); - return -BCH_ERR_ENOMEM_btree_key_cache_insert; + return bch_err_throw(c, ENOMEM_btree_key_cache_insert); } ret = bch2_trans_relock(trans) ?: @@ -429,7 +433,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags if (watermark < BCH_WATERMARK_reclaim && !test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bch2_btree_key_cache_must_wait(c)) - return -BCH_ERR_btree_insert_need_journal_reclaim; + return bch_err_throw(c, btree_insert_need_journal_reclaim); /* * bch2_varint_decode can read past the end of the buffer by at most 7 @@ -644,10 +648,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && !(flags & BCH_TRANS_COMMIT_no_journal_res)) { - if (bch2_journal_seq_verify) + if (static_branch_unlikely(&bch2_journal_seq_verify)) trans_for_each_update(trans, i) i->k->k.bversion.lo = trans->journal_res.seq; - else if (bch2_inject_invalid_keys) + else if (static_branch_unlikely(&bch2_inject_invalid_keys)) 
trans_for_each_update(trans, i) i->k->k.bversion = MAX_VERSION; } @@ -660,18 +664,17 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, h = h->next; } - struct jset_entry *entry = trans->journal_entries; + struct bkey_i *accounting; percpu_down_read(&c->mark_lock); - for (entry = trans->journal_entries; - entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); - entry = vstruct_next(entry)) - if (entry->type == BCH_JSET_ENTRY_write_buffer_keys && - entry->start->k.type == KEY_TYPE_accounting) { - ret = bch2_accounting_trans_commit_hook(trans, bkey_i_to_accounting(entry->start), flags); - if (ret) - goto revert_fs_usage; - } + for (accounting = btree_trans_subbuf_base(trans, &trans->accounting); + accounting != btree_trans_subbuf_top(trans, &trans->accounting); + accounting = bkey_next(accounting)) { + ret = bch2_accounting_trans_commit_hook(trans, + bkey_i_to_accounting(accounting), flags); + if (ret) + goto revert_fs_usage; + } percpu_up_read(&c->mark_lock); /* XXX: we only want to run this if deltas are nonzero */ @@ -695,8 +698,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit; - for (struct jset_entry *i = trans->journal_entries; - i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + for (struct jset_entry *i = btree_trans_journal_entries_start(trans); + i != btree_trans_journal_entries_top(trans); i = vstruct_next(i)) { ret = bch2_journal_entry_validate(c, NULL, i, bcachefs_metadata_version_current, @@ -751,11 +754,18 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, } memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), - trans->journal_entries, - trans->journal_entries_u64s); + btree_trans_journal_entries_start(trans), + trans->journal_entries.u64s); + + trans->journal_res.offset += trans->journal_entries.u64s; + trans->journal_res.u64s -= trans->journal_entries.u64s; - trans->journal_res.offset += trans->journal_entries_u64s; - trans->journal_res.u64s -= trans->journal_entries_u64s; + memcpy_u64s_small(bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_write_buffer_keys, + BTREE_ID_accounting, 0, + trans->accounting.u64s)->_data, + btree_trans_subbuf_base(trans, &trans->accounting), + trans->accounting.u64s); if (trans->journal_seq) *trans->journal_seq = trans->journal_res.seq; @@ -777,13 +787,10 @@ fatal_err: bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret)); percpu_down_read(&c->mark_lock); revert_fs_usage: - for (struct jset_entry *entry2 = trans->journal_entries; - entry2 != entry; - entry2 = vstruct_next(entry2)) - if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys && - entry2->start->k.type == KEY_TYPE_accounting) - bch2_accounting_trans_commit_revert(trans, - bkey_i_to_accounting(entry2->start), flags); + for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); + i != accounting; + i = bkey_next(i)) + bch2_accounting_trans_commit_revert(trans, bkey_i_to_accounting(i), flags); percpu_up_read(&c->mark_lock); return ret; } @@ -888,7 +895,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, */ if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && watermark < BCH_WATERMARK_reclaim) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; + ret = bch_err_throw(c, journal_reclaim_would_deadlock); goto out; } @@ -958,15 +965,36 @@ 
do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) return ret; } - for (struct jset_entry *i = trans->journal_entries; - i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); - i = vstruct_next(i)) + for (struct jset_entry *i = btree_trans_journal_entries_start(trans); + i != btree_trans_journal_entries_top(trans); + i = vstruct_next(i)) { if (i->type == BCH_JSET_ENTRY_btree_keys || i->type == BCH_JSET_ENTRY_write_buffer_keys) { - int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->start); - if (ret) - return ret; + jset_entry_for_each_key(i, k) { + int ret = bch2_journal_key_insert(c, i->btree_id, i->level, k); + if (ret) + return ret; + } + } + + if (i->type == BCH_JSET_ENTRY_btree_root) { + guard(mutex)(&c->btree_root_lock); + + struct btree_root *r = bch2_btree_id_root(c, i->btree_id); + + bkey_copy(&r->key, i->start); + r->level = i->level; + r->alive = true; } + } + + for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); + i != btree_trans_subbuf_top(trans, &trans->accounting); + i = bkey_next(i)) { + int ret = bch2_journal_key_insert(c, BTREE_ID_accounting, 0, i); + if (ret) + return ret; + } return 0; } @@ -984,7 +1012,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) goto out_reset; if (!trans->nr_updates && - !trans->journal_entries_u64s) + !trans->journal_entries.u64s && + !trans->accounting.u64s) goto out_reset; ret = bch2_trans_commit_run_triggers(trans); @@ -992,17 +1021,17 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) goto out_reset; if (!(flags & BCH_TRANS_COMMIT_no_check_rw) && - unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { + unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_trans))) { if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) ret = do_bch2_trans_commit_to_journal_replay(trans); else - ret = -BCH_ERR_erofs_trans_commit; + ret = bch_err_throw(c, erofs_trans_commit); goto out_reset; } EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); - trans->journal_u64s = trans->journal_entries_u64s; + trans->journal_u64s = trans->journal_entries.u64s + jset_u64s(trans->accounting.u64s); trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); if (trans->journal_transaction_names) trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); @@ -1058,7 +1087,7 @@ retry: trace_and_count(c, transaction_commit, trans, _RET_IP_); out: if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw))) - bch2_write_ref_put(c, BCH_WRITE_REF_trans); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_trans); out_reset: if (!ret) bch2_trans_downgrade(trans); @@ -1078,7 +1107,7 @@ err: * restart: */ if (flags & BCH_TRANS_COMMIT_no_journal_res) { - ret = -BCH_ERR_transaction_restart_nested; + ret = bch_err_throw(c, transaction_restart_nested); goto out; } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 023c472dc9ee..c61c4171ae50 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -139,6 +139,7 @@ struct btree { }; #define BCH_BTREE_CACHE_NOT_FREED_REASONS() \ + x(cache_reserve) \ x(lock_intent) \ x(lock_write) \ x(dirty) \ @@ -257,9 +258,6 @@ struct btree_node_iter { * * BTREE_TRIGGER_insert - @new is entering the btree * BTREE_TRIGGER_overwrite - @old is leaving the btree - * - * BTREE_TRIGGER_bucket_invalidate - signal from bucket invalidate path to alloc - * trigger */ #define BTREE_TRIGGER_FLAGS() \ x(norun) \ @@ -269,8 +267,7 @@ struct btree_node_iter { x(gc) \ x(insert) \ x(overwrite) \ - 
x(is_root) \ - x(bucket_invalidate) + x(is_root) enum { #define x(n) BTREE_ITER_FLAG_BIT_##n, @@ -477,6 +474,18 @@ struct btree_trans_paths { struct btree_path paths[]; }; +struct trans_kmalloc_trace { + unsigned long ip; + size_t bytes; +}; +typedef DARRAY(struct trans_kmalloc_trace) darray_trans_kmalloc_trace; + +struct btree_trans_subbuf { + u16 base; + u16 u64s; + u16 size;; +}; + struct btree_trans { struct bch_fs *c; @@ -488,6 +497,9 @@ struct btree_trans { void *mem; unsigned mem_top; unsigned mem_bytes; +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_trans_kmalloc_trace trans_kmalloc_trace; +#endif btree_path_idx_t nr_sorted; btree_path_idx_t nr_paths; @@ -528,9 +540,8 @@ struct btree_trans { int srcu_idx; /* update path: */ - u16 journal_entries_u64s; - u16 journal_entries_size; - struct jset_entry *journal_entries; + struct btree_trans_subbuf journal_entries; + struct btree_trans_subbuf accounting; struct btree_trans_commit_hook *hooks; struct journal_entry_pin *journal_pin; @@ -544,6 +555,8 @@ struct btree_trans { unsigned journal_u64s; unsigned extra_disk_res; /* XXX kill */ + __BKEY_PADDED(btree_path_down, BKEY_BTREE_PTR_VAL_U64s_MAX); + #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif @@ -647,13 +660,13 @@ static inline struct bset_tree *bset_tree_last(struct btree *b) static inline void * __btree_node_offset_to_ptr(const struct btree *b, u16 offset) { - return (void *) ((u64 *) b->data + 1 + offset); + return (void *) ((u64 *) b->data + offset); } static inline u16 __btree_node_ptr_to_offset(const struct btree *b, const void *p) { - u16 ret = (u64 *) p - 1 - (u64 *) b->data; + u16 ret = (u64 *) p - (u64 *) b->data; EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p); return ret; diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 1e6b7836cc01..e97e78c10f49 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -14,6 +14,8 @@ #include "snapshot.h" #include "trace.h" +#include <linux/string_helpers.h> + static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, const struct btree_insert_entry *r) { @@ -121,65 +123,44 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, } int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, - enum btree_id id, - struct bpos old_pos, - struct bpos new_pos) + enum btree_id btree, struct bpos pos, + snapshot_id_list *s) { - struct bch_fs *c = trans->c; - struct btree_iter old_iter, new_iter = {}; - struct bkey_s_c old_k, new_k; - snapshot_id_list s; - struct bkey_i *update; int ret = 0; - if (!bch2_snapshot_has_children(c, old_pos.snapshot)) - return 0; + darray_for_each(*s, id) { + pos.snapshot = *id; - darray_init(&s); - - bch2_trans_iter_init(trans, &old_iter, id, old_pos, - BTREE_ITER_not_extents| - BTREE_ITER_all_snapshots); - while ((old_k = bch2_btree_iter_prev(trans, &old_iter)).k && - !(ret = bkey_err(old_k)) && - bkey_eq(old_pos, old_k.k->p)) { - struct bpos whiteout_pos = - SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot); - - if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || - snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) - continue; - - new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos, - BTREE_ITER_not_extents| - BTREE_ITER_intent); - ret = bkey_err(new_k); + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, pos, + BTREE_ITER_not_extents| + BTREE_ITER_intent); + ret = bkey_err(k); if (ret) break; - if (new_k.k->type == KEY_TYPE_deleted) { 
- update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + if (k.k->type == KEY_TYPE_deleted) { + struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); ret = PTR_ERR_OR_ZERO(update); - if (ret) + if (ret) { + bch2_trans_iter_exit(trans, &iter); break; + } bkey_init(&update->k); - update->k.p = whiteout_pos; + update->k.p = pos; update->k.type = KEY_TYPE_whiteout; - ret = bch2_trans_update(trans, &new_iter, update, + ret = bch2_trans_update(trans, &iter, update, BTREE_UPDATE_internal_snapshot_node); } - bch2_trans_iter_exit(trans, &new_iter); + bch2_trans_iter_exit(trans, &iter); - ret = snapshot_list_add(c, &s, old_k.k->p.snapshot); if (ret) break; } - bch2_trans_iter_exit(trans, &new_iter); - bch2_trans_iter_exit(trans, &old_iter); - darray_exit(&s); + darray_exit(s); return ret; } @@ -509,8 +490,9 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, return 0; } -int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *k, enum btree_iter_update_trigger_flags flags) +int __must_check bch2_trans_update_ip(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_iter_update_trigger_flags flags, + unsigned long ip) { kmsan_check_memory(k, bkey_bytes(&k->k)); @@ -546,7 +528,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter path_idx = iter->key_cache_path; } - return bch2_trans_update_by_path(trans, path_idx, k, flags, _RET_IP_); + return bch2_trans_update_by_path(trans, path_idx, k, flags, ip); } int bch2_btree_insert_clone_trans(struct btree_trans *trans, @@ -562,30 +544,29 @@ int bch2_btree_insert_clone_trans(struct btree_trans *trans, return bch2_btree_insert_trans(trans, btree, n, 0); } -struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) +void *__bch2_trans_subbuf_alloc(struct btree_trans *trans, + struct btree_trans_subbuf *buf, + unsigned u64s) { - unsigned new_top = trans->journal_entries_u64s + u64s; - unsigned old_size = trans->journal_entries_size; + unsigned new_top = buf->u64s + u64s; + unsigned old_size = buf->size; - if (new_top > trans->journal_entries_size) { - trans->journal_entries_size = roundup_pow_of_two(new_top); - - btree_trans_stats(trans)->journal_entries_size = trans->journal_entries_size; - } + if (new_top > buf->size) + buf->size = roundup_pow_of_two(new_top); - struct jset_entry *n = - bch2_trans_kmalloc_nomemzero(trans, - trans->journal_entries_size * sizeof(u64)); + void *n = bch2_trans_kmalloc_nomemzero(trans, buf->size * sizeof(u64)); if (IS_ERR(n)) - return ERR_CAST(n); + return n; - if (trans->journal_entries) - memcpy(n, trans->journal_entries, old_size * sizeof(u64)); - trans->journal_entries = n; + if (buf->u64s) + memcpy(n, + btree_trans_subbuf_base(trans, buf), + old_size * sizeof(u64)); + buf->base = (u64 *) n - (u64 *) trans->mem; - struct jset_entry *e = btree_trans_journal_entries_top(trans); - trans->journal_entries_u64s = new_top; - return e; + void *p = btree_trans_subbuf_top(trans, buf); + buf->u64s = new_top; + return p; } int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, @@ -606,7 +587,7 @@ int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, BUG_ON(k.k->type != KEY_TYPE_deleted); if (bkey_gt(k.k->p, end)) { - ret = -BCH_ERR_ENOSPC_btree_slot; + ret = bch_err_throw(trans->c, ENOSPC_btree_slot); goto err; } @@ -826,26 +807,35 @@ int bch2_btree_bit_mod_buffered(struct btree_trans *trans, 
enum btree_id btree, return bch2_trans_update_buffered(trans, btree, &k); } -int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) +static int __bch2_trans_log_str(struct btree_trans *trans, const char *str, unsigned len) { - unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64)); - prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos); - - int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; - if (ret) - return ret; + unsigned u64s = DIV_ROUND_UP(len, sizeof(u64)); struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s)); - ret = PTR_ERR_OR_ZERO(e); + int ret = PTR_ERR_OR_ZERO(e); if (ret) return ret; struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry); journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s); - memcpy(l->d, buf->buf, buf->pos); + memcpy_and_pad(l->d, u64s * sizeof(u64), str, len, 0); return 0; } +int bch2_trans_log_str(struct btree_trans *trans, const char *str) +{ + return __bch2_trans_log_str(trans, str, strlen(str)); +} + +int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) +{ + int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; + if (ret) + return ret; + + return __bch2_trans_log_str(trans, buf->buf, buf->pos); +} + int bch2_trans_log_bkey(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_i *k) { @@ -868,7 +858,6 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, prt_vprintf(&buf, fmt, args); unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); - prt_chars(&buf, '\0', u64s * sizeof(u64) - buf.pos); int ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; if (ret) @@ -881,7 +870,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries); journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s); - memcpy(l->d, buf.buf, buf.pos); + memcpy_and_pad(l->d, u64s * sizeof(u64), buf.buf, buf.pos, 0); c->journal.early_journal_entries.nr += jset_u64s(u64s); } else { ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags, diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 568e56c91190..9feef1dc4de5 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -4,6 +4,7 @@ #include "btree_iter.h" #include "journal.h" +#include "snapshot.h" struct bch_fs; struct btree; @@ -74,7 +75,7 @@ static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans, } int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, - struct bpos, struct bpos); + struct bpos, snapshot_id_list *); /* * For use when splitting extents in existing snapshots: @@ -88,11 +89,20 @@ static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans, struct bpos old_pos, struct bpos new_pos) { + BUG_ON(old_pos.snapshot != new_pos.snapshot); + if (!btree_type_has_snapshots(btree) || bkey_eq(old_pos, new_pos)) return 0; - return __bch2_insert_snapshot_whiteouts(trans, btree, old_pos, new_pos); + snapshot_id_list s; + int ret = bch2_get_snapshot_overwrites(trans, btree, old_pos, &s); + if (ret) + return ret; + + return s.nr + ? 
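The log-entry helpers above size strings in u64s and use memcpy_and_pad() to zero-fill up to the next 8-byte boundary, replacing strncpy plus manual padding. A small standalone illustration of that padding arithmetic using plain libc (not the kernel helper):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* lay @str out the way a u64-granular journal log entry is: zero padded */
static size_t log_str_u64s(uint64_t *dst, size_t dst_u64s, const char *str)
{
	size_t len  = strlen(str);
	size_t u64s = (len + sizeof(uint64_t) - 1) / sizeof(uint64_t);

	if (u64s > dst_u64s)
		return 0;

	memset(dst, 0, u64s * sizeof(uint64_t));	/* padding bytes are zero */
	memcpy(dst, str, len);				/* message goes in front */
	return u64s;
}

int main(void)
{
	uint64_t buf[8];
	size_t u64s = log_str_u64s(buf, 8, "transaction name");

	printf("%zu u64s used\n", u64s);	/* 16 chars -> 2 u64s, no pad bytes */
	return 0;
}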
__bch2_insert_snapshot_whiteouts(trans, btree, new_pos, &s) + : 0; } int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *, @@ -102,26 +112,60 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter * int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, enum btree_id, struct bpos); -int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, - struct bkey_i *, enum btree_iter_update_trigger_flags); +int __must_check bch2_trans_update_ip(struct btree_trans *, struct btree_iter *, + struct bkey_i *, enum btree_iter_update_trigger_flags, + unsigned long); + +static inline int __must_check +bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_iter_update_trigger_flags flags) +{ + return bch2_trans_update_ip(trans, iter, k, flags, _THIS_IP_); +} + +static inline void *btree_trans_subbuf_base(struct btree_trans *trans, + struct btree_trans_subbuf *buf) +{ + return (u64 *) trans->mem + buf->base; +} + +static inline void *btree_trans_subbuf_top(struct btree_trans *trans, + struct btree_trans_subbuf *buf) +{ + return (u64 *) trans->mem + buf->base + buf->u64s; +} + +void *__bch2_trans_subbuf_alloc(struct btree_trans *, + struct btree_trans_subbuf *, + unsigned); -struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned); +static inline void * +bch2_trans_subbuf_alloc(struct btree_trans *trans, + struct btree_trans_subbuf *buf, + unsigned u64s) +{ + if (buf->u64s + u64s > buf->size) + return __bch2_trans_subbuf_alloc(trans, buf, u64s); + + void *p = btree_trans_subbuf_top(trans, buf); + buf->u64s += u64s; + return p; +} + +static inline struct jset_entry *btree_trans_journal_entries_start(struct btree_trans *trans) +{ + return btree_trans_subbuf_base(trans, &trans->journal_entries); +} static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans) { - return (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + return btree_trans_subbuf_top(trans, &trans->journal_entries); } static inline struct jset_entry * bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) { - if (!trans->journal_entries || - trans->journal_entries_u64s + u64s > trans->journal_entries_size) - return __bch2_trans_jset_entry_alloc(trans, u64s); - - struct jset_entry *e = btree_trans_journal_entries_top(trans); - trans->journal_entries_u64s += u64s; - return e; + return bch2_trans_subbuf_alloc(trans, &trans->journal_entries, u64s); } int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *); @@ -135,6 +179,8 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr { kmsan_check_memory(k, bkey_bytes(&k->k)); + EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); + if (unlikely(!btree_type_uses_write_buffer(btree))) { int ret = bch2_btree_write_buffer_insert_err(trans, btree, k); dump_stack(); @@ -169,6 +215,7 @@ void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *, unsigned); +int bch2_trans_log_str(struct btree_trans *, const char *); int bch2_trans_log_msg(struct btree_trans *, struct printbuf *); int bch2_trans_log_bkey(struct btree_trans *, enum btree_id, unsigned, struct bkey_i *); @@ -217,12 +264,15 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans) bch2_path_put(trans, i->path, true); trans->nr_updates = 0; - trans->journal_entries_u64s = 0; + 
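trans->journal_entries and the new trans->accounting are "subbufs": {base, u64s, size} triples that index into the transaction's bump-allocated mem, grown in powers of two and re-copied on overflow, as __bch2_trans_subbuf_alloc() does above. A compact userspace model of that layout with stand-in types (the real slow path grows trans->mem itself rather than failing):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct subbuf { uint16_t base, u64s, size; };	/* offsets and sizes in u64 units */

struct trans { uint64_t *mem; unsigned top, bytes_u64s; };

static void *subbuf_base(struct trans *t, struct subbuf *b) { return t->mem + b->base; }
static void *subbuf_top(struct trans *t, struct subbuf *b)  { return t->mem + b->base + b->u64s; }

/* grow to the next power of two, carve fresh arena space, move the old contents */
static void *subbuf_alloc(struct trans *t, struct subbuf *b, unsigned u64s)
{
	if (b->u64s + u64s > b->size) {
		unsigned new_size = 1;
		while (new_size < b->u64s + u64s)
			new_size <<= 1;

		if (t->top + new_size > t->bytes_u64s)
			return NULL;	/* kernel version reallocs trans->mem here */

		uint64_t *n = t->mem + t->top;
		t->top += new_size;
		memcpy(n, subbuf_base(t, b), b->size * sizeof(uint64_t));
		b->base = n - t->mem;
		b->size = new_size;
	}

	void *p = subbuf_top(t, b);
	b->u64s += u64s;
	return p;
}

int main(void)
{
	struct trans t = { .mem = calloc(1024, 8), .bytes_u64s = 1024 };
	struct subbuf journal = {};

	uint64_t *e = subbuf_alloc(&t, &journal, 4);
	if (e)
		e[0] = 42;
	printf("base %u u64s %u size %u\n", journal.base, journal.u64s, journal.size);
	free(t.mem);
	return 0;
}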
trans->journal_entries.u64s = 0; + trans->journal_entries.size = 0; + trans->accounting.u64s = 0; + trans->accounting.size = 0; trans->hooks = NULL; trans->extra_disk_res = 0; } -static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, +static __always_inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, unsigned type, unsigned min_bytes) { unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k)); @@ -245,7 +295,7 @@ static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *t return mut; } -static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) +static __always_inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) { return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0); } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 00307356d7c8..d2ecb782919b 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -14,6 +14,7 @@ #include "btree_locking.h" #include "buckets.h" #include "clock.h" +#include "enumerated_ref.h" #include "error.h" #include "extents.h" #include "io_write.h" @@ -56,8 +57,6 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) struct bkey_buf prev; int ret = 0; - printbuf_indent_add_nextline(&buf, 2); - BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, b->data->min_key)); @@ -68,20 +67,23 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) if (b == btree_node_root(c, b)) { if (!bpos_eq(b->data->min_key, POS_MIN)) { - ret = __bch2_topology_error(c, &buf); - + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "btree root with incorrect min_key: "); bch2_bpos_to_text(&buf, b->data->min_key); - log_fsck_err(trans, btree_root_bad_min_key, - "btree root with incorrect min_key: %s", buf.buf); - goto out; + prt_newline(&buf); + + bch2_count_fsck_err(c, btree_root_bad_min_key, &buf); + goto err; } if (!bpos_eq(b->data->max_key, SPOS_MAX)) { - ret = __bch2_topology_error(c, &buf); + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "btree root with incorrect max_key: "); bch2_bpos_to_text(&buf, b->data->max_key); - log_fsck_err(trans, btree_root_bad_max_key, - "btree root with incorrect max_key: %s", buf.buf); - goto out; + prt_newline(&buf); + + bch2_count_fsck_err(c, btree_root_bad_max_key, &buf); + goto err; } } @@ -99,19 +101,15 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) : bpos_successor(prev.k->k.p); if (!bpos_eq(expected_min, bp.v->min_key)) { - ret = __bch2_topology_error(c, &buf); - - prt_str(&buf, "end of prev node doesn't match start of next node\nin "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, " node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, "end of prev node doesn't match start of next node"); prt_str(&buf, "\nprev "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); prt_str(&buf, "\nnext "); bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); - log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf); - goto out; + bch2_count_fsck_err(c, btree_node_topology_bad_min_key, &buf); + goto err; } bch2_bkey_buf_reassemble(&prev, c, k); @@ -119,32 +117,34 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) } if 
(bkey_deleted(&prev.k->k)) { - ret = __bch2_topology_error(c, &buf); - - prt_str(&buf, "empty interior node\nin "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, " node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - - log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); - } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { - ret = __bch2_topology_error(c, &buf); + prt_printf(&buf, "empty interior node\n"); + bch2_count_fsck_err(c, btree_node_topology_empty_interior_node, &buf); + goto err; + } - prt_str(&buf, "last child node doesn't end at end of parent node\nin "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, " node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_str(&buf, "\nlast key "); + if (!bpos_eq(prev.k->k.p, b->key.k.p)) { + prt_str(&buf, "last child node doesn't end at end of parent node\nchild: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); + prt_newline(&buf); - log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf); + bch2_count_fsck_err(c, btree_node_topology_bad_max_key, &buf); + goto err; } out: -fsck_err: bch2_btree_and_journal_iter_exit(&iter); bch2_bkey_buf_exit(&prev, c); printbuf_exit(&buf); return ret; +err: + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_char(&buf, ' '); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_newline(&buf); + + ret = __bch2_topology_error(c, &buf); + bch2_print_str(c, KERN_ERR, buf.buf); + BUG_ON(!ret); + goto out; } /* Calculate ideal packed bkey format for new btree nodes: */ @@ -284,6 +284,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct disk_reservation *res, struct closure *cl, bool interior_node, + unsigned target, unsigned flags) { struct bch_fs *c = trans->c; @@ -317,6 +318,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, mutex_unlock(&c->btree_reserve_cache_lock); retry: ret = bch2_alloc_sectors_start_trans(trans, + target ?: c->opts.metadata_target ?: c->opts.foreground_target, 0, @@ -325,7 +327,9 @@ retry: res->nr_replicas, min(res->nr_replicas, c->opts.metadata_replicas_required), - watermark, 0, cl, &wp); + watermark, + target ? BCH_WRITE_only_specified_devs : 0, + cl, &wp); if (unlikely(ret)) goto err; @@ -505,6 +509,7 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans * static int bch2_btree_reserve_get(struct btree_trans *trans, struct btree_update *as, unsigned nr_nodes[2], + unsigned target, unsigned flags, struct closure *cl) { @@ -527,7 +532,7 @@ static int bch2_btree_reserve_get(struct btree_trans *trans, while (p->nr < nr_nodes[interior]) { b = __bch2_btree_node_alloc(trans, &as->disk_res, cl, - interior, flags); + interior, target, flags); if (IS_ERR(b)) { ret = PTR_ERR(b); goto err; @@ -679,12 +684,31 @@ static void btree_update_nodes_written(struct btree_update *as) /* * Wait for any in flight writes to finish before we free the old nodes - * on disk: + * on disk. But we haven't pinned those old nodes in the btree cache, + * they might have already been evicted. + * + * The update we're completing deleted references to those nodes from the + * btree, so we know if they've been evicted they can't be pulled back in. + * We just have to check if the nodes we have pointers to are still those + * old nodes, and haven't been reused. 
+ * + * This can't be done locklessly because the data buffer might have been + * vmalloc allocated, and they're not RCU freed. We also need the + * __no_kmsan_checks annotation because even with the btree node read + * lock, nothing tells us that the data buffer has been initialized (if + * the btree node has been reused for a different node, and the data + * buffer swapped for a new data buffer). */ for (i = 0; i < as->nr_old_nodes; i++) { b = as->old_nodes[i]; - if (btree_node_seq_matches(b, as->old_nodes_seq[i])) + bch2_trans_begin(trans); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); + bool seq_matches = btree_node_seq_matches(b, as->old_nodes_seq[i]); + six_unlock_read(&b->c.lock); + bch2_trans_unlock_long(trans); + + if (seq_matches) wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, TASK_UNINTERRUPTIBLE); } @@ -1116,7 +1140,8 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * static struct btree_update * bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, - unsigned level_start, bool split, unsigned flags) + unsigned level_start, bool split, + unsigned target, unsigned flags) { struct bch_fs *c = trans->c; struct btree_update *as; @@ -1226,7 +1251,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (ret) goto err; - ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL); + ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, NULL); if (bch2_err_matches(ret, ENOSPC) || bch2_err_matches(ret, ENOMEM)) { struct closure cl; @@ -1238,14 +1263,14 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (bch2_err_matches(ret, ENOSPC) && (flags & BCH_TRANS_COMMIT_journal_reclaim) && watermark < BCH_WATERMARK_reclaim) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; + ret = bch_err_throw(c, journal_reclaim_would_deadlock); goto err; } closure_init_stack(&cl); do { - ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); + ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, &cl); bch2_trans_unlock(trans); bch2_wait_on_allocator(c, &cl); @@ -1806,10 +1831,10 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t __func__, b->c.level); bch2_btree_update_to_text(&buf, as); bch2_btree_path_to_text(&buf, trans, path_idx); + bch2_fs_emergency_read_only2(c, &buf); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); - bch2_fs_emergency_read_only(c); return -EIO; } @@ -1878,7 +1903,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans, as = bch2_btree_update_start(trans, trans->paths + path, trans->paths[path].level, - true, flags); + true, 0, flags); if (IS_ERR(as)) return PTR_ERR(as); @@ -1948,7 +1973,8 @@ int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, return bch2_btree_split_leaf(trans, path, flags); struct btree_update *as = - bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags); + bch2_btree_update_start(trans, trans->paths + path, b->c.level, + true, 0, flags); if (IS_ERR(as)) return PTR_ERR(as); @@ -2077,7 +2103,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, parent = btree_node_parent(trans->paths + path, b); as = bch2_btree_update_start(trans, trans->paths + path, level, false, - BCH_TRANS_COMMIT_no_enospc|flags); + 0, BCH_TRANS_COMMIT_no_enospc|flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto err; @@ -2170,7 +2196,7 @@ static int 
get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter, if (btree_iter_path(trans, iter)->l[b->c.level].b != b) { /* node has been freed: */ BUG_ON(!btree_node_dying(b)); - ret = -BCH_ERR_btree_node_dying; + ret = bch_err_throw(trans->c, btree_node_dying); goto err; } @@ -2184,6 +2210,7 @@ err: int bch2_btree_node_rewrite(struct btree_trans *trans, struct btree_iter *iter, struct btree *b, + unsigned target, unsigned flags) { struct bch_fs *c = trans->c; @@ -2196,7 +2223,8 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, struct btree_path *path = btree_iter_path(trans, iter); parent = btree_node_parent(path, b); - as = bch2_btree_update_start(trans, path, b->c.level, false, flags); + as = bch2_btree_update_start(trans, path, b->c.level, + false, target, flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto out; @@ -2261,7 +2289,7 @@ static int bch2_btree_node_rewrite_key(struct btree_trans *trans, bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k); ret = found - ? bch2_btree_node_rewrite(trans, &iter, b, flags) + ? bch2_btree_node_rewrite(trans, &iter, b, 0, flags) : -ENOENT; out: bch2_trans_iter_exit(trans, &iter); @@ -2270,7 +2298,9 @@ out: int bch2_btree_node_rewrite_pos(struct btree_trans *trans, enum btree_id btree, unsigned level, - struct bpos pos, unsigned flags) + struct bpos pos, + unsigned target, + unsigned flags) { BUG_ON(!level); @@ -2282,7 +2312,7 @@ int bch2_btree_node_rewrite_pos(struct btree_trans *trans, if (ret) goto err; - ret = bch2_btree_node_rewrite(trans, &iter, b, flags); + ret = bch2_btree_node_rewrite(trans, &iter, b, target, flags); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -2296,7 +2326,7 @@ int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans, if (ret) return ret == -BCH_ERR_btree_node_dying ? 
0 : ret; - ret = bch2_btree_node_rewrite(trans, &iter, b, flags); + ret = bch2_btree_node_rewrite(trans, &iter, b, 0, flags); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -2330,7 +2360,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work) closure_wake_up(&c->btree_node_rewrites_wait); bch2_bkey_buf_exit(&a->key, c); - bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_node_rewrite); kfree(a); } @@ -2351,8 +2381,8 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) bool now = false, pending = false; spin_lock(&c->btree_node_rewrites_lock); - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay && - bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { + if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay) && + enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_node_rewrite)) { list_add(&a->list, &c->btree_node_rewrites); now = true; } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { @@ -2391,7 +2421,7 @@ void bch2_do_pending_node_rewrites(struct bch_fs *c) if (!a) break; - bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); + enumerated_ref_get(&c->writes, BCH_WRITE_REF_node_rewrite); queue_work(c->btree_node_rewrite_worker, &a->work); } } @@ -2780,16 +2810,16 @@ int bch2_fs_btree_interior_update_init(struct bch_fs *c) c->btree_interior_update_worker = alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8); if (!c->btree_interior_update_worker) - return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; + return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init); c->btree_node_rewrite_worker = alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND); if (!c->btree_node_rewrite_worker) - return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; + return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init); if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, sizeof(struct btree_update))) - return -BCH_ERR_ENOMEM_btree_interior_update_pool_init; + return bch_err_throw(c, ENOMEM_btree_interior_update_pool_init); return 0; } diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index be71cd73b864..7fe793788a79 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -144,7 +144,7 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, EBUG_ON(!btree_node_locked(path, level)); - if (bch2_btree_node_merging_disabled) + if (static_branch_unlikely(&bch2_btree_node_merging_disabled)) return 0; b = path->l[level].b; @@ -168,10 +168,10 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, } int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, - struct btree *, unsigned); + struct btree *, unsigned, unsigned); int bch2_btree_node_rewrite_pos(struct btree_trans *, enum btree_id, unsigned, - struct bpos, unsigned); + struct bpos, unsigned, unsigned); int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *, struct btree *, unsigned); diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 0941fb2c026d..90b21e61d2b6 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -7,6 +7,7 @@ #include "btree_update_interior.h" #include "btree_write_buffer.h" #include "disk_accounting.h" +#include "enumerated_ref.h" #include "error.h" #include "extents.h" #include "journal.h" @@ -181,6 +182,8 @@ static inline int wb_flush_one(struct btree_trans *trans, 
struct btree_iter *ite return wb_flush_one_slowpath(trans, iter, wb); } + EBUG_ON(!bpos_eq(wb->k.k.p, path->pos)); + bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq); (*fast)++; return 0; @@ -391,7 +394,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) bool accounting_accumulated = false; do { if (race_fault()) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; + ret = bch_err_throw(c, journal_reclaim_would_deadlock); break; } @@ -629,11 +632,11 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) { struct bch_fs *c = trans->c; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer)) - return -BCH_ERR_erofs_no_writes; + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer)) + return bch_err_throw(c, erofs_no_writes); int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans); - bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); return ret; } @@ -673,7 +676,7 @@ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, goto err; bch2_bkey_buf_copy(last_flushed, c, tmp.k); - ret = -BCH_ERR_transaction_restart_write_buffer_flush; + ret = bch_err_throw(c, transaction_restart_write_buffer_flush); } err: bch2_bkey_buf_exit(&tmp, c); @@ -692,7 +695,7 @@ static void bch2_btree_write_buffer_flush_work(struct work_struct *work) } while (!ret && bch2_btree_write_buffer_should_flush(c)); mutex_unlock(&wb->flushing.lock); - bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); } static void wb_accounting_sort(struct btree_write_buffer *wb) @@ -821,9 +824,9 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_ bch2_journal_pin_drop(&c->journal, &dst->wb->pin); if (bch2_btree_write_buffer_should_flush(c) && - __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) && + __enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer) && !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); if (dst->wb == &wb->flushing) mutex_unlock(&wb->flushing.lock); @@ -866,13 +869,18 @@ void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) darray_exit(&wb->inc.keys); } -int bch2_fs_btree_write_buffer_init(struct bch_fs *c) +void bch2_fs_btree_write_buffer_init_early(struct bch_fs *c) { struct btree_write_buffer *wb = &c->btree_write_buffer; mutex_init(&wb->inc.lock); mutex_init(&wb->flushing.lock); INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work); +} + +int bch2_fs_btree_write_buffer_init(struct bch_fs *c) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; /* Will be resized by journal as needed: */ unsigned initial_size = 1 << 16; diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h index d535cea28bde..05f56fd1eed0 100644 --- a/fs/bcachefs/btree_write_buffer.h +++ b/fs/bcachefs/btree_write_buffer.h @@ -101,6 +101,7 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_t int bch2_btree_write_buffer_resize(struct bch_fs *, size_t); void bch2_fs_btree_write_buffer_exit(struct bch_fs *); +void bch2_fs_btree_write_buffer_init_early(struct bch_fs *); int bch2_fs_btree_write_buffer_init(struct bch_fs *); #endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 
31fbc2716d8b..f25903c10e8a 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -156,10 +156,14 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, g->gen_valid = true; g->gen = p.ptr.gen; } else { + /* this pointer will be dropped */ *do_update = true; + goto out; } } + /* g->gen_valid == true */ + if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, trans, ptr_gen_newer_than_bucket_gen, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" @@ -172,15 +176,13 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, if (!p.ptr.cached && (g->data_type != BCH_DATA_btree || data_type == BCH_DATA_btree)) { - g->gen_valid = true; - g->gen = p.ptr.gen; - g->data_type = 0; + g->data_type = data_type; g->stripe_sectors = 0; g->dirty_sectors = 0; g->cached_sectors = 0; - } else { - *do_update = true; } + + *do_update = true; } if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, @@ -217,9 +219,22 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, bch2_data_type_str(data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - if (data_type == BCH_DATA_btree) { - g->gen_valid = true; - g->gen = p.ptr.gen; + if (!p.ptr.cached && + data_type == BCH_DATA_btree) { + switch (g->data_type) { + case BCH_DATA_sb: + bch_err(c, "btree and superblock in the same bucket - cannot repair"); + ret = bch_err_throw(c, fsck_repair_unimplemented); + goto out; + case BCH_DATA_journal: + ret = bch2_dev_journal_bucket_delete(ca, PTR_BUCKET_NR(ca, &p.ptr)); + bch_err_msg(c, ret, "error deleting journal bucket %zu", + PTR_BUCKET_NR(ca, &p.ptr)); + if (ret) + goto out; + break; + } + g->data_type = data_type; g->stripe_sectors = 0; g->dirty_sectors = 0; @@ -269,6 +284,9 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret = 0; + /* We don't yet do btree key updates correctly for when we're RW */ + BUG_ON(test_bit(BCH_FS_rw, &c->flags)); + bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) { ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update); if (ret) @@ -276,20 +294,13 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, } if (do_update) { - if (flags & BTREE_TRIGGER_is_root) { - bch_err(c, "cannot update btree roots yet"); - ret = -EINVAL; - goto err; - } - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); ret = PTR_ERR_OR_ZERO(new); if (ret) goto err; - rcu_read_lock(); - bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev)); - rcu_read_unlock(); + scoped_guard(rcu) + bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev)); if (level) { /* @@ -298,14 +309,11 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, * sort it out: */ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - rcu_read_lock(); - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - struct bucket *g = PTR_GC_BUCKET(ca, ptr); - - ptr->gen = g->gen; - } - rcu_read_unlock(); + scoped_guard(rcu) + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + ptr->gen = PTR_GC_BUCKET(ca, ptr)->gen; + } } else { struct bkey_ptrs ptrs; union bch_extent_entry *entry; @@ -369,19 +377,41 @@ found: bch_info(c, "new key %s", buf.buf); } - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, - BTREE_ITER_intent|BTREE_ITER_all_snapshots); - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, new, - BTREE_UPDATE_internal_snapshot_node| - BTREE_TRIGGER_norun); - 
bch2_trans_iter_exit(trans, &iter); - if (ret) - goto err; + if (!(flags & BTREE_TRIGGER_is_root)) { + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, + BTREE_ITER_intent|BTREE_ITER_all_snapshots); + ret = bch2_btree_iter_traverse(trans, &iter) ?: + bch2_trans_update(trans, &iter, new, + BTREE_UPDATE_internal_snapshot_node| + BTREE_TRIGGER_norun); + bch2_trans_iter_exit(trans, &iter); + if (ret) + goto err; + + if (level) + bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); + } else { + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, + jset_u64s(new->k.u64s)); + ret = PTR_ERR_OR_ZERO(e); + if (ret) + goto err; - if (level) - bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); + journal_entry_set(e, + BCH_JSET_ENTRY_btree_root, + btree, level - 1, + new, new->k.u64s); + + /* + * no locking, we're single threaded and not rw yet, see + * the big assertion above that we repeat here: + */ + BUG_ON(test_bit(BCH_FS_rw, &c->flags)); + + struct btree *b = bch2_btree_id_root(c, btree)->b; + bkey_copy(&b->key, new); + } } err: printbuf_exit(&buf); @@ -392,29 +422,32 @@ static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf struct bkey_s_c k, bool insert, enum bch_sb_error_id id) { struct bch_fs *c = trans->c; - bool repeat = false, print = true, suppress = false; prt_printf(buf, "\nwhile marking "); bch2_bkey_val_to_text(buf, c, k); prt_newline(buf); - __bch2_count_fsck_err(c, id, buf->buf, &repeat, &print, &suppress); + bool print = __bch2_count_fsck_err(c, id, buf); - int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + int ret = bch2_run_explicit_recovery_pass(c, buf, + BCH_RECOVERY_PASS_check_allocations, 0); if (insert) { - print = true; - suppress = false; - bch2_trans_updates_to_text(buf, trans); __bch2_inconsistent_error(c, buf); - ret = -BCH_ERR_bucket_ref_update; + /* + * If we're in recovery, run_explicit_recovery_pass might give + * us an error code for rewinding recovery + */ + if (!ret) + ret = bch_err_throw(c, bucket_ref_update); + } else { + /* Always ignore overwrite errors, so that deletion works */ + ret = 0; } - if (suppress) - prt_printf(buf, "Ratelimiting new instances of previous error\n"); - if (print) - bch2_print_string_as_lines(KERN_ERR, buf->buf); + if (print || insert) + bch2_print_str(c, KERN_ERR, buf->buf); return ret; } @@ -599,7 +632,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); if (unlikely(!ca)) { if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID) - ret = -BCH_ERR_trigger_pointer; + ret = bch_err_throw(c, trigger_pointer); goto err; } @@ -607,7 +640,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (!bucket_valid(ca, bucket.offset)) { if (insert) { bch2_dev_bucket_missing(ca, bucket.offset); - ret = -BCH_ERR_trigger_pointer; + ret = bch_err_throw(c, trigger_pointer); } goto err; } @@ -629,7 +662,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", p.ptr.dev, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = -BCH_ERR_trigger_pointer; + ret = bch_err_throw(c, trigger_pointer); goto err; } @@ -655,6 +688,8 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, s64 sectors, enum btree_iter_update_trigger_flags flags) { + struct bch_fs *c = trans->c; + if (flags & BTREE_TRIGGER_transactional) { struct btree_iter iter; struct 
bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter, @@ -672,7 +707,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, bch2_trans_inconsistent(trans, "stripe pointer doesn't match stripe %llu", (u64) p.ec.idx); - ret = -BCH_ERR_trigger_stripe_pointer; + ret = bch_err_throw(c, trigger_stripe_pointer); goto err; } @@ -692,13 +727,11 @@ err: } if (flags & BTREE_TRIGGER_gc) { - struct bch_fs *c = trans->c; - struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL); if (!m) { bch_err(c, "error allocating memory for gc_stripes, idx %llu", (u64) p.ec.idx); - return -BCH_ERR_ENOMEM_mark_stripe_ptr; + return bch_err_throw(c, ENOMEM_mark_stripe_ptr); } gc_stripe_lock(m); @@ -711,9 +744,9 @@ err: (u64) p.ec.idx); bch2_bkey_val_to_text(&buf, c, k); __bch2_inconsistent_error(c, &buf); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); - return -BCH_ERR_trigger_stripe_pointer; + return bch_err_throw(c, trigger_stripe_pointer); } m->block_sectors[p.ec.block] += sectors; @@ -736,8 +769,7 @@ err: static int __trigger_extent(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c k, - enum btree_iter_update_trigger_flags flags, - s64 *replicas_sectors) + enum btree_iter_update_trigger_flags flags) { bool gc = flags & BTREE_TRIGGER_gc; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -748,6 +780,8 @@ static int __trigger_extent(struct btree_trans *trans, : BCH_DATA_user; int ret = 0; + s64 replicas_sectors = 0; + struct disk_accounting_pos acc_replicas_key; memset(&acc_replicas_key, 0, sizeof(acc_replicas_key)); acc_replicas_key.type = BCH_DISK_ACCOUNTING_replicas; @@ -774,7 +808,7 @@ static int __trigger_extent(struct btree_trans *trans, if (ret) return ret; } else if (!p.has_ec) { - *replicas_sectors += disk_sectors; + replicas_sectors += disk_sectors; replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev); } else { ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); @@ -812,13 +846,13 @@ static int __trigger_extent(struct btree_trans *trans, } if (acc_replicas_key.replicas.nr_devs) { - ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc); if (ret) return ret; } if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) { - ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, snapshot, k.k->p.snapshot); + ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, snapshot, k.k->p.snapshot); if (ret) return ret; } @@ -834,7 +868,7 @@ static int __trigger_extent(struct btree_trans *trans, } if (level) { - ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, btree, btree_id); + ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, btree, btree_id); if (ret) return ret; } else { @@ -843,7 +877,7 @@ static int __trigger_extent(struct btree_trans *trans, s64 v[3] = { insert ? 1 : -1, insert ? 
k.k->size : -((s64) k.k->size), - *replicas_sectors, + replicas_sectors, }; ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode); if (ret) @@ -875,20 +909,16 @@ int bch2_trigger_extent(struct btree_trans *trans, return 0; if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { - s64 old_replicas_sectors = 0, new_replicas_sectors = 0; - if (old.k->type) { int ret = __trigger_extent(trans, btree, level, old, - flags & ~BTREE_TRIGGER_insert, - &old_replicas_sectors); + flags & ~BTREE_TRIGGER_insert); if (ret) return ret; } if (new.k->type) { int ret = __trigger_extent(trans, btree, level, new.s_c, - flags & ~BTREE_TRIGGER_overwrite, - &new_replicas_sectors); + flags & ~BTREE_TRIGGER_overwrite); if (ret) return ret; } @@ -966,15 +996,25 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, return PTR_ERR(a); if (a->v.data_type && type && a->v.data_type != type) { - bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); - log_fsck_err(trans, bucket_metadata_type_mismatch, - "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" - "while marking %s", - iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_type_str(a->v.data_type), - bch2_data_type_str(type), - bch2_data_type_str(type)); - ret = -BCH_ERR_metadata_bucket_inconsistency; + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s\n", + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_type_str(a->v.data_type), + bch2_data_type_str(type), + bch2_data_type_str(type)); + + bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf); + + ret = bch2_run_explicit_recovery_pass(c, &buf, + BCH_RECOVERY_PASS_check_allocations, 0); + + /* Always print, this is always fatal */ + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + if (!ret) + ret = bch_err_throw(c, metadata_bucket_inconsistency); goto err; } @@ -985,7 +1025,6 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, ret = bch2_trans_update(trans, &iter, &a->k_i, 0); } err: -fsck_err: bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1028,7 +1067,7 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev * err_unlock: bucket_unlock(g); err: - return -BCH_ERR_metadata_bucket_inconsistency; + return bch_err_throw(c, metadata_bucket_inconsistency); } int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, @@ -1143,10 +1182,10 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca, int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c, enum btree_iter_update_trigger_flags flags) { - for_each_online_member(c, ca) { + for_each_online_member(c, ca, BCH_DEV_READ_REF_trans_mark_dev_sbs) { int ret = bch2_trans_mark_dev_sb(c, ca, flags); if (ret) { - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_trans_mark_dev_sbs); return ret; } } @@ -1243,7 +1282,7 @@ recalculate: ret = 0; } else { atomic64_set(&c->sectors_available, sectors_available); - ret = -BCH_ERR_ENOSPC_disk_reservation; + ret = bch_err_throw(c, ENOSPC_disk_reservation); } mutex_unlock(&c->sectors_available_lock); @@ -1272,7 +1311,7 @@ int bch2_buckets_nouse_alloc(struct bch_fs *c) GFP_KERNEL|__GFP_ZERO); if (!ca->buckets_nouse) { bch2_dev_put(ca); - return -BCH_ERR_ENOMEM_buckets_nouse; + return bch_err_throw(c, ENOMEM_buckets_nouse); } } @@ -1297,12 +1336,12 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct 
bch_dev *ca, u64 nbuckets) lockdep_assert_held(&c->state_lock); if (resize && ca->buckets_nouse) - return -BCH_ERR_no_resize_with_buckets_nouse; + return bch_err_throw(c, no_resize_with_buckets_nouse); bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets), GFP_KERNEL|__GFP_ZERO); if (!bucket_gens) { - ret = -BCH_ERR_ENOMEM_bucket_gens; + ret = bch_err_throw(c, ENOMEM_bucket_gens); goto err; } @@ -1321,6 +1360,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) sizeof(bucket_gens->b[0]) * copy); } + ret = bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_mismatch, + ca->mi.nbuckets, nbuckets) ?: + bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_empty, + ca->mi.nbuckets, nbuckets); + rcu_assign_pointer(ca->bucket_gens, bucket_gens); bucket_gens = old_bucket_gens; @@ -1345,7 +1389,7 @@ int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { ca->usage = alloc_percpu(struct bch_dev_usage_full); if (!ca->usage) - return -BCH_ERR_ENOMEM_usage_init; + return bch_err_throw(c, ENOMEM_usage_init); return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets); } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index af1532de4a37..49a3807a5eab 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -84,10 +84,8 @@ static inline int bucket_gen_get_rcu(struct bch_dev *ca, size_t b) static inline int bucket_gen_get(struct bch_dev *ca, size_t b) { - rcu_read_lock(); - int ret = bucket_gen_get_rcu(ca, b); - rcu_read_unlock(); - return ret; + guard(rcu)(); + return bucket_gen_get_rcu(ca, b); } static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, @@ -156,10 +154,8 @@ static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ */ static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { - rcu_read_lock(); - int ret = dev_ptr_stale_rcu(ca, ptr); - rcu_read_unlock(); - return ret; + guard(rcu)(); + return dev_ptr_stale_rcu(ca, ptr); } /* Device usage: */ diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c index c8a488e6b7b8..832eff93acb6 100644 --- a/fs/bcachefs/buckets_waiting_for_journal.c +++ b/fs/bcachefs/buckets_waiting_for_journal.c @@ -108,7 +108,8 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, realloc: n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); if (!n) { - ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set; + struct bch_fs *c = container_of(b, struct bch_fs, buckets_waiting_for_journal); + ret = bch_err_throw(c, ENOMEM_buckets_waiting_for_journal_set); goto out; } diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 5891b3a1e61c..2d38466eddfd 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -613,13 +613,12 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, if (!dev) return -EINVAL; - for_each_online_member(c, ca) - if (ca->dev == dev) { - percpu_ref_put(&ca->io_ref[READ]); + guard(rcu)(); + for_each_online_member_rcu(c, ca) + if (ca->dev == dev) return ca->dev_idx; - } - return -BCH_ERR_ENOENT_dev_idx_not_found; + return bch_err_throw(c, ENOENT_dev_idx_not_found); } static long bch2_ioctl_disk_resize(struct bch_fs *c, diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index d0a34a097b80..a6795e73f0b9 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -91,7 +91,7 @@ static void bch2_checksum_update(struct bch2_checksum_state *state, const void * } } -static void bch2_chacha20_init(u32 
state[CHACHA_STATE_WORDS], +static void bch2_chacha20_init(struct chacha_state *state, const struct bch_key *key, struct nonce nonce) { u32 key_words[CHACHA_KEY_SIZE / sizeof(u32)]; @@ -106,14 +106,14 @@ static void bch2_chacha20_init(u32 state[CHACHA_STATE_WORDS], memzero_explicit(key_words, sizeof(key_words)); } -static void bch2_chacha20(const struct bch_key *key, struct nonce nonce, - void *data, size_t len) +void bch2_chacha20(const struct bch_key *key, struct nonce nonce, + void *data, size_t len) { - u32 state[CHACHA_STATE_WORDS]; + struct chacha_state state; - bch2_chacha20_init(state, key, nonce); - chacha20_crypt(state, data, data, len); - memzero_explicit(state, sizeof(state)); + bch2_chacha20_init(&state, key, nonce); + chacha20_crypt(&state, data, data, len); + chacha_zeroize_state(&state); } static void bch2_poly1305_init(struct poly1305_desc_ctx *desc, @@ -173,7 +173,7 @@ int bch2_encrypt(struct bch_fs *c, unsigned type, if (bch2_fs_inconsistent_on(!c->chacha20_key_set, c, "attempting to encrypt without encryption key")) - return -BCH_ERR_no_encryption_key; + return bch_err_throw(c, no_encryption_key); bch2_chacha20(&c->chacha20_key, nonce, data, len); return 0; @@ -257,14 +257,14 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, { struct bio_vec bv; struct bvec_iter iter; - u32 chacha_state[CHACHA_STATE_WORDS]; + struct chacha_state chacha_state; int ret = 0; if (bch2_fs_inconsistent_on(!c->chacha20_key_set, c, "attempting to encrypt without encryption key")) - return -BCH_ERR_no_encryption_key; + return bch_err_throw(c, no_encryption_key); - bch2_chacha20_init(chacha_state, &c->chacha20_key, nonce); + bch2_chacha20_init(&chacha_state, &c->chacha20_key, nonce); bio_for_each_segment(bv, bio, iter) { void *p; @@ -280,10 +280,10 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, } p = bvec_kmap_local(&bv); - chacha20_crypt(chacha_state, p, p, bv.bv_len); + chacha20_crypt(&chacha_state, p, p, bv.bv_len); kunmap_local(p); } - memzero_explicit(chacha_state, sizeof(chacha_state)); + chacha_zeroize_state(&chacha_state); return ret; } @@ -375,7 +375,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, prt_str(&buf, ")"); WARN_RATELIMIT(1, "%s", buf.buf); printbuf_exit(&buf); - return -BCH_ERR_recompute_checksum; + return bch_err_throw(c, recompute_checksum); } for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { @@ -659,7 +659,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) crypt = bch2_sb_field_resize(&c->disk_sb, crypt, sizeof(*crypt) / sizeof(u64)); if (!crypt) { - ret = -BCH_ERR_ENOSPC_sb_crypt; + ret = bch_err_throw(c, ENOSPC_sb_crypt); goto err; } diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 1310782d3ae9..7bd9cf6104ca 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -69,6 +69,8 @@ static inline void bch2_csum_err_msg(struct printbuf *out, bch2_csum_to_text(out, type, expected); } +void bch2_chacha20(const struct bch_key *, struct nonce, void *, size_t); + int bch2_request_key(struct bch_sb *, struct bch_key *); #ifndef __KERNEL__ int bch2_revoke_key(struct bch_sb *); diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index d6dd12d74d4f..8e9264b5a84e 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -53,7 +53,6 @@ void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) struct io_clock_wait { struct io_timer io_timer; - struct timer_list cpu_timer; struct task_struct *task; int expired; }; @@ -67,15 +66,6 @@ static void io_clock_wait_fn(struct io_timer *timer) 
wake_up_process(wait->task); } -static void io_clock_cpu_timeout(struct timer_list *timer) -{ - struct io_clock_wait *wait = container_of(timer, - struct io_clock_wait, cpu_timer); - - wait->expired = 1; - wake_up_process(wait->task); -} - void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until) { struct io_clock_wait wait = { @@ -90,8 +80,8 @@ void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until) bch2_io_timer_del(clock, &wait.io_timer); } -void bch2_kthread_io_clock_wait(struct io_clock *clock, - u64 io_until, unsigned long cpu_timeout) +unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *clock, + u64 io_until, unsigned long cpu_timeout) { bool kthread = (current->flags & PF_KTHREAD) != 0; struct io_clock_wait wait = { @@ -103,27 +93,26 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, bch2_io_timer_add(clock, &wait.io_timer); - timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); - - if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) - mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); - - do { - set_current_state(TASK_INTERRUPTIBLE); - if (kthread && kthread_should_stop()) - break; - - if (wait.expired) - break; - - schedule(); + set_current_state(TASK_INTERRUPTIBLE); + if (!(kthread && kthread_should_stop())) { + cpu_timeout = schedule_timeout(cpu_timeout); try_to_freeze(); - } while (0); + } __set_current_state(TASK_RUNNING); - timer_delete_sync(&wait.cpu_timer); - destroy_timer_on_stack(&wait.cpu_timer); bch2_io_timer_del(clock, &wait.io_timer); + return cpu_timeout; +} + +void bch2_kthread_io_clock_wait(struct io_clock *clock, + u64 io_until, unsigned long cpu_timeout) +{ + bool kthread = (current->flags & PF_KTHREAD) != 0; + + while (!(kthread && kthread_should_stop()) && + cpu_timeout && + atomic64_read(&clock->now) < io_until) + cpu_timeout = bch2_kthread_io_clock_wait_once(clock, io_until, cpu_timeout); } static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now) diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h index 82c79c8baf92..8769be2aa21e 100644 --- a/fs/bcachefs/clock.h +++ b/fs/bcachefs/clock.h @@ -4,6 +4,7 @@ void bch2_io_timer_add(struct io_clock *, struct io_timer *); void bch2_io_timer_del(struct io_clock *, struct io_timer *); +unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *, u64, unsigned long); void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long); void __bch2_increment_clock(struct io_clock *, u64); diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 28ed32449913..b37b1f325f0a 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -187,7 +187,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, __bch2_compression_types[crc.compression_type])) ret = bch2_check_set_has_compressed_data(c, opt); else - ret = -BCH_ERR_compression_workspace_not_initialized; + ret = bch_err_throw(c, compression_workspace_not_initialized); if (ret) goto err; } @@ -200,7 +200,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data, src_len, dst_len, dst_len); if (ret2 != dst_len) - ret = -BCH_ERR_decompress_lz4; + ret = bch_err_throw(c, decompress_lz4); break; case BCH_COMPRESSION_TYPE_gzip: { z_stream strm = { @@ -219,7 +219,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, mempool_free(workspace, workspace_pool); if (ret2 != Z_STREAM_END) - ret = -BCH_ERR_decompress_gzip; + ret = bch_err_throw(c, decompress_gzip); break; } case BCH_COMPRESSION_TYPE_zstd: 
{ @@ -227,7 +227,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, size_t real_src_len = le32_to_cpup(src_data.b); if (real_src_len > src_len - 4) { - ret = -BCH_ERR_decompress_zstd_src_len_bad; + ret = bch_err_throw(c, decompress_zstd_src_len_bad); goto err; } @@ -241,7 +241,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, mempool_free(workspace, workspace_pool); if (ret2 != dst_len) - ret = -BCH_ERR_decompress_zstd; + ret = bch_err_throw(c, decompress_zstd); break; } default: @@ -270,7 +270,7 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op, bch2_write_op_error(op, op->pos.offset, "extent too big to decompress (%u > %u)", crc->uncompressed_size << 9, c->opts.encoded_extent_max); - return -BCH_ERR_decompress_exceeded_max_encoded_extent; + return bch_err_throw(c, decompress_exceeded_max_encoded_extent); } data = __bounce_alloc(c, dst_len, WRITE); @@ -314,7 +314,7 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || crc.compressed_size << 9 > c->opts.encoded_extent_max) - return -BCH_ERR_decompress_exceeded_max_encoded_extent; + return bch_err_throw(c, decompress_exceeded_max_encoded_extent); dst_data = dst_len == dst_iter.bi_size ? __bio_map_or_bounce(c, dst, dst_iter, WRITE) @@ -656,12 +656,12 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) if (!mempool_initialized(&c->compression_bounce[READ]) && mempool_init_kvmalloc_pool(&c->compression_bounce[READ], 1, c->opts.encoded_extent_max)) - return -BCH_ERR_ENOMEM_compression_bounce_read_init; + return bch_err_throw(c, ENOMEM_compression_bounce_read_init); if (!mempool_initialized(&c->compression_bounce[WRITE]) && mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE], 1, c->opts.encoded_extent_max)) - return -BCH_ERR_ENOMEM_compression_bounce_write_init; + return bch_err_throw(c, ENOMEM_compression_bounce_write_init); for (i = compression_types; i < compression_types + ARRAY_SIZE(compression_types); @@ -675,7 +675,7 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) if (mempool_init_kvmalloc_pool( &c->compress_workspace[i->type], 1, i->compress_workspace)) - return -BCH_ERR_ENOMEM_compression_workspace_init; + return bch_err_throw(c, ENOMEM_compression_workspace_init); } return 0; @@ -714,7 +714,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, ret = match_string(bch2_compression_opts, -1, type_str); if (ret < 0 && err) - prt_str(err, "invalid compression type"); + prt_printf(err, "invalid compression type\n"); if (ret < 0) goto err; @@ -729,7 +729,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, if (!ret && level > 15) ret = -EINVAL; if (ret < 0 && err) - prt_str(err, "invalid compression level"); + prt_printf(err, "invalid compression level\n"); if (ret < 0) goto err; diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index c6151495985f..4080ee99aadd 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -8,6 +8,7 @@ * Inspired by CCAN's darray */ +#include <linux/cleanup.h> #include <linux/slab.h> #define DARRAY_PREALLOCATED(_type, _nr) \ @@ -20,7 +21,18 @@ struct { \ #define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0) typedef DARRAY(char) darray_char; -typedef DARRAY(char *) darray_str; +typedef DARRAY(char *) darray_str; +typedef DARRAY(const char *) darray_const_str; + +typedef DARRAY(u8) darray_u8; +typedef DARRAY(u16) darray_u16; +typedef DARRAY(u32) darray_u32; +typedef DARRAY(u64) darray_u64; + 
+typedef DARRAY(s8) darray_s8; +typedef DARRAY(s16) darray_s16; +typedef DARRAY(s32) darray_s32; +typedef DARRAY(s64) darray_s64; int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); @@ -76,7 +88,23 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); #define darray_remove_item(_d, _pos) \ array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data) -#define __darray_for_each(_d, _i) \ +#define darray_find_p(_d, _i, cond) \ +({ \ + typeof((_d).data) _ret = NULL; \ + \ + darray_for_each(_d, _i) \ + if (cond) { \ + _ret = _i; \ + break; \ + } \ + _ret; \ +}) + +#define darray_find(_d, _item) darray_find_p(_d, _i, *_i == _item) + +/* Iteration: */ + +#define __darray_for_each(_d, _i) \ for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++) #define darray_for_each(_d, _i) \ @@ -85,6 +113,8 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); #define darray_for_each_reverse(_d, _i) \ for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i) +/* Init/exit */ + #define darray_init(_d) \ do { \ (_d)->nr = 0; \ @@ -100,4 +130,29 @@ do { \ darray_init(_d); \ } while (0) +#define DEFINE_DARRAY_CLASS(_type) \ +DEFINE_CLASS(_type, _type, darray_exit(&(_T)), (_type) {}, void) + +#define DEFINE_DARRAY(_type) \ +typedef DARRAY(_type) darray_##_type; \ +DEFINE_DARRAY_CLASS(darray_##_type) + +#define DEFINE_DARRAY_NAMED(_name, _type) \ +typedef DARRAY(_type) _name; \ +DEFINE_DARRAY_CLASS(_name) + +DEFINE_DARRAY_CLASS(darray_char); +DEFINE_DARRAY_CLASS(darray_str) +DEFINE_DARRAY_CLASS(darray_const_str) + +DEFINE_DARRAY_CLASS(darray_u8) +DEFINE_DARRAY_CLASS(darray_u16) +DEFINE_DARRAY_CLASS(darray_u32) +DEFINE_DARRAY_CLASS(darray_u64) + +DEFINE_DARRAY_CLASS(darray_s8) +DEFINE_DARRAY_CLASS(darray_s16) +DEFINE_DARRAY_CLASS(darray_s32) +DEFINE_DARRAY_CLASS(darray_s64) + #endif /* _BCACHEFS_DARRAY_H */ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index b211c97238ab..5f1174348974 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -66,43 +66,53 @@ static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k) } } -static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_s_c k) +static noinline_for_stack +bool __bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs, + const struct bch_extent_ptr *start) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + if (!ctxt) { + bkey_for_each_ptr(ptrs, ptr) { + if (ptr == start) + break; + + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); + bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); + } + return false; + } - bkey_for_each_ptr(ptrs, ptr) { + __bkey_for_each_ptr(start, ptrs.end, ptr) { struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - if (ctxt) { - bool locked; - - move_ctxt_wait_event(ctxt, - (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) || - list_empty(&ctxt->ios)); + bool locked; + move_ctxt_wait_event(ctxt, + (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) || + list_empty(&ctxt->ios)); + if (!locked) + bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); + } + return true; +} - if (!locked) - bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); - } else { - if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) { - bkey_for_each_ptr(ptrs, ptr2) { - if (ptr2 == ptr) - break; +static bool bkey_nocow_lock(struct bch_fs *c, 
struct moving_context *ctxt, struct bkey_ptrs_c ptrs) +{ + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - ca = bch2_dev_have_ref(c, ptr2->dev); - bucket = PTR_BUCKET_POS(ca, ptr2); - bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); - } - return false; - } - } + if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) + return __bkey_nocow_lock(c, ctxt, ptrs, ptr); } + return true; } -static noinline void trace_io_move_finish2(struct data_update *u, - struct bkey_i *new, - struct bkey_i *insert) +noinline_for_stack +static void trace_io_move_finish2(struct data_update *u, + struct bkey_i *new, + struct bkey_i *insert) { struct bch_fs *c = u->op.c; struct printbuf buf = PRINTBUF; @@ -124,6 +134,7 @@ static noinline void trace_io_move_finish2(struct data_update *u, printbuf_exit(&buf); } +noinline_for_stack static void trace_io_move_fail2(struct data_update *m, struct bkey_s_c new, struct bkey_s_c wrote, @@ -179,24 +190,84 @@ static void trace_io_move_fail2(struct data_update *m, printbuf_exit(&buf); } +noinline_for_stack +static void trace_data_update2(struct data_update *m, + struct bkey_s_c old, struct bkey_s_c k, + struct bkey_i *insert) +{ + struct bch_fs *c = m->op.c; + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + prt_str(&buf, "\nk: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + + trace_data_update(c, buf.buf); + printbuf_exit(&buf); +} + +noinline_for_stack +static void trace_io_move_created_rebalance2(struct data_update *m, + struct bkey_s_c old, struct bkey_s_c k, + struct bkey_i *insert) +{ + struct bch_fs *c = m->op.c; + struct printbuf buf = PRINTBUF; + + bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); + + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + prt_str(&buf, "\nk: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + + trace_io_move_created_rebalance(c, buf.buf); + printbuf_exit(&buf); + + this_cpu_inc(c->counters[BCH_COUNTER_io_move_created_rebalance]); +} + +noinline_for_stack +static int data_update_invalid_bkey(struct data_update *m, + struct bkey_s_c old, struct bkey_s_c k, + struct bkey_i *insert) +{ + struct bch_fs *c = m->op.c; + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_str(&buf, "about to insert invalid key in data update path"); + prt_printf(&buf, "\nop.nonce: %u", m->op.nonce); + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + prt_str(&buf, "\nk: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + + bch2_fs_emergency_read_only2(c, &buf); + + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + + return bch_err_throw(c, invalid_bkey); +} + static int __bch2_data_update_index_update(struct btree_trans *trans, struct bch_write_op *op) { struct bch_fs *c = op->c; struct btree_iter iter; - struct data_update *m = - container_of(op, struct data_update, op); - struct keylist *keys = &op->insert_keys; - struct bkey_buf _new, _insert; - struct printbuf journal_msg = PRINTBUF; + struct data_update *m = container_of(op, struct data_update, op); int ret = 0; - bch2_bkey_buf_init(&_new); - bch2_bkey_buf_init(&_insert); - bch2_bkey_buf_realloc(&_insert, c, U8_MAX); - bch2_trans_iter_init(trans, 
&iter, m->btree_id, - bkey_start_pos(&bch2_keylist_front(keys)->k), + bkey_start_pos(&bch2_keylist_front(&op->insert_keys)->k), BTREE_ITER_slots|BTREE_ITER_intent); while (1) { @@ -221,19 +292,30 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, if (ret) goto err; - new = bkey_i_to_extent(bch2_keylist_front(keys)); + new = bkey_i_to_extent(bch2_keylist_front(&op->insert_keys)); if (!bch2_extents_match(k, old)) { trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), - NULL, "no match:"); + NULL, "no match:"); goto nowork; } - bkey_reassemble(_insert.k, k); - insert = _insert.k; + insert = bch2_trans_kmalloc(trans, + bkey_bytes(k.k) + + bkey_val_bytes(&new->k) + + sizeof(struct bch_extent_rebalance)); + ret = PTR_ERR_OR_ZERO(insert); + if (ret) + goto err; - bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys)); - new = bkey_i_to_extent(_new.k); + bkey_reassemble(insert, k); + + new = bch2_trans_kmalloc(trans, bkey_bytes(&new->k)); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto err; + + bkey_copy(&new->k_i, bch2_keylist_front(&op->insert_keys)); bch2_cut_front(iter.pos, &new->k_i); bch2_cut_front(iter.pos, insert); @@ -294,21 +376,21 @@ restart_drop_conflicting_replicas: bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i)); /* Now, drop excess replicas: */ - rcu_read_lock(); + scoped_guard(rcu) { restart_drop_extra_replicas: - bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { - unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); + bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { + unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); - if (!p.ptr.cached && - durability - ptr_durability >= m->op.opts.data_replicas) { - durability -= ptr_durability; + if (!p.ptr.cached && + durability - ptr_durability >= m->op.opts.data_replicas) { + durability -= ptr_durability; - bch2_extent_ptr_set_cached(c, &m->op.opts, - bkey_i_to_s(insert), &entry->ptr); - goto restart_drop_extra_replicas; + bch2_extent_ptr_set_cached(c, &m->op.opts, + bkey_i_to_s(insert), &entry->ptr); + goto restart_drop_extra_replicas; + } } } - rcu_read_unlock(); /* Finally, add the pointers we just wrote: */ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) @@ -346,44 +428,12 @@ restart_drop_extra_replicas: .btree = m->btree_id, .flags = BCH_VALIDATE_commit, }); - if (invalid) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "about to insert invalid key in data update path"); - prt_printf(&buf, "\nop.nonce: %u", m->op.nonce); - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - prt_str(&buf, "\nk: "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - - bch2_print_string_as_lines(KERN_ERR, buf.buf); - printbuf_exit(&buf); - - bch2_fatal_error(c); - ret = -BCH_ERR_invalid_bkey; + if (unlikely(invalid)) { + ret = data_update_invalid_bkey(m, old, k, insert); goto out; } - if (trace_data_update_enabled()) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - prt_str(&buf, "\nk: "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - - trace_data_update(c, buf.buf); - printbuf_exit(&buf); - } - - printbuf_reset(&journal_msg); - prt_str(&journal_msg, bch2_data_update_type_strs[m->type]); - - ret = bch2_trans_log_msg(trans, &journal_msg) ?: + ret = bch2_trans_log_str(trans, bch2_data_update_type_strs[m->type]) ?: 
bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, bkey_start_pos(&insert->k)) ?: @@ -391,28 +441,39 @@ restart_drop_extra_replicas: k.k->p, insert->k.p) ?: bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?: bch2_trans_update(trans, &iter, insert, - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_trans_commit(trans, &op->res, + BTREE_UPDATE_internal_snapshot_node); + if (ret) + goto err; + + if (trace_data_update_enabled()) + trace_data_update2(m, old, k, insert); + + if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size > + bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size) + trace_io_move_created_rebalance2(m, old, k, insert); + + ret = bch2_trans_commit(trans, &op->res, NULL, BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc| m->data_opts.btree_insert_flags); - if (!ret) { - bch2_btree_iter_set_pos(trans, &iter, next_pos); + if (ret) + goto err; - this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size); - if (trace_io_move_finish_enabled()) - trace_io_move_finish2(m, &new->k_i, insert); - } + bch2_btree_iter_set_pos(trans, &iter, next_pos); + + this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size); + if (trace_io_move_finish_enabled()) + trace_io_move_finish2(m, &new->k_i, insert); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ret = 0; if (ret) break; next: - while (bkey_ge(iter.pos, bch2_keylist_front(keys)->k.p)) { - bch2_keylist_pop_front(keys); - if (bch2_keylist_empty(keys)) + while (bkey_ge(iter.pos, bch2_keylist_front(&op->insert_keys)->k.p)) { + bch2_keylist_pop_front(&op->insert_keys); + if (bch2_keylist_empty(&op->insert_keys)) goto out; } continue; @@ -430,10 +491,7 @@ nowork: goto next; } out: - printbuf_exit(&journal_msg); bch2_trans_iter_exit(trans, &iter); - bch2_bkey_buf_exit(&_insert, c); - bch2_bkey_buf_exit(&_new, c); BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); return ret; } @@ -474,8 +532,9 @@ void bch2_data_update_exit(struct data_update *update) bch2_bkey_buf_exit(&update->k, c); } -static int bch2_update_unwritten_extent(struct btree_trans *trans, - struct data_update *update) +static noinline_for_stack +int bch2_update_unwritten_extent(struct btree_trans *trans, + struct data_update *update) { struct bch_fs *c = update->op.c; struct bkey_i_extent *e; @@ -587,6 +646,10 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, prt_str_indented(out, "extra replicas:\t"); prt_u64(out, data_opts->extra_replicas); + prt_newline(out); + + prt_str_indented(out, "scrub:\t"); + prt_u64(out, data_opts->scrub); } void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) @@ -607,9 +670,17 @@ void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update prt_newline(out); printbuf_indent_add(out, 2); bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); - prt_printf(out, "read_done:\t%u\n", m->read_done); - bch2_write_op_to_text(out, &m->op); - printbuf_indent_sub(out, 2); + + if (!m->read_done) { + prt_printf(out, "read:\n"); + printbuf_indent_add(out, 2); + bch2_read_bio_to_text(out, &m->rbio); + } else { + prt_printf(out, "write:\n"); + printbuf_indent_add(out, 2); + bch2_write_op_to_text(out, &m->op); + } + printbuf_indent_sub(out, 4); } int bch2_extent_drop_ptrs(struct btree_trans *trans, @@ -655,18 +726,10 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } -int 
bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, - struct bch_io_opts *io_opts) +static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, + struct bch_io_opts *io_opts, + unsigned buf_bytes) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - /* write path might have to decompress data: */ - unsigned buf_bytes = 0; - bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry) - buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); - unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE); m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL); @@ -690,11 +753,26 @@ int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, return 0; } +int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, + struct bch_io_opts *io_opts) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + /* write path might have to decompress data: */ + unsigned buf_bytes = 0; + bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry) + buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); + + return __bch2_data_update_bios_init(m, c, io_opts, buf_bytes); +} + static int can_write_extent(struct bch_fs *c, struct data_update *m) { if ((m->op.flags & BCH_WRITE_alloc_nowait) && unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark))) - return -BCH_ERR_data_update_done_would_block; + return bch_err_throw(c, data_update_done_would_block); unsigned target = m->op.flags & BCH_WRITE_only_specified_devs ? m->op.target @@ -704,10 +782,13 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m) darray_for_each(m->op.devs_have, i) __clear_bit(*i, devs.d); - rcu_read_lock(); + guard(rcu)(); + unsigned nr_replicas = 0, i; for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { - struct bch_dev *ca = bch2_dev_rcu(c, i); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, i); + if (!ca) + continue; struct bch_dev_usage usage; bch2_dev_usage_read_fast(ca, &usage); @@ -719,12 +800,11 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m) if (nr_replicas >= m->op.nr_replicas) break; } - rcu_read_unlock(); if (!nr_replicas) - return -BCH_ERR_data_update_done_no_rw_devs; + return bch_err_throw(c, data_update_done_no_rw_devs); if (nr_replicas < m->op.nr_replicas) - return -BCH_ERR_insufficient_devices; + return bch_err_throw(c, insufficient_devices); return 0; } @@ -739,19 +819,21 @@ int bch2_data_update_init(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned reserve_sectors = k.k->size * data_opts.extra_replicas; int ret = 0; - /* - * fs is corrupt we have a key for a snapshot node that doesn't exist, - * and we have to check for this because we go rw before repairing the - * snapshots table - just skip it, we can move it later. 
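
One pattern worth calling out in can_write_extent() above, and repeated throughout the rest of this diff: bare "return -BCH_ERR_foo;" becomes "return bch_err_throw(c, foo);". The errcode returned is the same, but the macro also takes the filesystem pointer, which suggests per-instance accounting of where errors are generated. A toy analogue of that shape, with invented names rather than the actual bcachefs macro:

    enum my_err { MY_ERR_nocow_lock_blocked, MY_ERR_no_rw_devs, MY_ERR_NR };

    struct my_fs {
            unsigned long   errs_thrown[MY_ERR_NR];  /* per-fs counters, for debugging */
    };

    /* count the error against this fs instance, then yield it as a negative code */
    #define my_err_throw(_fs, _err) ({                                  \
            (_fs)->errs_thrown[MY_ERR_##_err]++;                        \
            -(512 + MY_ERR_##_err); /* keep clear of the errno range */ \
    })

    /* usage:  return my_err_throw(c, nocow_lock_blocked); */
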
- */ - if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot))) - return -BCH_ERR_data_update_done_no_snapshot; + if (k.k->p.snapshot) { + ret = bch2_check_key_has_snapshot(trans, iter, k); + if (bch2_err_matches(ret, BCH_ERR_recovery_will_run)) { + /* Can't repair yet, waiting on other recovery passes */ + return bch_err_throw(c, data_update_done_no_snapshot); + } + if (ret < 0) + return ret; + if (ret) /* key was deleted */ + return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + bch_err_throw(c, data_update_done_no_snapshot); + ret = 0; + } bch2_bkey_buf_init(&m->k); bch2_bkey_buf_reassemble(&m->k, c, k); @@ -779,10 +861,17 @@ int bch2_data_update_init(struct btree_trans *trans, unsigned durability_have = 0, durability_removing = 0; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned reserve_sectors = k.k->size * data_opts.extra_replicas; + unsigned buf_bytes = 0; + bool unwritten = false; + unsigned ptr_bit = 1; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { if (!p.ptr.cached) { - rcu_read_lock(); + guard(rcu)(); if (ptr_bit & m->data_opts.rewrite_ptrs) { if (crc_is_compressed(p.crc)) reserve_sectors += k.k->size; @@ -793,7 +882,6 @@ int bch2_data_update_init(struct btree_trans *trans, bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); durability_have += bch2_extent_ptr_durability(c, &p); } - rcu_read_unlock(); } /* @@ -809,6 +897,9 @@ int bch2_data_update_init(struct btree_trans *trans, if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) m->op.incompressible = true; + buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); + unwritten |= p.ptr.unwritten; + ptr_bit <<= 1; } @@ -847,7 +938,7 @@ int bch2_data_update_init(struct btree_trans *trans, if (iter) ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts); if (!ret) - ret = -BCH_ERR_data_update_done_no_writes_needed; + ret = bch_err_throw(c, data_update_done_no_writes_needed); goto out_bkey_buf_exit; } @@ -878,23 +969,25 @@ int bch2_data_update_init(struct btree_trans *trans, } if (!bkey_get_dev_refs(c, k)) { - ret = -BCH_ERR_data_update_done_no_dev_refs; + ret = bch_err_throw(c, data_update_done_no_dev_refs); goto out_put_disk_res; } if (c->opts.nocow_enabled && - !bkey_nocow_lock(c, ctxt, k)) { - ret = -BCH_ERR_nocow_lock_blocked; + !bkey_nocow_lock(c, ctxt, ptrs)) { + ret = bch_err_throw(c, nocow_lock_blocked); goto out_put_dev_refs; } - if (bkey_extent_is_unwritten(k)) { + if (unwritten) { ret = bch2_update_unwritten_extent(trans, m) ?: - -BCH_ERR_data_update_done_unwritten; + bch_err_throw(c, data_update_done_unwritten); goto out_nocow_unlock; } - ret = bch2_data_update_bios_init(m, c, io_opts); + bch2_trans_unlock(trans); + + ret = __bch2_data_update_bios_init(m, c, io_opts, buf_bytes); if (ret) goto out_nocow_unlock; diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index ed05125867da..5e14d13568de 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -50,6 +50,21 @@ struct data_update { struct bio_vec *bvecs; }; +struct promote_op { + struct rcu_head rcu; + u64 start_time; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + unsigned list_idx; +#endif + + struct rhash_head hash; + struct bpos pos; + + struct work_struct work; + struct data_update write; + struct bio_vec bi_inline_vecs[]; /* must be last */ +}; + void bch2_data_update_to_text(struct printbuf *, struct data_update *); void 
bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *); diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 5a8bc7013512..901f643ead83 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -8,6 +8,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "async_objs.h" #include "bkey_methods.h" #include "btree_cache.h" #include "btree_io.h" @@ -16,6 +17,7 @@ #include "btree_update.h" #include "btree_update_interior.h" #include "buckets.h" +#include "data_update.h" #include "debug.h" #include "error.h" #include "extents.h" @@ -40,9 +42,10 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, struct btree_node *n_sorted = c->verify_data->data; struct bset *sorted, *inmemory = &b->data->keys; struct bio *bio; - bool failed = false, saw_error = false; + bool failed = false; - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, + BCH_DEV_READ_REF_btree_verify_replicas); if (!ca) return false; @@ -57,12 +60,13 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, submit_bio_wait(bio); bio_put(bio); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_btree_verify_replicas); memcpy(n_ondisk, n_sorted, btree_buf_bytes(b)); v->written = 0; - if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error) + if (bch2_btree_node_read_done(c, ca, v, NULL, NULL)) return false; n_sorted = c->verify_data->data; @@ -196,7 +200,8 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, return; } - ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, + BCH_DEV_READ_REF_btree_node_ondisk_to_text); if (!ca) { prt_printf(out, "error getting device to read from: not online\n"); return; @@ -297,28 +302,13 @@ out: if (bio) bio_put(bio); kvfree(n_ondisk); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_btree_node_ondisk_to_text); } #ifdef CONFIG_DEBUG_FS -/* XXX: bch_fs refcounting */ - -struct dump_iter { - struct bch_fs *c; - enum btree_id id; - struct bpos from; - struct bpos prev_node; - u64 iter; - - struct printbuf buf; - - char __user *ubuf; /* destination user buffer */ - size_t size; /* size of requested read */ - ssize_t ret; /* bytes read so far */ -}; - -static ssize_t flush_buf(struct dump_iter *i) +ssize_t bch2_debugfs_flush_buf(struct dump_iter *i) { if (i->buf.pos) { size_t bytes = min_t(size_t, i->buf.pos, i->size); @@ -330,6 +320,11 @@ static ssize_t flush_buf(struct dump_iter *i) i->buf.pos -= copied; memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos); + if (i->buf.last_newline >= copied) + i->buf.last_newline -= copied; + if (i->buf.last_field >= copied) + i->buf.last_field -= copied; + if (copied != bytes) return -EFAULT; } @@ -356,7 +351,7 @@ static int bch2_dump_open(struct inode *inode, struct file *file) return 0; } -static int bch2_dump_release(struct inode *inode, struct file *file) +int bch2_dump_release(struct inode *inode, struct file *file) { struct dump_iter *i = file->private_data; @@ -374,7 +369,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, i->size = size; i->ret = 0; - return flush_buf(i) ?: + return bch2_debugfs_flush_buf(i) ?: bch2_trans_run(i->c, for_each_btree_key(trans, iter, i->id, i->from, BTREE_ITER_prefetch| @@ -383,7 +378,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, prt_newline(&i->buf); 
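
The io_ref handling above changes shape as well: percpu_ref_put() on ca->io_ref becomes enumerated_ref_put() with a BCH_DEV_READ_REF_* constant naming the holder, so a reference that is never released can be traced back to a specific call site. A minimal userspace sketch of the idea, with invented names rather than the bcachefs API:

    #include <stdatomic.h>
    #include <stdio.h>

    enum dev_read_ref { REF_btree_verify, REF_ondisk_to_text, REF_NR };

    struct enum_ref { atomic_int held[REF_NR]; };

    static void enum_ref_get(struct enum_ref *r, enum dev_read_ref who)
    {
            atomic_fetch_add(&r->held[who], 1);
    }

    static void enum_ref_put(struct enum_ref *r, enum dev_read_ref who)
    {
            atomic_fetch_sub(&r->held[who], 1);
    }

    /* on a hung shutdown, report which holders never dropped their ref */
    static void enum_ref_dump(struct enum_ref *r)
    {
            for (int i = 0; i < REF_NR; i++)
                    if (atomic_load(&r->held[i]))
                            printf("ref %d still held %d time(s)\n", i, atomic_load(&r->held[i]));
    }
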
bch2_trans_unlock(trans); i->from = bpos_successor(iter.pos); - flush_buf(i); + bch2_debugfs_flush_buf(i); }))) ?: i->ret; } @@ -404,7 +399,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, i->size = size; i->ret = 0; - ssize_t ret = flush_buf(i); + ssize_t ret = bch2_debugfs_flush_buf(i); if (ret) return ret; @@ -418,7 +413,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, ? bpos_successor(b->key.k.p) : b->key.k.p; - drop_locks_do(trans, flush_buf(i)); + drop_locks_do(trans, bch2_debugfs_flush_buf(i)); }))) ?: i->ret; } @@ -438,7 +433,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, i->size = size; i->ret = 0; - return flush_buf(i) ?: + return bch2_debugfs_flush_buf(i) ?: bch2_trans_run(i->c, for_each_btree_key(trans, iter, i->id, i->from, BTREE_ITER_prefetch| @@ -456,7 +451,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, bch2_bfloat_to_text(&i->buf, l->b, _k); bch2_trans_unlock(trans); i->from = bpos_successor(iter.pos); - flush_buf(i); + bch2_debugfs_flush_buf(i); }))) ?: i->ret; } @@ -497,6 +492,8 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * prt_printf(out, "journal pin %px:\t%llu\n", &b->writes[1].journal, b->writes[1].journal.seq); + prt_printf(out, "ob:\t%u\n", b->ob.nr); + printbuf_indent_sub(out, 2); } @@ -513,34 +510,34 @@ static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, i->ret = 0; do { - struct bucket_table *tbl; - struct rhash_head *pos; - struct btree *b; - - ret = flush_buf(i); + ret = bch2_debugfs_flush_buf(i); if (ret) return ret; - rcu_read_lock(); i->buf.atomic++; - tbl = rht_dereference_rcu(c->btree_cache.table.tbl, - &c->btree_cache.table); - if (i->iter < tbl->size) { - rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) - bch2_cached_btree_node_to_text(&i->buf, c, b); - i->iter++; - } else { - done = true; + scoped_guard(rcu) { + struct bucket_table *tbl = + rht_dereference_rcu(c->btree_cache.table.tbl, + &c->btree_cache.table); + if (i->iter < tbl->size) { + struct rhash_head *pos; + struct btree *b; + + rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) + bch2_cached_btree_node_to_text(&i->buf, c, b); + i->iter++; + } else { + done = true; + } } --i->buf.atomic; - rcu_read_unlock(); } while (!done); if (i->buf.allocation_failure) ret = -ENOMEM; if (!ret) - ret = flush_buf(i); + ret = bch2_debugfs_flush_buf(i); return ret ?: i->ret; } @@ -614,7 +611,7 @@ restart: closure_put(&trans->ref); - ret = flush_buf(i); + ret = bch2_debugfs_flush_buf(i); if (ret) goto unlocked; @@ -627,7 +624,7 @@ unlocked: ret = -ENOMEM; if (!ret) - ret = flush_buf(i); + ret = bch2_debugfs_flush_buf(i); return ret ?: i->ret; } @@ -652,7 +649,7 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, i->ret = 0; while (1) { - err = flush_buf(i); + err = bch2_debugfs_flush_buf(i); if (err) return err; @@ -695,7 +692,7 @@ static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf, i->iter++; } - err = flush_buf(i); + err = bch2_debugfs_flush_buf(i); if (err) return err; @@ -753,7 +750,7 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, while (1) { struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter]; - err = flush_buf(i); + err = bch2_debugfs_flush_buf(i); if (err) return err; @@ -770,6 +767,12 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, mutex_lock(&s->lock); 
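
The RCU change in the cached-btree-nodes reader above, and in many hunks below, follows one pattern: open-coded rcu_read_lock()/rcu_read_unlock() pairs become scope-based guards, so the unlock can no longer be skipped by an early return, break or continue. A minimal sketch of the pattern, assuming the guard()/scoped_guard() helpers from <linux/cleanup.h> (the helper name here is invented):

    static int dev_idx_of(struct bch_fs *c, unsigned dev)
    {
            guard(rcu)();   /* rcu_read_unlock() runs automatically at every return */

            struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
            return ca ? (int) ca->dev_idx : -1;
    }

    /*
     * scoped_guard(rcu) { ... } is the block-scoped form, used when only part
     * of a function needs RCU protection, as in the debugfs reader above.
     */
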
prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem); +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + printbuf_indent_add(&i->buf, 2); + bch2_trans_kmalloc_trace_to_text(&i->buf, &s->trans_kmalloc_trace); + printbuf_indent_sub(&i->buf, 2); +#endif + prt_printf(&i->buf, "Transaction duration:\n"); printbuf_indent_add(&i->buf, 2); @@ -868,7 +871,7 @@ static ssize_t bch2_simple_print(struct file *file, char __user *buf, ret = -ENOMEM; if (!ret) - ret = flush_buf(i); + ret = bch2_debugfs_flush_buf(i); return ret ?: i->ret; } @@ -927,7 +930,11 @@ void bch2_fs_debug_init(struct bch_fs *c) if (IS_ERR_OR_NULL(bch_debug)) return; - snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); + if (c->sb.multi_device) + snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); + else + strscpy(name, c->name, sizeof(name)); + c->fs_debug_dir = debugfs_create_dir(name, bch_debug); if (IS_ERR_OR_NULL(c->fs_debug_dir)) return; @@ -953,6 +960,8 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("write_points", 0400, c->fs_debug_dir, c->btree_debug, &write_points_ops); + bch2_fs_async_obj_debugfs_init(c); + c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); if (IS_ERR_OR_NULL(c->btree_debug_dir)) return; diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h index 2c37143b5fd1..d88b1194b8ac 100644 --- a/fs/bcachefs/debug.h +++ b/fs/bcachefs/debug.h @@ -14,11 +14,29 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *, static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) { - if (bch2_verify_btree_ondisk) + if (static_branch_unlikely(&bch2_verify_btree_ondisk)) __bch2_btree_verify(c, b); } #ifdef CONFIG_DEBUG_FS +struct dump_iter { + struct bch_fs *c; + struct async_obj_list *list; + enum btree_id id; + struct bpos from; + struct bpos prev_node; + u64 iter; + + struct printbuf buf; + + char __user *ubuf; /* destination user buffer */ + size_t size; /* size of requested read */ + ssize_t ret; /* bytes read so far */ +}; + +ssize_t bch2_debugfs_flush_buf(struct dump_iter *); +int bch2_dump_release(struct inode *, struct file *); + void bch2_fs_debug_exit(struct bch_fs *); void bch2_fs_debug_init(struct bch_fs *); #else diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index a51195088227..300f7cc8abdf 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -212,82 +212,83 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); struct qstr d_name = bch2_dirent_get_name(d); - prt_printf(out, "%.*s -> ", d_name.len, d_name.name); + prt_printf(out, "%.*s", d_name.len, d_name.name); + + if (d.v->d_casefold) { + struct qstr d_name = bch2_dirent_get_lookup_name(d); + prt_printf(out, " (casefold %.*s)", d_name.len, d_name.name); + } + + prt_str(out, " ->"); if (d.v->d_type != DT_SUBVOL) - prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum)); + prt_printf(out, " %llu", le64_to_cpu(d.v->d_inum)); else - prt_printf(out, "%u -> %u", + prt_printf(out, " %u -> %u", le32_to_cpu(d.v->d_parent_subvol), le32_to_cpu(d.v->d_child_subvol)); prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type)); } -static struct bkey_i_dirent *dirent_alloc_key(struct btree_trans *trans, - subvol_inum dir, - u8 type, - int name_len, int cf_name_len, - u64 dst) +int bch2_dirent_init_name(struct bkey_i_dirent *dirent, + const struct bch_hash_info *hash_info, + const struct qstr *name, + const struct qstr *cf_name) { - struct bkey_i_dirent *dirent; - unsigned u64s = BKEY_U64s + 
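
bch2_btree_verify() in debug.h above now tests its knob with static_branch_unlikely(), so the debug option appears to have moved from a plain bool to a static key and the disabled case costs only a patched-out branch. A generic sketch of that pattern, assuming <linux/jump_label.h>, with invented names:

    DEFINE_STATIC_KEY_FALSE(my_verify_enabled);

    static void expensive_verify(struct btree *b);

    static inline void maybe_verify(struct btree *b)
    {
            if (static_branch_unlikely(&my_verify_enabled))
                    expensive_verify(b);    /* branch is patched out while the key is false */
    }

    /* flipped at runtime from a debug knob:  static_branch_enable(&my_verify_enabled); */
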
dirent_val_u64s(name_len, cf_name_len); + EBUG_ON(hash_info->cf_encoding == NULL && cf_name); + int cf_len = 0; - BUG_ON(u64s > U8_MAX); + if (name->len > BCH_NAME_MAX) + return -ENAMETOOLONG; - dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); - if (IS_ERR(dirent)) - return dirent; + dirent->v.d_casefold = hash_info->cf_encoding != NULL; - bkey_dirent_init(&dirent->k_i); - dirent->k.u64s = u64s; - - if (type != DT_SUBVOL) { - dirent->v.d_inum = cpu_to_le64(dst); + if (!dirent->v.d_casefold) { + memcpy(&dirent->v.d_name[0], name->name, name->len); + memset(&dirent->v.d_name[name->len], 0, + bkey_val_bytes(&dirent->k) - + offsetof(struct bch_dirent, d_name) - + name->len); } else { - dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol); - dirent->v.d_child_subvol = cpu_to_le32(dst); - } +#ifdef CONFIG_UNICODE + memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len); - dirent->v.d_type = type; - dirent->v.d_unused = 0; - dirent->v.d_casefold = cf_name_len ? 1 : 0; + char *cf_out = &dirent->v.d_cf_name_block.d_names[name->len]; - return dirent; -} + if (cf_name) { + cf_len = cf_name->len; -static void dirent_init_regular_name(struct bkey_i_dirent *dirent, - const struct qstr *name) -{ - EBUG_ON(dirent->v.d_casefold); + memcpy(cf_out, cf_name->name, cf_name->len); + } else { + cf_len = utf8_casefold(hash_info->cf_encoding, name, + cf_out, + bkey_val_end(bkey_i_to_s(&dirent->k_i)) - (void *) cf_out); + if (cf_len <= 0) + return cf_len; + } - memcpy(&dirent->v.d_name[0], name->name, name->len); - memset(&dirent->v.d_name[name->len], 0, - bkey_val_bytes(&dirent->k) - - offsetof(struct bch_dirent, d_name) - - name->len); -} + memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_len], 0, + bkey_val_bytes(&dirent->k) - + offsetof(struct bch_dirent, d_cf_name_block.d_names) - + name->len + cf_len); -static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent, - const struct qstr *name, - const struct qstr *cf_name) -{ - EBUG_ON(!dirent->v.d_casefold); - EBUG_ON(!cf_name->len); - - dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len); - dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_name->len); - memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len); - memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len); - memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0, - bkey_val_bytes(&dirent->k) - - offsetof(struct bch_dirent, d_cf_name_block.d_names) - - name->len + cf_name->len); - - EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_name->len); + dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len); + dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_len); + + EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_len); +#else + return -EOPNOTSUPP; +#endif + } + + unsigned u64s = dirent_val_u64s(name->len, cf_len); + BUG_ON(u64s > bkey_val_u64s(&dirent->k)); + set_bkey_val_u64s(&dirent->k, u64s); + return 0; } -static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, +struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *trans, const struct bch_hash_info *hash_info, subvol_inum dir, u8 type, @@ -295,31 +296,28 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, const struct qstr *cf_name, u64 dst) { - struct bkey_i_dirent *dirent; - struct qstr _cf_name; - - if (name->len > BCH_NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); + struct bkey_i_dirent *dirent = bch2_trans_kmalloc(trans, 
BKEY_U64s_MAX * sizeof(u64)); + if (IS_ERR(dirent)) + return dirent; - if (hash_info->cf_encoding && !cf_name) { - int ret = bch2_casefold(trans, hash_info, name, &_cf_name); - if (ret) - return ERR_PTR(ret); + bkey_dirent_init(&dirent->k_i); + dirent->k.u64s = BKEY_U64s_MAX; - cf_name = &_cf_name; + if (type != DT_SUBVOL) { + dirent->v.d_inum = cpu_to_le64(dst); + } else { + dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol); + dirent->v.d_child_subvol = cpu_to_le32(dst); } - dirent = dirent_alloc_key(trans, dir, type, name->len, cf_name ? cf_name->len : 0, dst); - if (IS_ERR(dirent)) - return dirent; + dirent->v.d_type = type; + dirent->v.d_unused = 0; - if (cf_name) - dirent_init_casefolded_name(dirent, name, cf_name); - else - dirent_init_regular_name(dirent, name); + int ret = bch2_dirent_init_name(dirent, hash_info, name, cf_name); + if (ret) + return ERR_PTR(ret); EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len); - return dirent; } @@ -334,7 +332,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum); + dirent = bch2_dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; @@ -358,7 +356,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum); + dirent = bch2_dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; @@ -395,8 +393,8 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, } int bch2_dirent_rename(struct btree_trans *trans, - subvol_inum src_dir, struct bch_hash_info *src_hash, u64 *src_dir_i_size, - subvol_inum dst_dir, struct bch_hash_info *dst_hash, u64 *dst_dir_i_size, + subvol_inum src_dir, struct bch_hash_info *src_hash, + subvol_inum dst_dir, struct bch_hash_info *dst_hash, const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, enum bch_rename_mode mode) @@ -463,8 +461,8 @@ int bch2_dirent_rename(struct btree_trans *trans, *src_offset = dst_iter.pos.offset; /* Create new dst key: */ - new_dst = dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name, - dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0); + new_dst = bch2_dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name, + dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0); ret = PTR_ERR_OR_ZERO(new_dst); if (ret) goto out; @@ -474,8 +472,8 @@ int bch2_dirent_rename(struct btree_trans *trans, /* Create new src key: */ if (mode == BCH_RENAME_EXCHANGE) { - new_src = dirent_create_key(trans, src_hash, src_dir, 0, src_name, - src_hash->cf_encoding ? &src_name_lookup : NULL, 0); + new_src = bch2_dirent_create_key(trans, src_hash, src_dir, 0, src_name, + src_hash->cf_encoding ? 
&src_name_lookup : NULL, 0); ret = PTR_ERR_OR_ZERO(new_src); if (ret) goto out; @@ -535,14 +533,6 @@ int bch2_dirent_rename(struct btree_trans *trans, new_src->v.d_type == DT_SUBVOL) new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol); - if (old_dst.k) - *dst_dir_i_size -= bkey_bytes(old_dst.k); - *src_dir_i_size -= bkey_bytes(old_src.k); - - if (mode == BCH_RENAME_EXCHANGE) - *src_dir_i_size += bkey_bytes(&new_src->k); - *dst_dir_i_size += bkey_bytes(&new_dst->k); - ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); if (ret) goto out; @@ -649,7 +639,7 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol) continue; - ret = -BCH_ERR_ENOTEMPTY_dir_not_empty; + ret = bch_err_throw(trans->c, ENOTEMPTY_dir_not_empty); break; } bch2_trans_iter_exit(trans, &iter); @@ -685,7 +675,9 @@ static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subv return !ret; } -int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) +int bch2_readdir(struct bch_fs *c, subvol_inum inum, + struct bch_hash_info *hash_info, + struct dir_context *ctx) { struct bkey_buf sk; bch2_bkey_buf_init(&sk); @@ -703,7 +695,11 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k); subvol_inum target; - int ret2 = bch2_dirent_read_target(trans, inum, dirent, &target); + + bool need_second_pass = false; + int ret2 = bch2_str_hash_check_key(trans, NULL, &bch2_dirent_hash_desc, + hash_info, &iter, k, &need_second_pass) ?: + bch2_dirent_read_target(trans, inum, dirent, &target); if (ret2 > 0) continue; @@ -733,7 +729,7 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, ret = bch2_inode_unpack(k, inode); goto found; } - ret = -BCH_ERR_ENOENT_inode; + ret = bch_err_throw(trans->c, ENOENT_inode); found: bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); bch2_trans_iter_exit(trans, &iter); diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index d3e7ae669575..70fb0b581221 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -38,7 +38,7 @@ static inline int bch2_maybe_casefold(struct btree_trans *trans, } } -struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d); +struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent); static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len) { @@ -59,6 +59,14 @@ static inline void dirent_copy_target(struct bkey_i_dirent *dst, dst->v.d_type = src.v->d_type; } +int bch2_dirent_init_name(struct bkey_i_dirent *, + const struct bch_hash_info *, + const struct qstr *, + const struct qstr *); +struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *, + const struct bch_hash_info *, subvol_inum, u8, + const struct qstr *, const struct qstr *, u64); + int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, @@ -80,8 +88,8 @@ enum bch_rename_mode { }; int bch2_dirent_rename(struct btree_trans *, - subvol_inum, struct bch_hash_info *, u64 *, - subvol_inum, struct bch_hash_info *, u64 *, + subvol_inum, struct bch_hash_info *, + subvol_inum, struct bch_hash_info *, const struct qstr *, subvol_inum *, u64 *, const struct qstr *, subvol_inum *, u64 *, enum bch_rename_mode); @@ -95,7 +103,7 @@ u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, int 
bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32); int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); -int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); +int bch2_readdir(struct bch_fs *, subvol_inum, struct bch_hash_info *, struct dir_context *); int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos); diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 1f0422bfae35..3d59a57a5256 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -68,23 +68,31 @@ static const char * const disk_accounting_type_strs[] = { NULL }; -static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos, - s64 *d, unsigned nr) +static inline void __accounting_key_init(struct bkey_i *k, struct bpos pos, + s64 *d, unsigned nr) { struct bkey_i_accounting *acc = bkey_accounting_init(k); - acc->k.p = disk_accounting_pos_to_bpos(pos); + acc->k.p = pos; set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr); memcpy_u64s_small(acc->v.d, d, nr); } +static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos, + s64 *d, unsigned nr) +{ + return __accounting_key_init(k, disk_accounting_pos_to_bpos(pos), d, nr); +} + static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos); int bch2_disk_accounting_mod(struct btree_trans *trans, struct disk_accounting_pos *k, s64 *d, unsigned nr, bool gc) { + BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS); + /* Normalize: */ switch (k->type) { case BCH_DISK_ACCOUNTING_replicas: @@ -92,21 +100,49 @@ int bch2_disk_accounting_mod(struct btree_trans *trans, break; } - BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS); + struct bpos pos = disk_accounting_pos_to_bpos(k); + + if (likely(!gc)) { + struct bkey_i_accounting *a; +#if 0 + for (a = btree_trans_subbuf_base(trans, &trans->accounting); + a != btree_trans_subbuf_top(trans, &trans->accounting); + a = (void *) bkey_next(&a->k_i)) + if (bpos_eq(a->k.p, pos)) { + BUG_ON(nr != bch2_accounting_counters(&a->k)); + acc_u64s(a->v.d, d, nr); + + if (bch2_accounting_key_is_zero(accounting_i_to_s_c(a))) { + unsigned offset = (u64 *) a - + (u64 *) btree_trans_subbuf_base(trans, &trans->accounting); + + trans->accounting.u64s -= a->k.u64s; + memmove_u64s_down(a, + bkey_next(&a->k_i), + trans->accounting.u64s - offset); + } + return 0; + } +#endif + unsigned u64s = sizeof(*a) / sizeof(u64) + nr; + a = bch2_trans_subbuf_alloc(trans, &trans->accounting, u64s); + int ret = PTR_ERR_OR_ZERO(a); + if (ret) + return ret; - struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; + __accounting_key_init(&a->k_i, pos, d, nr); + return 0; + } else { + struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; - accounting_key_init(&k_i.k, k, d, nr); + __accounting_key_init(&k_i.k, pos, d, nr); - if (unlikely(gc)) { int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); if (ret == -BCH_ERR_btree_insert_need_mark_replicas) ret = drop_locks_do(trans, bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?: bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); return ret; - } else { - return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k); } } @@ -287,7 +323,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p) { - struct bch_replicas_padded r; + union bch_replicas_padded r; return 
accounting_to_replicas(&r.e, p) ? bch2_mark_replicas(c, &r.e) : 0; @@ -299,14 +335,13 @@ static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p) */ int bch2_accounting_update_sb(struct btree_trans *trans) { - for (struct jset_entry *i = trans->journal_entries; - i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); - i = vstruct_next(i)) - if (jset_entry_is_key(i) && i->start->k.type == KEY_TYPE_accounting) { - int ret = bch2_accounting_update_sb_one(trans->c, i->start->k.p); - if (ret) - return ret; - } + for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); + i != btree_trans_subbuf_top(trans, &trans->accounting); + i = bkey_next(i)) { + int ret = bch2_accounting_update_sb_one(trans->c, i->k.p); + if (ret) + return ret; + } return 0; } @@ -355,18 +390,18 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun err: free_percpu(n.v[1]); free_percpu(n.v[0]); - return -BCH_ERR_ENOMEM_disk_accounting; + return bch_err_throw(c, ENOMEM_disk_accounting); } int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, enum bch_accounting_mode mode) { - struct bch_replicas_padded r; + union bch_replicas_padded r; if (mode != BCH_ACCOUNTING_read && accounting_to_replicas(&r.e, a.k->p) && !bch2_replicas_marked_locked(c, &r.e)) - return -BCH_ERR_btree_insert_need_mark_replicas; + return bch_err_throw(c, btree_insert_need_mark_replicas); percpu_up_read(&c->mark_lock); percpu_down_write(&c->mark_lock); @@ -379,12 +414,12 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a, enum bch_accounting_mode mode) { - struct bch_replicas_padded r; + union bch_replicas_padded r; if (mode != BCH_ACCOUNTING_read && accounting_to_replicas(&r.e, a.k->p) && !bch2_replicas_marked_locked(c, &r.e)) - return -BCH_ERR_btree_insert_need_mark_replicas; + return bch_err_throw(c, btree_insert_need_mark_replicas); return __bch2_accounting_mem_insert(c, a); } @@ -438,10 +473,12 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) percpu_down_read(&c->mark_lock); darray_for_each(acc->k, i) { - struct { + union { + u8 bytes[struct_size_t(struct bch_replicas_usage, r.devs, + BCH_BKEY_PTRS_MAX)]; struct bch_replicas_usage r; - u8 pad[BCH_BKEY_PTRS_MAX]; } u; + u.r.r.nr_devs = BCH_BKEY_PTRS_MAX; if (!accounting_to_replicas(&u.r.r, i->pos)) continue; @@ -522,7 +559,7 @@ int bch2_gc_accounting_start(struct bch_fs *c) sizeof(u64), GFP_KERNEL); if (!e->v[1]) { bch2_accounting_free_counters(acc, true); - ret = -BCH_ERR_ENOMEM_disk_accounting; + ret = bch_err_throw(c, ENOMEM_disk_accounting); break; } } @@ -570,11 +607,11 @@ int bch2_gc_accounting_done(struct bch_fs *c) prt_str(&buf, "accounting mismatch for "); bch2_accounting_key_to_text(&buf, &acc_k); - prt_str(&buf, ": got"); + prt_str(&buf, ":\n got"); for (unsigned j = 0; j < nr; j++) prt_printf(&buf, " %llu", dst_v[j]); - prt_str(&buf, " should be"); + prt_str(&buf, "\nshould be"); for (unsigned j = 0; j < nr; j++) prt_printf(&buf, " %llu", src_v[j]); @@ -631,17 +668,17 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) } static int bch2_disk_accounting_validate_late(struct btree_trans *trans, - struct disk_accounting_pos acc, + struct disk_accounting_pos *acc, u64 *v, unsigned nr) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; int ret = 0, invalid_dev = -1; - switch (acc.type) { + switch (acc->type) 
{ case BCH_DISK_ACCOUNTING_replicas: { - struct bch_replicas_padded r; - __accounting_to_replicas(&r.e, &acc); + union bch_replicas_padded r; + __accounting_to_replicas(&r.e, acc); for (unsigned i = 0; i < r.e.nr_devs; i++) if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && @@ -660,7 +697,7 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, trans, accounting_replicas_not_marked, "accounting not marked in superblock replicas\n%s", (printbuf_reset(&buf), - bch2_accounting_key_to_text(&buf, &acc), + bch2_accounting_key_to_text(&buf, acc), buf.buf))) { /* * We're not RW yet and still single threaded, dropping @@ -676,8 +713,8 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, } case BCH_DISK_ACCOUNTING_dev_data_type: - if (!bch2_dev_exists(c, acc.dev_data_type.dev)) { - invalid_dev = acc.dev_data_type.dev; + if (!bch2_dev_exists(c, acc->dev_data_type.dev)) { + invalid_dev = acc->dev_data_type.dev; goto invalid_device; } break; @@ -691,16 +728,16 @@ invalid_device: "accounting entry points to invalid device %i\n%s", invalid_dev, (printbuf_reset(&buf), - bch2_accounting_key_to_text(&buf, &acc), + bch2_accounting_key_to_text(&buf, acc), buf.buf))) { for (unsigned i = 0; i < nr; i++) v[i] = -v[i]; ret = commit_do(trans, NULL, NULL, 0, - bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?: + bch2_disk_accounting_mod(trans, acc, v, nr, false)) ?: -BCH_ERR_remove_disk_accounting_entry; } else { - ret = -BCH_ERR_remove_disk_accounting_entry; + ret = bch_err_throw(c, remove_disk_accounting_entry); } goto fsck_err; } @@ -748,7 +785,7 @@ int bch2_accounting_read(struct bch_fs *c) if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) break; - if (!bch2_accounting_is_mem(acc_k)) { + if (!bch2_accounting_is_mem(&acc_k)) { struct disk_accounting_pos next; memset(&next, 0, sizeof(next)); next.type = acc_k.type + 1; @@ -770,7 +807,7 @@ int bch2_accounting_read(struct bch_fs *c) struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, i->k->k.p); - if (!bch2_accounting_is_mem(acc_k)) + if (!bch2_accounting_is_mem(&acc_k)) continue; struct bkey_s_c k = bkey_i_to_s_c(i->k); @@ -826,7 +863,7 @@ int bch2_accounting_read(struct bch_fs *c) */ ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) ? 
-BCH_ERR_remove_disk_accounting_entry - : bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters); + : bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters); if (ret == -BCH_ERR_remove_disk_accounting_entry) { free_percpu(i->v[0]); @@ -860,8 +897,8 @@ int bch2_accounting_read(struct bch_fs *c) case BCH_DISK_ACCOUNTING_replicas: fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]); break; - case BCH_DISK_ACCOUNTING_dev_data_type: - rcu_read_lock(); + case BCH_DISK_ACCOUNTING_dev_data_type: { + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev); if (ca) { struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type]; @@ -873,9 +910,9 @@ int bch2_accounting_read(struct bch_fs *c) k.dev_data_type.data_type == BCH_DATA_journal) usage->hidden += v[0] * ca->mi.bucket_size; } - rcu_read_unlock(); break; } + } } preempt_enable(); fsck_err: @@ -939,7 +976,7 @@ void bch2_verify_accounting_clean(struct bch_fs *c) if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) break; - if (!bch2_accounting_is_mem(acc_k)) { + if (!bch2_accounting_is_mem(&acc_k)) { struct disk_accounting_pos next; memset(&next, 0, sizeof(next)); next.type = acc_k.type + 1; @@ -969,19 +1006,18 @@ void bch2_verify_accounting_clean(struct bch_fs *c) case BCH_DISK_ACCOUNTING_replicas: fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]); break; - case BCH_DISK_ACCOUNTING_dev_data_type: { - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); - if (!ca) { - rcu_read_unlock(); - continue; + case BCH_DISK_ACCOUNTING_dev_data_type: + { + guard(rcu)(); /* scoped guard is a loop, and doesn't play nicely with continue */ + struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); + if (!ca) + continue; + + v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets); + v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors); + v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented); } - v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets); - v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors); - v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented); - rcu_read_unlock(); - if (memcmp(a.v->d, v, 3 * sizeof(u64))) { struct printbuf buf = PRINTBUF; @@ -995,7 +1031,6 @@ void bch2_verify_accounting_clean(struct bch_fs *c) mismatch = true; } } - } 0; }))); diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index d557b99b3c0a..d61abebf3e0b 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -139,10 +139,10 @@ int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); void bch2_accounting_mem_gc(struct bch_fs *); -static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc) +static inline bool bch2_accounting_is_mem(struct disk_accounting_pos *acc) { - return acc.type < BCH_DISK_ACCOUNTING_TYPE_NR && - acc.type != BCH_DISK_ACCOUNTING_inum; + return acc->type < BCH_DISK_ACCOUNTING_TYPE_NR && + acc->type != BCH_DISK_ACCOUNTING_inum; } /* @@ -163,7 +163,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, if (gc && !acc->gc_running) return 0; - if (!bch2_accounting_is_mem(acc_k)) + if (!bch2_accounting_is_mem(&acc_k)) return 0; if (mode == 
BCH_ACCOUNTING_normal) { @@ -174,17 +174,17 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, case BCH_DISK_ACCOUNTING_replicas: fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]); break; - case BCH_DISK_ACCOUNTING_dev_data_type: - rcu_read_lock(); + case BCH_DISK_ACCOUNTING_dev_data_type: { + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); if (ca) { this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]); } - rcu_read_unlock(); break; } + } } unsigned idx; @@ -259,8 +259,8 @@ static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans, struct bkey_i_accounting *a, unsigned commit_flags) { - a->k.bversion = journal_pos_to_bversion(&trans->journal_res, - (u64 *) a - (u64 *) trans->journal_entries); + u64 *base = (u64 *) btree_trans_subbuf_base(trans, &trans->accounting); + a->k.bversion = journal_pos_to_bversion(&trans->journal_res, (u64 *) a - base); EBUG_ON(bversion_zero(a->k.bversion)); diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 2ca3cbf12b71..cde842ac1886 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -86,35 +86,6 @@ err: return ret; } -void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) -{ - out->atomic++; - rcu_read_lock(); - - struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - if (!g) - goto out; - - for (unsigned i = 0; i < g->nr; i++) { - if (i) - prt_printf(out, " "); - - if (g->entries[i].deleted) { - prt_printf(out, "[deleted]"); - continue; - } - - prt_printf(out, "[parent %d devs", g->entries[i].parent); - for_each_member_device_rcu(c, ca, &g->entries[i].devs) - prt_printf(out, " %s", ca->name); - prt_printf(out, "]"); - } - -out: - rcu_read_unlock(); - out->atomic--; -} - static void bch2_sb_disk_groups_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) @@ -159,7 +130,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL); if (!cpu_g) - return -BCH_ERR_ENOMEM_disk_groups_to_cpu; + return bch_err_throw(c, ENOMEM_disk_groups_to_cpu); cpu_g->nr = nr_groups; @@ -199,36 +170,28 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) { struct target t = target_decode(target); - struct bch_devs_mask *devs; - rcu_read_lock(); + guard(rcu)(); switch (t.type) { case TARGET_NULL: - devs = NULL; - break; + return NULL; case TARGET_DEV: { struct bch_dev *ca = t.dev < c->sb.nr_devices ? rcu_dereference(c->devs[t.dev]) : NULL; - devs = ca ? &ca->self : NULL; - break; + return ca ? &ca->self : NULL; } case TARGET_GROUP: { struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - devs = g && t.group < g->nr && !g->entries[t.group].deleted + return g && t.group < g->nr && !g->entries[t.group].deleted ? 
&g->entries[t.group].devs : NULL; - break; } default: BUG(); } - - rcu_read_unlock(); - - return devs; } bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) @@ -241,20 +204,13 @@ bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) case TARGET_DEV: return dev == t.dev; case TARGET_GROUP: { - struct bch_disk_groups_cpu *g; - const struct bch_devs_mask *m; - bool ret; - - rcu_read_lock(); - g = rcu_dereference(c->disk_groups); - m = g && t.group < g->nr && !g->entries[t.group].deleted + struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); + const struct bch_devs_mask *m = + g && t.group < g->nr && !g->entries[t.group].deleted ? &g->entries[t.group].devs : NULL; - ret = m ? test_bit(dev, m->d) : false; - rcu_read_unlock(); - - return ret; + return m ? test_bit(dev, m->d) : false; } default: BUG(); @@ -377,54 +333,79 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) return v; } -void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) +static void __bch2_disk_path_to_text(struct printbuf *out, struct bch_disk_groups_cpu *g, + unsigned v) { - struct bch_disk_groups_cpu *groups; - struct bch_disk_group_cpu *g; - unsigned nr = 0; u16 path[32]; - - out->atomic++; - rcu_read_lock(); - groups = rcu_dereference(c->disk_groups); - if (!groups) - goto invalid; + unsigned nr = 0; while (1) { if (nr == ARRAY_SIZE(path)) goto invalid; - if (v >= groups->nr) + if (v >= (g ? g->nr : 0)) goto invalid; - g = groups->entries + v; + struct bch_disk_group_cpu *e = g->entries + v; - if (g->deleted) + if (e->deleted) goto invalid; path[nr++] = v; - if (!g->parent) + if (!e->parent) break; - v = g->parent - 1; + v = e->parent - 1; } while (nr) { - v = path[--nr]; - g = groups->entries + v; + struct bch_disk_group_cpu *e = g->entries + path[--nr]; - prt_printf(out, "%.*s", (int) sizeof(g->label), g->label); + prt_printf(out, "%.*s", (int) sizeof(e->label), e->label); if (nr) prt_printf(out, "."); } -out: - rcu_read_unlock(); - out->atomic--; return; invalid: prt_printf(out, "invalid label %u", v); - goto out; +} + +void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) +{ + bch2_printbuf_make_room(out, 4096); + + out->atomic++; + guard(rcu)(); + struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); + + for (unsigned i = 0; i < (g ? g->nr : 0); i++) { + prt_printf(out, "%2u: ", i); + + if (g->entries[i].deleted) { + prt_printf(out, "[deleted]"); + goto next; + } + + __bch2_disk_path_to_text(out, g, i); + + prt_printf(out, " devs"); + + for_each_member_device_rcu(c, ca, &g->entries[i].devs) + prt_printf(out, " %s", ca->name); +next: + prt_newline(out); + } + + out->atomic--; +} + +void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) +{ + out->atomic++; + guard(rcu)(); + __bch2_disk_path_to_text(out, rcu_dereference(c->disk_groups), v), + --out->atomic; } void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) @@ -544,32 +525,27 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) switch (t.type) { case TARGET_NULL: prt_printf(out, "none"); - break; + return; case TARGET_DEV: { - struct bch_dev *ca; - out->atomic++; - rcu_read_lock(); - ca = t.dev < c->sb.nr_devices + guard(rcu)(); + struct bch_dev *ca = t.dev < c->sb.nr_devices ? 
rcu_dereference(c->devs[t.dev]) : NULL; - if (ca && percpu_ref_tryget(&ca->io_ref[READ])) { + if (ca && ca->disk_sb.bdev) prt_printf(out, "/dev/%s", ca->name); - percpu_ref_put(&ca->io_ref[READ]); - } else if (ca) { + else if (ca) prt_printf(out, "offline device %u", t.dev); - } else { + else prt_printf(out, "invalid device %u", t.dev); - } - rcu_read_unlock(); out->atomic--; - break; + return; } case TARGET_GROUP: bch2_disk_path_to_text(out, c, t.group); - break; + return; default: BUG(); } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index c6cb26981923..543dbba9b14f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -16,6 +16,7 @@ #include "disk_accounting.h" #include "disk_groups.h" #include "ec.h" +#include "enumerated_ref.h" #include "error.h" #include "io_read.h" #include "io_write.h" @@ -212,7 +213,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, a->dirty_sectors, a->stripe, s.k->p.offset, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; + ret = bch_err_throw(c, mark_stripe); goto err; } @@ -223,7 +224,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, a->dirty_sectors, a->cached_sectors, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; + ret = bch_err_throw(c, mark_stripe); goto err; } } else { @@ -233,7 +234,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, bucket.inode, bucket.offset, a->gen, a->stripe, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; + ret = bch_err_throw(c, mark_stripe); goto err; } @@ -243,7 +244,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, bch2_data_type_str(a->data_type), bch2_data_type_str(data_type), (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; + ret = bch_err_throw(c, mark_stripe); goto err; } @@ -255,7 +256,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, a->dirty_sectors, a->cached_sectors, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; + ret = bch_err_throw(c, mark_stripe); goto err; } } @@ -294,7 +295,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev); if (unlikely(!ca)) { if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite)) - ret = -BCH_ERR_mark_stripe; + ret = bch_err_throw(c, mark_stripe); goto err; } @@ -324,7 +325,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n%s", ptr->dev, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; + ret = bch_err_throw(c, mark_stripe); goto err; } @@ -427,7 +428,7 @@ int bch2_trigger_stripe(struct btree_trans *trans, gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); if (!gc) { bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx); - return -BCH_ERR_ENOMEM_mark_stripe; + return bch_err_throw(c, ENOMEM_mark_stripe); } /* @@ -535,7 +536,8 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) } /* XXX: this is a non-mempoolified memory allocation: */ -static int ec_stripe_buf_init(struct ec_stripe_buf *buf, +static int ec_stripe_buf_init(struct bch_fs *c, + struct ec_stripe_buf *buf, unsigned offset, unsigned size) { struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; @@ -563,7 +565,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf, return 0; err: ec_stripe_buf_exit(buf); - return -BCH_ERR_ENOMEM_stripe_buf; + return 
bch_err_throw(c, ENOMEM_stripe_buf); } /* Checksumming: */ @@ -700,6 +702,9 @@ static void ec_block_endio(struct bio *bio) struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; int rw = ec_bio->rw; + unsigned ref = rw == READ + ? BCH_DEV_READ_REF_ec_block + : BCH_DEV_WRITE_REF_ec_block; bch2_account_io_completion(ca, bio_data_dir(bio), ec_bio->submit_time, !bio->bi_status); @@ -721,7 +726,7 @@ static void ec_block_endio(struct bio *bio) } bio_put(&ec_bio->bio); - percpu_ref_put(&ca->io_ref[rw]); + enumerated_ref_put(&ca->io_ref[rw], ref); closure_put(cl); } @@ -735,8 +740,11 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ? BCH_DATA_user : BCH_DATA_parity; int rw = op_is_write(opf); + unsigned ref = rw == READ + ? BCH_DEV_READ_REF_ec_block + : BCH_DEV_WRITE_REF_ec_block; - struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw); + struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw, ref); if (!ca) { clear_bit(idx, buf->valid); return; @@ -782,14 +790,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); closure_get(cl); - percpu_ref_get(&ca->io_ref[rw]); + enumerated_ref_get(&ca->io_ref[rw], ref); submit_bio(&ec_bio->bio); offset += b; } - percpu_ref_put(&ca->io_ref[rw]); + enumerated_ref_put(&ca->io_ref[rw], ref); } static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, @@ -833,7 +841,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, buf = kzalloc(sizeof(*buf), GFP_NOFS); if (!buf) - return -BCH_ERR_ENOMEM_ec_read_extent; + return bch_err_throw(c, ENOMEM_ec_read_extent); ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf)); if (ret) { @@ -854,7 +862,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, goto err; } - ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio)); + ret = ec_stripe_buf_init(c, buf, offset, bio_sectors(&rbio->bio)); if (ret) { msg = "-ENOMEM"; goto err; @@ -887,7 +895,7 @@ err: bch_err_ratelimited(c, "error doing reconstruct read: %s\n %s", msg, msgbuf.buf); printbuf_exit(&msgbuf); - ret = -BCH_ERR_stripe_reconstruct; + ret = bch_err_throw(c, stripe_reconstruct); goto out; } @@ -897,7 +905,7 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) { if (c->gc_pos.phase != GC_PHASE_not_running && !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) - return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; + return bch_err_throw(c, ENOMEM_ec_stripe_mem_alloc); return 0; } @@ -1011,14 +1019,14 @@ static void ec_stripe_delete_work(struct work_struct *work) BCH_TRANS_COMMIT_no_enospc, ({ ec_stripe_delete(trans, lru_k.k->p.offset); }))); - bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete); } void bch2_do_stripe_deletes(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) && + if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_stripe_delete) && !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete); } /* stripe creation: */ @@ -1122,7 +1130,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, bch2_fs_inconsistent(c, "%s", buf.buf); printbuf_exit(&buf); - return -BCH_ERR_erasure_coding_found_btree_node; + return bch_err_throw(c, erasure_coding_found_btree_node); } k = bch2_backpointer_get_key(trans, bp, &iter, 
BTREE_ITER_intent, last_flushed); @@ -1188,7 +1196,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); if (!ca) - return -BCH_ERR_ENOENT_dev_not_found; + return bch_err_throw(c, ENOENT_dev_not_found); struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); @@ -1246,9 +1254,10 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c, unsigned block, struct open_bucket *ob) { - struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE); + struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE, + BCH_DEV_WRITE_REF_ec_bucket_zero); if (!ca) { - s->err = -BCH_ERR_erofs_no_writes; + s->err = bch_err_throw(c, erofs_no_writes); return; } @@ -1262,7 +1271,7 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c, ob->sectors_free, GFP_KERNEL, 0); - percpu_ref_put(&ca->io_ref[WRITE]); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_ec_bucket_zero); if (ret) s->err = ret; @@ -1312,7 +1321,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ec_do_recov(c, &s->existing_stripe)) { bch_err(c, "error creating stripe: error reading existing stripe"); - ret = -BCH_ERR_ec_block_read; + ret = bch_err_throw(c, ec_block_read); goto err; } @@ -1338,7 +1347,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ec_nr_failed(&s->new_stripe)) { bch_err(c, "error creating stripe: error writing redundancy buckets"); - ret = -BCH_ERR_ec_block_write; + ret = bch_err_throw(c, ec_block_write); goto err; } @@ -1412,15 +1421,15 @@ static void ec_stripe_create_work(struct work_struct *work) while ((s = get_pending_stripe(c))) ec_stripe_create(s); - bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create); } void bch2_ec_do_stripe_creates(struct bch_fs *c) { - bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create); + enumerated_ref_get(&c->writes, BCH_WRITE_REF_stripe_create); if (!queue_work(system_long_wq, &c->ec_stripe_create_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create); } static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h) @@ -1570,26 +1579,26 @@ static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_str static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h) { struct bch_devs_mask devs = h->devs; + unsigned nr_devs, nr_devs_with_durability; - rcu_read_lock(); - h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label - ? group_to_target(h->disk_label - 1) - : 0); - unsigned nr_devs = dev_mask_nr(&h->devs); + scoped_guard(rcu) { + h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label + ? 
group_to_target(h->disk_label - 1) + : 0); + nr_devs = dev_mask_nr(&h->devs); - for_each_member_device_rcu(c, ca, &h->devs) - if (!ca->mi.durability) - __clear_bit(ca->dev_idx, h->devs.d); - unsigned nr_devs_with_durability = dev_mask_nr(&h->devs); + for_each_member_device_rcu(c, ca, &h->devs) + if (!ca->mi.durability) + __clear_bit(ca->dev_idx, h->devs.d); + nr_devs_with_durability = dev_mask_nr(&h->devs); - h->blocksize = pick_blocksize(c, &h->devs); + h->blocksize = pick_blocksize(c, &h->devs); - h->nr_active_devs = 0; - for_each_member_device_rcu(c, ca, &h->devs) - if (ca->mi.bucket_size == h->blocksize) - h->nr_active_devs++; - - rcu_read_unlock(); + h->nr_active_devs = 0; + for_each_member_device_rcu(c, ca, &h->devs) + if (ca->mi.bucket_size == h->blocksize) + h->nr_active_devs++; + } /* * If we only have redundancy + 1 devices, we're better off with just @@ -1710,23 +1719,32 @@ err: } static int new_stripe_alloc_buckets(struct btree_trans *trans, + struct alloc_request *req, struct ec_stripe_head *h, struct ec_stripe_new *s, - enum bch_watermark watermark, struct closure *cl) + struct closure *cl) { struct bch_fs *c = trans->c; - struct bch_devs_mask devs = h->devs; struct open_bucket *ob; - struct open_buckets buckets; struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; unsigned i, j, nr_have_parity = 0, nr_have_data = 0; - bool have_cache = true; int ret = 0; + req->scratch_data_type = req->data_type; + req->scratch_ptrs = req->ptrs; + req->scratch_nr_replicas = req->nr_replicas; + req->scratch_nr_effective = req->nr_effective; + req->scratch_have_cache = req->have_cache; + req->scratch_devs_may_alloc = req->devs_may_alloc; + + req->devs_may_alloc = h->devs; + req->have_cache = true; + BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity); BUG_ON(v->nr_redundant != s->nr_parity); /* * We bypass the sector allocator which normally does this: */ - bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); + bitmap_and(req->devs_may_alloc.d, req->devs_may_alloc.d, + c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) { /* @@ -1736,7 +1754,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, * block when updating the stripe */ if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) - __clear_bit(v->ptrs[i].dev, devs.d); + __clear_bit(v->ptrs[i].dev, req->devs_may_alloc.d); if (i < s->nr_data) nr_have_data++; @@ -1747,60 +1765,58 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, BUG_ON(nr_have_data > s->nr_data); BUG_ON(nr_have_parity > s->nr_parity); - buckets.nr = 0; + req->ptrs.nr = 0; if (nr_have_parity < s->nr_parity) { - ret = bch2_bucket_alloc_set_trans(trans, &buckets, - &h->parity_stripe, - &devs, - s->nr_parity, - &nr_have_parity, - &have_cache, 0, - BCH_DATA_parity, - watermark, - cl); - - open_bucket_for_each(c, &buckets, ob, i) { + req->nr_replicas = s->nr_parity; + req->nr_effective = nr_have_parity; + req->data_type = BCH_DATA_parity; + + ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe, cl); + + open_bucket_for_each(c, &req->ptrs, ob, i) { j = find_next_zero_bit(s->blocks_gotten, s->nr_data + s->nr_parity, s->nr_data); BUG_ON(j >= s->nr_data + s->nr_parity); - s->blocks[j] = buckets.v[i]; + s->blocks[j] = req->ptrs.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, s->blocks_gotten); } if (ret) - return ret; + goto err; } - buckets.nr = 0; + req->ptrs.nr = 0; if (nr_have_data < s->nr_data) { - ret = bch2_bucket_alloc_set_trans(trans, &buckets, - 
&h->block_stripe, - &devs, - s->nr_data, - &nr_have_data, - &have_cache, 0, - BCH_DATA_user, - watermark, - cl); - - open_bucket_for_each(c, &buckets, ob, i) { + req->nr_replicas = s->nr_data; + req->nr_effective = nr_have_data; + req->data_type = BCH_DATA_user; + + ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe, cl); + + open_bucket_for_each(c, &req->ptrs, ob, i) { j = find_next_zero_bit(s->blocks_gotten, s->nr_data, 0); BUG_ON(j >= s->nr_data); - s->blocks[j] = buckets.v[i]; + s->blocks[j] = req->ptrs.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, s->blocks_gotten); } if (ret) - return ret; + goto err; } - - return 0; +err: + req->data_type = req->scratch_data_type; + req->ptrs = req->scratch_ptrs; + req->nr_replicas = req->scratch_nr_replicas; + req->nr_effective = req->scratch_nr_effective; + req->have_cache = req->scratch_have_cache; + req->devs_may_alloc = req->scratch_devs_may_alloc; + return ret; } static int __get_existing_stripe(struct btree_trans *trans, @@ -1850,7 +1866,7 @@ static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new s->nr_data = existing_v->nr_blocks - existing_v->nr_redundant; - int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors)); + int ret = ec_stripe_buf_init(c, &s->existing_stripe, 0, le16_to_cpu(existing_v->sectors)); if (ret) { bch2_stripe_close(c, s); return ret; @@ -1910,7 +1926,7 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri } bch2_trans_iter_exit(trans, &lru_iter); if (!ret) - ret = -BCH_ERR_stripe_alloc_blocked; + ret = bch_err_throw(c, stripe_alloc_blocked); if (ret == 1) ret = 0; if (ret) @@ -1951,7 +1967,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st continue; } - ret = -BCH_ERR_ENOSPC_stripe_create; + ret = bch_err_throw(c, ENOSPC_stripe_create); break; } @@ -1981,17 +1997,15 @@ err: } struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, - unsigned target, + struct alloc_request *req, unsigned algo, - unsigned redundancy, - enum bch_watermark watermark, struct closure *cl) { struct bch_fs *c = trans->c; - struct ec_stripe_head *h; - bool waiting = false; + unsigned redundancy = req->nr_replicas - 1; unsigned disk_label = 0; - struct target t = target_decode(target); + struct target t = target_decode(req->target); + bool waiting = false; int ret; if (t.type == TARGET_GROUP) { @@ -2002,14 +2016,16 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, disk_label = t.group + 1; /* 0 == no label */ } - h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark); + struct ec_stripe_head *h = + __bch2_ec_stripe_head_get(trans, disk_label, algo, + redundancy, req->watermark); if (IS_ERR_OR_NULL(h)) return h; if (!h->s) { h->s = ec_new_stripe_alloc(c, h); if (!h->s) { - ret = -BCH_ERR_ENOMEM_ec_new_stripe_alloc; + ret = bch_err_throw(c, ENOMEM_ec_new_stripe_alloc); bch_err(c, "failed to allocate new stripe"); goto err; } @@ -2026,8 +2042,12 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, goto alloc_existing; /* First, try to allocate a full stripe: */ - ret = new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?: + enum bch_watermark saved_watermark = BCH_WATERMARK_stripe; + swap(req->watermark, saved_watermark); + ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?: __bch2_ec_stripe_head_reserve(trans, h, s); + swap(req->watermark, saved_watermark); + if (!ret) goto allocate_buf; if 
(bch2_err_matches(ret, BCH_ERR_transaction_restart) || @@ -2045,8 +2065,8 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) goto err; - if (watermark == BCH_WATERMARK_copygc) { - ret = new_stripe_alloc_buckets(trans, h, s, watermark, NULL) ?: + if (req->watermark == BCH_WATERMARK_copygc) { + ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?: __bch2_ec_stripe_head_reserve(trans, h, s); if (ret) goto err; @@ -2065,12 +2085,12 @@ alloc_existing: * Retry allocating buckets, with the watermark for this * particular write: */ - ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl); + ret = new_stripe_alloc_buckets(trans, req, h, s, cl); if (ret) goto err; allocate_buf: - ret = ec_stripe_buf_init(&s->new_stripe, 0, h->blocksize); + ret = ec_stripe_buf_init(c, &s->new_stripe, 0, h->blocksize); if (ret) goto err; @@ -2087,23 +2107,18 @@ err: /* device removal */ -static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a) +int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + unsigned dev_idx, + unsigned flags) { - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert); - - if (!a->stripe) + if (k.k->type != KEY_TYPE_stripe) return 0; - if (a->stripe_sectors) { - bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data"); - return -BCH_ERR_invalidate_stripe_to_dev; - } - - struct btree_iter iter; + struct bch_fs *c = trans->c; struct bkey_i_stripe *s = - bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe), - BTREE_ITER_slots, stripe); + bch2_bkey_make_mut_typed(trans, iter, &k, 0, stripe); int ret = PTR_ERR_OR_ZERO(s); if (ret) return ret; @@ -2120,12 +2135,30 @@ static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_ acc.replicas.data_type = BCH_DATA_user; ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); if (ret) - goto err; + return ret; struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i)); - bkey_for_each_ptr(ptrs, ptr) - if (ptr->dev == k_a.k->p.inode) - ptr->dev = BCH_SB_MEMBER_INVALID; + + /* XXX: how much redundancy do we still have? 
check degraded flags */ + + unsigned nr_good = 0; + + scoped_guard(rcu) + bkey_for_each_ptr(ptrs, ptr) { + if (ptr->dev == dev_idx) + ptr->dev = BCH_SB_MEMBER_INVALID; + + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed; + } + + if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED)) + return bch_err_throw(c, remove_would_lose_data); + + unsigned nr_data = s->v.nr_blocks - s->v.nr_redundant; + + if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST)) + return bch_err_throw(c, remove_would_lose_data); sectors = -sectors; @@ -2133,23 +2166,48 @@ static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_ acc.type = BCH_DISK_ACCOUNTING_replicas; bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); acc.replicas.data_type = BCH_DATA_user; - ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); + return bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); +} + +static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, struct bkey_s_c k_a, + unsigned flags) +{ + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert); + + if (!a->stripe) + return 0; + + if (a->stripe_sectors) { + struct bch_fs *c = trans->c; + bch_err(c, "trying to invalidate device in stripe when bucket has stripe data"); + return bch_err_throw(c, invalidate_stripe_to_dev); + } + + struct btree_iter iter; + struct bkey_s_c_stripe s = + bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe), + BTREE_ITER_slots, stripe); + int ret = bkey_err(s); if (ret) - goto err; -err: + return ret; + + ret = bch2_invalidate_stripe_to_dev(trans, &iter, s.s_c, k_a.k->p.inode, flags); bch2_trans_iter_exit(trans, &iter); return ret; } -int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx) +int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx, unsigned flags) { - return bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX), BTREE_ITER_intent, k, NULL, NULL, 0, ({ - bch2_invalidate_stripe_to_dev(trans, k); + bch2_invalidate_stripe_to_dev_from_alloc(trans, k, flags); }))); + bch_err_fn(c, ret); + return ret; } /* startup/shutdown */ diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 51893e1ee874..548048adf0d5 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -255,9 +255,10 @@ void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int); int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); + +struct alloc_request; struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, - unsigned, unsigned, unsigned, - enum bch_watermark, struct closure *); + struct alloc_request *, unsigned, struct closure *); void bch2_do_stripe_deletes(struct bch_fs *); void bch2_ec_do_stripe_creates(struct bch_fs *); @@ -287,7 +288,9 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s, } } -int bch2_dev_remove_stripes(struct bch_fs *, unsigned); +int bch2_invalidate_stripe_to_dev(struct btree_trans *, struct btree_iter *, + struct bkey_s_c, unsigned, unsigned); +int bch2_dev_remove_stripes(struct bch_fs *, unsigned, unsigned); void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); void bch2_fs_ec_stop(struct bch_fs *); diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index 06144bfd9c19..809446c78951 100644 
--- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -4,9 +4,10 @@ #include "bcachefs_format.h" -struct bch_replicas_padded { +union bch_replicas_padded { + u8 bytes[struct_size_t(struct bch_replicas_entry_v1, + devs, BCH_BKEY_PTRS_MAX)]; struct bch_replicas_entry_v1 e; - u8 pad[BCH_BKEY_PTRS_MAX]; }; struct stripe { @@ -28,7 +29,7 @@ struct gc_stripe { u16 block_sectors[BCH_BKEY_PTRS_MAX]; struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; - struct bch_replicas_padded r; + union bch_replicas_padded r; }; #endif /* _BCACHEFS_EC_TYPES_H */ diff --git a/fs/bcachefs/enumerated_ref.c b/fs/bcachefs/enumerated_ref.c new file mode 100644 index 000000000000..56ab430f209f --- /dev/null +++ b/fs/bcachefs/enumerated_ref.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "enumerated_ref.h" +#include "util.h" + +#include <linux/completion.h> + +#ifdef ENUMERATED_REF_DEBUG +void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx) +{ + BUG_ON(idx >= ref->nr); + atomic_long_inc(&ref->refs[idx]); +} + +bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) +{ + BUG_ON(idx >= ref->nr); + return atomic_long_inc_not_zero(&ref->refs[idx]); +} + +bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) +{ + BUG_ON(idx >= ref->nr); + return !ref->dying && + atomic_long_inc_not_zero(&ref->refs[idx]); +} + +void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx) +{ + BUG_ON(idx >= ref->nr); + long v = atomic_long_dec_return(&ref->refs[idx]); + + BUG_ON(v < 0); + if (v) + return; + + for (unsigned i = 0; i < ref->nr; i++) + if (atomic_long_read(&ref->refs[i])) + return; + + if (ref->stop_fn) + ref->stop_fn(ref); + complete(&ref->stop_complete); +} +#endif + +#ifndef ENUMERATED_REF_DEBUG +static void enumerated_ref_kill_cb(struct percpu_ref *percpu_ref) +{ + struct enumerated_ref *ref = + container_of(percpu_ref, struct enumerated_ref, ref); + + if (ref->stop_fn) + ref->stop_fn(ref); + complete(&ref->stop_complete); +} +#endif + +void enumerated_ref_stop_async(struct enumerated_ref *ref) +{ + reinit_completion(&ref->stop_complete); + +#ifndef ENUMERATED_REF_DEBUG + percpu_ref_kill(&ref->ref); +#else + ref->dying = true; + for (unsigned i = 0; i < ref->nr; i++) + enumerated_ref_put(ref, i); +#endif +} + +void enumerated_ref_stop(struct enumerated_ref *ref, + const char * const names[]) +{ + enumerated_ref_stop_async(ref); + while (!wait_for_completion_timeout(&ref->stop_complete, HZ * 10)) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "Waited for 10 seconds to shutdown enumerated ref\n"); + prt_str(&buf, "Outstanding refs:\n"); + enumerated_ref_to_text(&buf, ref, names); + printk(KERN_ERR "%s", buf.buf); + printbuf_exit(&buf); + } +} + +void enumerated_ref_start(struct enumerated_ref *ref) +{ +#ifndef ENUMERATED_REF_DEBUG + percpu_ref_reinit(&ref->ref); +#else + ref->dying = false; + for (unsigned i = 0; i < ref->nr; i++) { + BUG_ON(atomic_long_read(&ref->refs[i])); + atomic_long_inc(&ref->refs[i]); + } +#endif +} + +void enumerated_ref_exit(struct enumerated_ref *ref) +{ +#ifndef ENUMERATED_REF_DEBUG + percpu_ref_exit(&ref->ref); +#else + kfree(ref->refs); + ref->refs = NULL; + ref->nr = 0; +#endif +} + +int enumerated_ref_init(struct enumerated_ref *ref, unsigned nr, + void (*stop_fn)(struct enumerated_ref *)) +{ + init_completion(&ref->stop_complete); + ref->stop_fn = stop_fn; + +#ifndef ENUMERATED_REF_DEBUG + return percpu_ref_init(&ref->ref, enumerated_ref_kill_cb, + PERCPU_REF_INIT_DEAD, GFP_KERNEL); +#else + 
ref->refs = kzalloc(sizeof(ref->refs[0]) * nr, GFP_KERNEL); + if (!ref->refs) + return -ENOMEM; + + ref->nr = nr; + return 0; +#endif +} + +void enumerated_ref_to_text(struct printbuf *out, + struct enumerated_ref *ref, + const char * const names[]) +{ +#ifdef ENUMERATED_REF_DEBUG + bch2_printbuf_tabstop_push(out, 32); + + for (unsigned i = 0; i < ref->nr; i++) + prt_printf(out, "%s\t%li\n", names[i], + atomic_long_read(&ref->refs[i])); +#else + prt_str(out, "(not in debug mode)\n"); +#endif +} diff --git a/fs/bcachefs/enumerated_ref.h b/fs/bcachefs/enumerated_ref.h new file mode 100644 index 000000000000..ec01cf59ef80 --- /dev/null +++ b/fs/bcachefs/enumerated_ref.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ENUMERATED_REF_H +#define _BCACHEFS_ENUMERATED_REF_H + +#include "enumerated_ref_types.h" + +/* + * A refcount where the users are enumerated: in debug mode, we create sepate + * refcounts for each user, to make leaks and refcount errors easy to track + * down: + */ + +#ifdef ENUMERATED_REF_DEBUG +void enumerated_ref_get(struct enumerated_ref *, unsigned); +bool __enumerated_ref_tryget(struct enumerated_ref *, unsigned); +bool enumerated_ref_tryget(struct enumerated_ref *, unsigned); +void enumerated_ref_put(struct enumerated_ref *, unsigned); +#else + +static inline void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx) +{ + percpu_ref_get(&ref->ref); +} + +static inline bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) +{ + return percpu_ref_tryget(&ref->ref); +} + +static inline bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) +{ + return percpu_ref_tryget_live(&ref->ref); +} + +static inline void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx) +{ + percpu_ref_put(&ref->ref); +} +#endif + +static inline bool enumerated_ref_is_zero(struct enumerated_ref *ref) +{ +#ifndef ENUMERATED_REF_DEBUG + return percpu_ref_is_zero(&ref->ref); +#else + for (unsigned i = 0; i < ref->nr; i++) + if (atomic_long_read(&ref->refs[i])) + return false; + return true; +#endif +} + +void enumerated_ref_stop_async(struct enumerated_ref *); +void enumerated_ref_stop(struct enumerated_ref *, const char * const[]); +void enumerated_ref_start(struct enumerated_ref *); + +void enumerated_ref_exit(struct enumerated_ref *); +int enumerated_ref_init(struct enumerated_ref *, unsigned, + void (*stop_fn)(struct enumerated_ref *)); + +struct printbuf; +void enumerated_ref_to_text(struct printbuf *, + struct enumerated_ref *, + const char * const[]); + +#endif /* _BCACHEFS_ENUMERATED_REF_H */ diff --git a/fs/bcachefs/enumerated_ref_types.h b/fs/bcachefs/enumerated_ref_types.h new file mode 100644 index 000000000000..0e6076f466d3 --- /dev/null +++ b/fs/bcachefs/enumerated_ref_types.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ENUMERATED_REF_TYPES_H +#define _BCACHEFS_ENUMERATED_REF_TYPES_H + +#include <linux/percpu-refcount.h> + +struct enumerated_ref { +#ifdef ENUMERATED_REF_DEBUG + unsigned nr; + bool dying; + atomic_long_t *refs; +#else + struct percpu_ref ref; +#endif + void (*stop_fn)(struct enumerated_ref *); + struct completion stop_complete; +}; + +#endif /* _BCACHEFS_ENUMERATED_REF_TYPES_H */ diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c index 43557bebd0f8..c39cf304c681 100644 --- a/fs/bcachefs/errcode.c +++ b/fs/bcachefs/errcode.c @@ -13,12 +13,13 @@ static const char * const bch2_errcode_strs[] = { NULL }; -static unsigned bch2_errcode_parents[] = { +static const 
unsigned bch2_errcode_parents[] = { #define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class, BCH_ERRCODES() #undef x }; +__attribute__((const)) const char *bch2_err_str(int err) { const char *errstr; @@ -36,6 +37,7 @@ const char *bch2_err_str(int err) return errstr ?: "(Invalid error)"; } +__attribute__((const)) bool __bch2_err_matches(int err, int class) { err = abs(err); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index d9ebffa5b3a2..ac3264134a15 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -53,6 +53,7 @@ x(ENOMEM, ENOMEM_dio_write_bioset_init) \ x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \ x(ENOMEM, ENOMEM_promote_table_init) \ + x(ENOMEM, ENOMEM_async_obj_init) \ x(ENOMEM, ENOMEM_compression_bounce_read_init) \ x(ENOMEM, ENOMEM_compression_bounce_write_init) \ x(ENOMEM, ENOMEM_compression_workspace_init) \ @@ -174,16 +175,19 @@ x(0, backpointer_to_overwritten_btree_node) \ x(0, journal_reclaim_would_deadlock) \ x(EINVAL, fsck) \ + x(BCH_ERR_fsck, fsck_ask) \ x(BCH_ERR_fsck, fsck_fix) \ x(BCH_ERR_fsck, fsck_delete_bkey) \ x(BCH_ERR_fsck, fsck_ignore) \ x(BCH_ERR_fsck, fsck_errors_not_fixed) \ x(BCH_ERR_fsck, fsck_repair_unimplemented) \ x(BCH_ERR_fsck, fsck_repair_impossible) \ - x(EINVAL, restart_recovery) \ - x(EINVAL, not_in_recovery) \ - x(EINVAL, cannot_rewind_recovery) \ + x(EINVAL, recovery_will_run) \ + x(BCH_ERR_recovery_will_run, restart_recovery) \ + x(BCH_ERR_recovery_will_run, cannot_rewind_recovery) \ + x(BCH_ERR_recovery_will_run, recovery_pass_will_run) \ x(0, data_update_done) \ + x(0, bkey_was_deleted) \ x(BCH_ERR_data_update_done, data_update_done_would_block) \ x(BCH_ERR_data_update_done, data_update_done_unwritten) \ x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \ @@ -201,6 +205,7 @@ x(EINVAL, device_has_been_removed) \ x(EINVAL, device_splitbrain) \ x(EINVAL, device_already_online) \ + x(EINVAL, filesystem_uuid_already_open) \ x(EINVAL, insufficient_devices_to_start) \ x(EINVAL, invalid) \ x(EINVAL, internal_fsck_err) \ @@ -209,8 +214,11 @@ x(EINVAL, remove_would_lose_data) \ x(EINVAL, no_resize_with_buckets_nouse) \ x(EINVAL, inode_unpack_error) \ + x(EINVAL, inode_not_unlinked) \ + x(EINVAL, inode_has_child_snapshot) \ x(EINVAL, varint_decode_error) \ x(EINVAL, erasure_coding_found_btree_node) \ + x(EINVAL, option_negative) \ x(EOPNOTSUPP, may_not_use_incompat_feature) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ @@ -219,6 +227,8 @@ x(EROFS, erofs_unfixed_errors) \ x(EROFS, erofs_norecovery) \ x(EROFS, erofs_nochanges) \ + x(EROFS, erofs_no_alloc_info) \ + x(EROFS, erofs_filesystem_full) \ x(EROFS, insufficient_devices) \ x(0, operation_blocked) \ x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \ @@ -352,9 +362,11 @@ enum bch_errcode { BCH_ERR_MAX }; -const char *bch2_err_str(int); -bool __bch2_err_matches(int, int); +__attribute__((const)) const char *bch2_err_str(int); +__attribute__((const)) bool __bch2_err_matches(int, int); + +__attribute__((const)) static inline bool _bch2_err_matches(int err, int class) { return err < 0 && __bch2_err_matches(err, class); diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 6b8695b1349c..63951e293c47 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -11,12 +11,12 @@ #define FSCK_ERR_RATELIMIT_NR 10 -void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out) +void __bch2_log_msg_start(const char *fs_or_dev_name, struct printbuf *out) { printbuf_indent_add_nextline(out, 2); #ifdef BCACHEFS_LOG_PREFIX - 
prt_printf(out, bch2_log_msg(c, "")); + prt_printf(out, "bcachefs (%s): ", fs_or_dev_name); #endif } @@ -29,12 +29,10 @@ bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out) return false; case BCH_ON_ERROR_fix_safe: case BCH_ON_ERROR_ro: - if (bch2_fs_emergency_read_only(c)) - prt_printf(out, "inconsistency detected - emergency read only at journal seq %llu\n", - journal_cur_seq(&c->journal)); + bch2_fs_emergency_read_only2(c, out); return true; case BCH_ON_ERROR_panic: - bch2_print_string_as_lines_nonblocking(KERN_ERR, out->buf); + bch2_print_str(c, KERN_ERR, out->buf); panic(bch2_fmt(c, "panic after error")); return true; default: @@ -71,7 +69,7 @@ static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *tra if (trans) bch2_trans_updates_to_text(&buf, trans); bool ret = __bch2_inconsistent_error(c, &buf); - bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf); + bch2_print_str_nonblocking(c, KERN_ERR, buf.buf); printbuf_exit(&buf); return ret; @@ -100,12 +98,12 @@ int __bch2_topology_error(struct bch_fs *c, struct printbuf *out) prt_printf(out, "btree topology error: "); set_bit(BCH_FS_topology_error, &c->flags); - if (!test_bit(BCH_FS_recovery_running, &c->flags)) { + if (!test_bit(BCH_FS_in_recovery, &c->flags)) { __bch2_inconsistent_error(c, out); - return -BCH_ERR_btree_need_topology_repair; + return bch_err_throw(c, btree_need_topology_repair); } else { - return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: - -BCH_ERR_btree_node_read_validate_error; + return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?: + bch_err_throw(c, btree_node_read_validate_error); } } @@ -121,7 +119,7 @@ int bch2_fs_topology_error(struct bch_fs *c, const char *fmt, ...) va_end(args); int ret = __bch2_topology_error(c, &buf); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); return ret; @@ -151,14 +149,17 @@ void bch2_io_error_work(struct work_struct *work) bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, BCH_FORCE_IF_DEGRADED); + struct printbuf buf = PRINTBUF; + __bch2_log_msg_start(ca->name, &buf); - bch_err(ca, - "writes erroring for %u seconds, setting %s ro", + prt_printf(&buf, "writes erroring for %u seconds, setting %s ro", c->opts.write_error_timeout, dev ? 
"device" : "filesystem"); if (!dev) - bch2_fs_emergency_read_only(c); + bch2_fs_emergency_read_only2(c, &buf); + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); } out: up_write(&c->state_lock); @@ -328,7 +329,7 @@ static int do_fsck_ask_yn(struct bch_fs *c, if (bch2_fs_stdio_redirect(c)) bch2_print(c, "%s", question->buf); else - bch2_print_string_as_lines(KERN_ERR, question->buf); + bch2_print_str(c, KERN_ERR, question->buf); int ask = bch2_fsck_ask_yn(c, trans); @@ -376,15 +377,63 @@ static struct fsck_err_state *count_fsck_err_locked(struct bch_fs *c, return s; } -void __bch2_count_fsck_err(struct bch_fs *c, - enum bch_sb_error_id id, const char *msg, - bool *repeat, bool *print, bool *suppress) +bool __bch2_count_fsck_err(struct bch_fs *c, + enum bch_sb_error_id id, struct printbuf *msg) { bch2_sb_error_count(c, id); mutex_lock(&c->fsck_error_msgs_lock); - count_fsck_err_locked(c, id, msg, repeat, print, suppress); + bool print = true, repeat = false, suppress = false; + + count_fsck_err_locked(c, id, msg->buf, &repeat, &print, &suppress); mutex_unlock(&c->fsck_error_msgs_lock); + + if (suppress) + prt_printf(msg, "Ratelimiting new instances of previous error\n"); + + return print && !repeat; +} + +int bch2_fsck_err_opt(struct bch_fs *c, + enum bch_fsck_flags flags, + enum bch_sb_error_id err) +{ + if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) + flags |= fsck_flags_extra[err]; + + if (test_bit(BCH_FS_in_fsck, &c->flags)) { + if (!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) + return bch_err_throw(c, fsck_repair_unimplemented); + + switch (c->opts.fix_errors) { + case FSCK_FIX_exit: + return bch_err_throw(c, fsck_errors_not_fixed); + case FSCK_FIX_yes: + if (flags & FSCK_CAN_FIX) + return bch_err_throw(c, fsck_fix); + fallthrough; + case FSCK_FIX_no: + if (flags & FSCK_CAN_IGNORE) + return bch_err_throw(c, fsck_ignore); + return bch_err_throw(c, fsck_errors_not_fixed); + case FSCK_FIX_ask: + if (flags & FSCK_AUTOFIX) + return bch_err_throw(c, fsck_fix); + return bch_err_throw(c, fsck_ask); + default: + BUG(); + } + } else { + if ((flags & FSCK_AUTOFIX) && + (c->opts.errors == BCH_ON_ERROR_continue || + c->opts.errors == BCH_ON_ERROR_fix_safe)) + return bch_err_throw(c, fsck_fix); + + if (c->opts.errors == BCH_ON_ERROR_continue && + (flags & FSCK_CAN_IGNORE)) + return bch_err_throw(c, fsck_ignore); + return bch_err_throw(c, fsck_errors_not_fixed); + } } int __bch2_fsck_err(struct bch_fs *c, @@ -395,7 +444,7 @@ int __bch2_fsck_err(struct bch_fs *c, { va_list args; struct printbuf buf = PRINTBUF, *out = &buf; - int ret = -BCH_ERR_fsck_ignore; + int ret = 0; const char *action_orig = "fix?", *action = action_orig; might_sleep(); @@ -425,8 +474,8 @@ int __bch2_fsck_err(struct bch_fs *c, if (test_bit(err, c->sb.errors_silent)) return flags & FSCK_CAN_FIX - ? -BCH_ERR_fsck_fix - : -BCH_ERR_fsck_ignore; + ? 
bch_err_throw(c, fsck_fix) + : bch_err_throw(c, fsck_ignore); printbuf_indent_add_nextline(out, 2); @@ -468,14 +517,14 @@ int __bch2_fsck_err(struct bch_fs *c, prt_str(out, ", "); if (flags & FSCK_CAN_FIX) { prt_actioning(out, action); - ret = -BCH_ERR_fsck_fix; + ret = bch_err_throw(c, fsck_fix); } else { prt_str(out, ", continuing"); - ret = -BCH_ERR_fsck_ignore; + ret = bch_err_throw(c, fsck_ignore); } goto print; - } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) { + } else if (!test_bit(BCH_FS_in_fsck, &c->flags)) { if (c->opts.errors != BCH_ON_ERROR_continue || !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { prt_str_indented(out, ", shutting down\n" @@ -483,18 +532,18 @@ int __bch2_fsck_err(struct bch_fs *c, "run fsck, and forward to devs so error can be marked for self-healing"); inconsistent = true; print = true; - ret = -BCH_ERR_fsck_errors_not_fixed; + ret = bch_err_throw(c, fsck_errors_not_fixed); } else if (flags & FSCK_CAN_FIX) { prt_str(out, ", "); prt_actioning(out, action); - ret = -BCH_ERR_fsck_fix; + ret = bch_err_throw(c, fsck_fix); } else { prt_str(out, ", continuing"); - ret = -BCH_ERR_fsck_ignore; + ret = bch_err_throw(c, fsck_ignore); } } else if (c->opts.fix_errors == FSCK_FIX_exit) { prt_str(out, ", exiting"); - ret = -BCH_ERR_fsck_errors_not_fixed; + ret = bch_err_throw(c, fsck_errors_not_fixed); } else if (flags & FSCK_CAN_FIX) { int fix = s && s->fix ? s->fix @@ -513,30 +562,37 @@ int __bch2_fsck_err(struct bch_fs *c, : FSCK_FIX_yes; ret = ret & 1 - ? -BCH_ERR_fsck_fix - : -BCH_ERR_fsck_ignore; + ? bch_err_throw(c, fsck_fix) + : bch_err_throw(c, fsck_ignore); } else if (fix == FSCK_FIX_yes || (c->opts.nochanges && !(flags & FSCK_CAN_IGNORE))) { prt_str(out, ", "); prt_actioning(out, action); - ret = -BCH_ERR_fsck_fix; + ret = bch_err_throw(c, fsck_fix); } else { prt_str(out, ", not "); prt_actioning(out, action); + ret = bch_err_throw(c, fsck_ignore); + } + } else { + if (flags & FSCK_CAN_IGNORE) { + prt_str(out, ", continuing"); + ret = bch_err_throw(c, fsck_ignore); + } else { + prt_str(out, " (repair unimplemented)"); + ret = bch_err_throw(c, fsck_repair_unimplemented); } - } else if (!(flags & FSCK_CAN_IGNORE)) { - prt_str(out, " (repair unimplemented)"); } - if (ret == -BCH_ERR_fsck_ignore && + if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) && (c->opts.fix_errors == FSCK_FIX_exit || !(flags & FSCK_CAN_IGNORE))) - ret = -BCH_ERR_fsck_errors_not_fixed; + ret = bch_err_throw(c, fsck_errors_not_fixed); - if (test_bit(BCH_FS_fsck_running, &c->flags) && - (ret != -BCH_ERR_fsck_fix && - ret != -BCH_ERR_fsck_ignore)) { + if (test_bit(BCH_FS_in_fsck, &c->flags) && + (!bch2_err_matches(ret, BCH_ERR_fsck_fix) && + !bch2_err_matches(ret, BCH_ERR_fsck_ignore))) { exiting = true; print = true; } @@ -559,31 +615,31 @@ print: if (bch2_fs_stdio_redirect(c)) bch2_print(c, "%s", out->buf); else - bch2_print_string_as_lines(KERN_ERR, out->buf); + bch2_print_str(c, KERN_ERR, out->buf); } if (s) s->ret = ret; - +err_unlock: + mutex_unlock(&c->fsck_error_msgs_lock); +err: /* * We don't yet track whether the filesystem currently has errors, for * log_fsck_err()s: that would require us to track for every error type * which recovery pass corrects it, to get the fsck exit status correct: */ - if (flags & FSCK_CAN_FIX) { - if (ret == -BCH_ERR_fsck_fix) { - set_bit(BCH_FS_errors_fixed, &c->flags); - } else { - set_bit(BCH_FS_errors_not_fixed, &c->flags); - set_bit(BCH_FS_error, &c->flags); - } + if (bch2_err_matches(ret, BCH_ERR_fsck_fix)) { + set_bit(BCH_FS_errors_fixed, &c->flags); 
+ } else { + set_bit(BCH_FS_errors_not_fixed, &c->flags); + set_bit(BCH_FS_error, &c->flags); } -err_unlock: - mutex_unlock(&c->fsck_error_msgs_lock); -err: + if (action != action_orig) kfree(action); printbuf_exit(&buf); + + BUG_ON(!ret); return ret; } @@ -601,12 +657,12 @@ int __bch2_bkey_fsck_err(struct bch_fs *c, const char *fmt, ...) { if (from.flags & BCH_VALIDATE_silent) - return -BCH_ERR_fsck_delete_bkey; + return bch_err_throw(c, fsck_delete_bkey); unsigned fsck_flags = 0; if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) { if (test_bit(err, c->sb.errors_silent)) - return -BCH_ERR_fsck_delete_bkey; + return bch_err_throw(c, fsck_delete_bkey); fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX; } @@ -693,25 +749,9 @@ void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, struct bpos pos) { - struct bch_fs *c = trans->c; - int ret = 0; - - if (!bch2_snapshot_is_leaf(c, pos.snapshot)) - prt_str(out, "(multiple snapshots) "); - - subvol_inum inum = { - .subvol = bch2_snapshot_tree_oldest_subvol(c, pos.snapshot), - .inum = pos.inode, - }; - - if (inum.subvol) { - ret = bch2_inum_to_path(trans, inum, out); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - return ret; - } - - if (!inum.subvol || ret) - prt_printf(out, "inum %llu:%u", pos.inode, pos.snapshot); + int ret = bch2_inum_snapshot_to_path(trans, pos.inode, pos.snapshot, NULL, out); + if (ret) + return ret; prt_printf(out, " offset %llu: ", pos.offset << 8); return 0; diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 4a364fd44abe..0c3c3a24fc6f 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -18,7 +18,12 @@ struct work_struct; /* Error messages: */ -void bch2_log_msg_start(struct bch_fs *, struct printbuf *); +void __bch2_log_msg_start(const char *, struct printbuf *); + +static inline void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out) +{ + __bch2_log_msg_start(c->name, out); +} /* * Inconsistency errors: The on disk data is inconsistent. If these occur during @@ -76,12 +81,14 @@ struct fsck_err_state { #define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err) -void __bch2_count_fsck_err(struct bch_fs *, - enum bch_sb_error_id, const char *, - bool *, bool *, bool *); +bool __bch2_count_fsck_err(struct bch_fs *, enum bch_sb_error_id, struct printbuf *); #define bch2_count_fsck_err(_c, _err, ...) \ __bch2_count_fsck_err(_c, BCH_FSCK_ERR_##_err, __VA_ARGS__) +int bch2_fsck_err_opt(struct bch_fs *, + enum bch_fsck_flags, + enum bch_sb_error_id); + __printf(5, 6) __cold int __bch2_fsck_err(struct bch_fs *, struct btree_trans *, enum bch_fsck_flags, @@ -98,13 +105,13 @@ void bch2_free_fsck_errs(struct bch_fs *); #define fsck_err_wrap(_do) \ ({ \ int _ret = _do; \ - if (_ret != -BCH_ERR_fsck_fix && \ - _ret != -BCH_ERR_fsck_ignore) { \ + if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \ + !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) { \ ret = _ret; \ goto fsck_err; \ } \ \ - _ret == -BCH_ERR_fsck_fix; \ + bch2_err_matches(_ret, BCH_ERR_fsck_fix); \ }) #define __fsck_err(...) 
fsck_err_wrap(bch2_fsck_err(__VA_ARGS__)) @@ -163,10 +170,10 @@ do { \ int _ret = __bch2_bkey_fsck_err(c, k, from, \ BCH_FSCK_ERR_##_err_type, \ _err_msg, ##__VA_ARGS__); \ - if (_ret != -BCH_ERR_fsck_fix && \ - _ret != -BCH_ERR_fsck_ignore) \ + if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \ + !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) \ ret = _ret; \ - ret = -BCH_ERR_fsck_delete_bkey; \ + ret = bch_err_throw(c, fsck_delete_bkey); \ goto fsck_err; \ } while (0) diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 6bb42985306e..b899ee75f5b9 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -37,16 +37,17 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) return lru + ret * 2; } +#define EXTENT_ITERS_MAX 64 + static int count_iters_for_insert(struct btree_trans *trans, struct bkey_s_c k, unsigned offset, struct bpos *end, - unsigned *nr_iters, - unsigned max_iters) + unsigned *nr_iters) { int ret = 0, ret2 = 0; - if (*nr_iters >= max_iters) { + if (*nr_iters >= EXTENT_ITERS_MAX) { *end = bpos_min(*end, k.k->p); ret = 1; } @@ -56,7 +57,7 @@ static int count_iters_for_insert(struct btree_trans *trans, case KEY_TYPE_reflink_v: *nr_iters += bch2_bkey_nr_alloc_ptrs(k); - if (*nr_iters >= max_iters) { + if (*nr_iters >= EXTENT_ITERS_MAX) { *end = bpos_min(*end, k.k->p); ret = 1; } @@ -81,7 +82,7 @@ static int count_iters_for_insert(struct btree_trans *trans, *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); - if (*nr_iters >= max_iters) { + if (*nr_iters >= EXTENT_ITERS_MAX) { struct bpos pos = bkey_start_pos(k.k); pos.offset += min_t(u64, k.k->size, r_k.k->p.offset - idx); @@ -100,59 +101,31 @@ static int count_iters_for_insert(struct btree_trans *trans, return ret2 ?: ret; } -#define EXTENT_ITERS_MAX (BTREE_ITER_INITIAL / 3) - int bch2_extent_atomic_end(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *insert, struct bpos *end) { - struct btree_iter copy; - struct bkey_s_c k; unsigned nr_iters = 0; - int ret; - - ret = bch2_btree_iter_traverse(trans, iter); - if (ret) - return ret; - - *end = insert->k.p; - - /* extent_update_to_keys(): */ - nr_iters += 1; - - ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, - &nr_iters, EXTENT_ITERS_MAX / 2); - if (ret < 0) - return ret; + struct btree_iter copy; bch2_trans_copy_iter(trans, ©, iter); - for_each_btree_key_max_continue_norestart(trans, copy, insert->k.p, 0, k, ret) { - unsigned offset = 0; + int ret = bch2_btree_iter_traverse(trans, ©); + if (ret) + goto err; - if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) - offset = bkey_start_offset(&insert->k) - - bkey_start_offset(k.k); + struct bkey_s_c k; + for_each_btree_key_max_continue_norestart(trans, copy, *end, 0, k, ret) { + unsigned offset = 0; - /* extent_handle_overwrites(): */ - switch (bch2_extent_overlap(&insert->k, k.k)) { - case BCH_EXTENT_OVERLAP_ALL: - case BCH_EXTENT_OVERLAP_FRONT: - nr_iters += 1; - break; - case BCH_EXTENT_OVERLAP_BACK: - case BCH_EXTENT_OVERLAP_MIDDLE: - nr_iters += 2; - break; - } + if (bkey_gt(iter->pos, bkey_start_pos(k.k))) + offset = iter->pos.offset - bkey_start_offset(k.k); - ret = count_iters_for_insert(trans, k, offset, end, - &nr_iters, EXTENT_ITERS_MAX); + ret = count_iters_for_insert(trans, k, offset, end, &nr_iters); if (ret) break; } - +err: bch2_trans_iter_exit(trans, ©); return ret < 0 ? 
ret : 0; } @@ -161,10 +134,8 @@ int bch2_extent_trim_atomic(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k) { - struct bpos end; - int ret; - - ret = bch2_extent_atomic_end(trans, iter, k, &end); + struct bpos end = k->k.p; + int ret = bch2_extent_atomic_end(trans, iter, &end); if (ret) return ret; diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h index 6f5cf449361a..34467db53f45 100644 --- a/fs/bcachefs/extent_update.h +++ b/fs/bcachefs/extent_update.h @@ -5,7 +5,7 @@ #include "bcachefs.h" int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *, - struct bkey_i *, struct bpos *); + struct bpos *); int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *, struct bkey_i *); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index e597fb9c9823..036e4ad95987 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -45,6 +45,49 @@ static void bch2_extent_crc_pack(union bch_extent_crc *, struct bch_extent_crc_unpacked, enum bch_extent_entry_type); +void bch2_io_failures_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_io_failures *failed) +{ + static const char * const error_types[] = { + "io", "checksum", "ec reconstruct", NULL + }; + + for (struct bch_dev_io_failures *f = failed->devs; + f < failed->devs + failed->nr; + f++) { + unsigned errflags = + ((!!f->failed_io) << 0) | + ((!!f->failed_csum_nr) << 1) | + ((!!f->failed_ec) << 2); + + if (!errflags) + continue; + + bch2_printbuf_make_room(out, 1024); + out->atomic++; + scoped_guard(rcu) { + struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev); + if (ca) + prt_str(out, ca->name); + else + prt_printf(out, "(invalid device %u)", f->dev); + } + --out->atomic; + + prt_char(out, ' '); + + if (is_power_of_2(errflags)) { + prt_bitflags(out, error_types, errflags); + prt_str(out, " error"); + } else { + prt_str(out, "errors: "); + prt_bitflags(out, error_types, errflags); + } + prt_newline(out); + } +} + struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f, unsigned dev) { @@ -79,6 +122,22 @@ void bch2_mark_io_failure(struct bch_io_failures *failed, f->failed_csum_nr++; } +void bch2_mark_btree_validate_failure(struct bch_io_failures *failed, + unsigned dev) +{ + struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, dev); + + if (!f) { + BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); + + f = &failed->devs[failed->nr++]; + memset(f, 0, sizeof(*f)); + f->dev = dev; + } + + f->failed_btree_validate = true; +} + static inline u64 dev_latency(struct bch_dev *ca) { return ca ? 
atomic64_read(&ca->cur_latency[READ]) : S64_MAX; @@ -105,7 +164,7 @@ static inline bool ptr_better(struct bch_fs *c, if (unlikely(failed_delta)) return failed_delta < 0; - if (unlikely(bch2_force_reconstruct_read)) + if (static_branch_unlikely(&bch2_force_reconstruct_read)) return p1.do_ec_reconstruct > p2.do_ec_reconstruct; if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct)) @@ -134,14 +193,10 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, bool have_dirty_ptrs = false, have_pick = false; if (k.k->type == KEY_TYPE_error) - return -BCH_ERR_key_type_error; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) - return -BCH_ERR_extent_poisoned; + return bch_err_throw(c, key_type_error); rcu_read_lock(); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; u64 pick_latency; @@ -162,7 +217,15 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (dev >= 0 && p.ptr.dev != dev) continue; - struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); + + if (unlikely(!ca && p.ptr.dev != BCH_SB_MEMBER_INVALID)) { + rcu_read_unlock(); + int ret = bch2_dev_missing_bkey(c, k, p.ptr.dev); + if (ret) + return ret; + rcu_read_lock(); + } if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) continue; @@ -175,6 +238,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) { have_io_errors |= f->failed_io; + have_io_errors |= f->failed_btree_validate; have_io_errors |= f->failed_ec; } have_csum_errors |= !!f->failed_csum_nr; @@ -182,6 +246,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (p.has_ec && (f->failed_io || f->failed_csum_nr)) p.do_ec_reconstruct = true; else if (f->failed_io || + f->failed_btree_validate || f->failed_csum_nr > c->opts.checksum_err_retry_nr) continue; } @@ -194,7 +259,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, p.do_ec_reconstruct = true; } - if (bch2_force_reconstruct_read && p.has_ec) + if (static_branch_unlikely(&bch2_force_reconstruct_read) && p.has_ec) p.do_ec_reconstruct = true; u64 p_latency = dev_latency(ca); @@ -221,17 +286,17 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (!have_dirty_ptrs) return 0; if (have_missing_devs) - return -BCH_ERR_no_device_to_read_from; + return bch_err_throw(c, no_device_to_read_from); if (have_csum_errors) - return -BCH_ERR_data_read_csum_err; + return bch_err_throw(c, data_read_csum_err); if (have_io_errors) - return -BCH_ERR_data_read_io_err; + return bch_err_throw(c, data_read_io_err); /* * If we get here, we have pointers (bkey_ptrs_validate() ensures that), * but they don't point to valid devices: */ - return -BCH_ERR_no_devices_valid; + return bch_err_throw(c, no_devices_valid); } /* KEY_TYPE_btree_ptr: */ @@ -342,6 +407,8 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) lp.crc = bch2_extent_crc_unpack(l.k, NULL); rp.crc = bch2_extent_crc_unpack(r.k, NULL); + guard(rcu)(); + while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) && __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) { if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != @@ -353,10 +420,8 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) return false; /* Extents may not straddle buckets: */ - rcu_read_lock(); struct 
bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev); bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr); - rcu_read_unlock(); if (!same_bucket) return false; @@ -773,11 +838,9 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) struct extent_ptr_decoded p; unsigned durability = 0; - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) durability += bch2_extent_ptr_durability(c, &p); - rcu_read_unlock(); - return durability; } @@ -788,12 +851,10 @@ static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k) struct extent_ptr_decoded p; unsigned durability = 0; - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev]) durability += bch2_extent_ptr_durability(c, &p); - rcu_read_unlock(); - return durability; } @@ -950,20 +1011,16 @@ bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct bch_dev *ca; - bool ret = false; - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr(ptrs, ptr) if (bch2_dev_in_target(c, ptr->dev, target) && (ca = bch2_dev_rcu(c, ptr->dev)) && (!ptr->cached || - !dev_ptr_stale_rcu(ca, ptr))) { - ret = true; - break; - } - rcu_read_unlock(); + !dev_ptr_stale_rcu(ca, ptr))) + return true; - return ret; + return false; } bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, @@ -1071,33 +1128,48 @@ void bch2_extent_ptr_set_cached(struct bch_fs *c, struct bkey_s k, struct bch_extent_ptr *ptr) { - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + struct bkey_ptrs ptrs; union bch_extent_entry *entry; struct extent_ptr_decoded p; + bool have_cached_ptr; + unsigned drop_dev = ptr->dev; - rcu_read_lock(); - if (!want_cached_ptr(c, opts, ptr)) { - bch2_bkey_drop_ptr_noerror(k, ptr); - goto out; - } + guard(rcu)(); +restart_drop_ptrs: + ptrs = bch2_bkey_ptrs(k); + have_cached_ptr = false; - /* - * Stripes can't contain cached data, for - reasons. - * - * Possibly something we can fix in the future? - */ - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (&entry->ptr == ptr) { - if (p.has_ec) - bch2_bkey_drop_ptr_noerror(k, ptr); - else - ptr->cached = true; - goto out; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + /* + * Check if it's erasure coded - stripes can't contain cached + * data. Possibly something we can fix in the future? 
+ */ + if (&entry->ptr == ptr && p.has_ec) + goto drop; + + if (p.ptr.cached) { + if (have_cached_ptr || !want_cached_ptr(c, opts, &p.ptr)) { + bch2_bkey_drop_ptr_noerror(k, &entry->ptr); + ptr = NULL; + goto restart_drop_ptrs; + } + + have_cached_ptr = true; } + } - BUG(); -out: - rcu_read_unlock(); + if (!ptr) + bkey_for_each_ptr(ptrs, ptr2) + if (ptr2->dev == drop_dev) + ptr = ptr2; + + if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) + goto drop; + + ptr->cached = true; + return; +drop: + bch2_bkey_drop_ptr_noerror(k, ptr); } /* @@ -1112,12 +1184,11 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) { struct bch_dev *ca; - rcu_read_lock(); + guard(rcu)(); bch2_bkey_drop_ptrs(k, ptr, ptr->cached && (!(ca = bch2_dev_rcu(c, ptr->dev)) || dev_ptr_stale_rcu(ca, ptr) > 0)); - rcu_read_unlock(); return bkey_deleted(k.k); } @@ -1135,7 +1206,7 @@ bool bch2_extent_normalize_by_opts(struct bch_fs *c, struct bkey_ptrs ptrs; bool have_cached_ptr; - rcu_read_lock(); + guard(rcu)(); restart_drop_ptrs: ptrs = bch2_bkey_ptrs(k); have_cached_ptr = false; @@ -1148,7 +1219,6 @@ restart_drop_ptrs: } have_cached_ptr = true; } - rcu_read_unlock(); return bkey_deleted(k.k); } @@ -1156,7 +1226,7 @@ restart_drop_ptrs: void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr) { out->atomic++; - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); if (!ca) { prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, @@ -1180,7 +1250,6 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc else if (stale) prt_printf(out, " invalid"); } - rcu_read_unlock(); --out->atomic; } @@ -1446,7 +1515,7 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, struct bch_compression_opt opt = __bch2_compression_decode(r->compression); prt_printf(err, "invalid compression opt %u:%u", opt.type, opt.level); - return -BCH_ERR_invalid_bkey; + return bch_err_throw(c, invalid_bkey); } #endif break; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 9fe153183b36..b8590e51b76e 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -392,10 +392,13 @@ out: \ /* utility code common to all keys with pointers: */ +void bch2_io_failures_to_text(struct printbuf *, struct bch_fs *, + struct bch_io_failures *); struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *, unsigned); void bch2_mark_io_failure(struct bch_io_failures *, struct extent_ptr_decoded *, bool); +void bch2_mark_btree_validate_failure(struct bch_io_failures *, unsigned); int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, struct bch_io_failures *, struct extent_ptr_decoded *, int); diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h index e51529dca4c2..b23ce4a373c0 100644 --- a/fs/bcachefs/extents_types.h +++ b/fs/bcachefs/extents_types.h @@ -34,6 +34,7 @@ struct bch_io_failures { u8 dev; unsigned failed_csum_nr:6, failed_io:1, + failed_btree_validate:1, failed_ec:1; } devs[BCH_REPLICAS_MAX + 1]; }; diff --git a/fs/bcachefs/fast_list.c b/fs/bcachefs/fast_list.c new file mode 100644 index 000000000000..2faec143eb31 --- /dev/null +++ b/fs/bcachefs/fast_list.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Fast, unordered lists + * + * Supports add, remove, and iterate + * + * Underneath, they're a radix tree and an IDA, with a percpu buffer for slot + * allocation and freeing. 
+ * + * This means that adding, removing, and iterating over items is lockless, + * except when refilling/emptying the percpu slot buffers. + */ + +#include "fast_list.h" + +struct fast_list_pcpu { + u32 nr; + u32 entries[31]; +}; + +static int fast_list_alloc_idx(struct fast_list *l, gfp_t gfp) +{ + int idx = ida_alloc_range(&l->slots_allocated, 1, INT_MAX, gfp); + if (unlikely(idx < 0)) + return 0; + + if (unlikely(!genradix_ptr_alloc_inlined(&l->items, idx, gfp))) { + ida_free(&l->slots_allocated, idx); + return 0; + } + + return idx; +} + +/** + * fast_list_get_idx - get a slot in a fast_list + * @l: list to get slot in + * + * This allocates a slot in the radix tree without storing to it, so that we can + * take the potential memory allocation failure early and do the list add later + * when we can't take an allocation failure. + * + * Returns: positive integer on success, -ENOMEM on failure + */ +int fast_list_get_idx(struct fast_list *l) +{ + unsigned long flags; + int idx; +retry: + local_irq_save(flags); + struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer); + + if (unlikely(!lp->nr)) { + u32 entries[16], nr = 0; + + local_irq_restore(flags); + while (nr < ARRAY_SIZE(entries) && + (idx = fast_list_alloc_idx(l, GFP_KERNEL))) + entries[nr++] = idx; + local_irq_save(flags); + + lp = this_cpu_ptr(l->buffer); + + while (nr && lp->nr < ARRAY_SIZE(lp->entries)) + lp->entries[lp->nr++] = entries[--nr]; + + if (unlikely(nr)) { + local_irq_restore(flags); + while (nr) + ida_free(&l->slots_allocated, entries[--nr]); + goto retry; + } + + if (unlikely(!lp->nr)) { + local_irq_restore(flags); + return -ENOMEM; + } + } + + idx = lp->entries[--lp->nr]; + local_irq_restore(flags); + + return idx; +} + +/** + * fast_list_add - add an item to a fast_list + * @l: list + * @item: item to add + * + * Allocates a slot in the radix tree and stores to it and then returns the + * slot index, which must be passed to fast_list_remove(). + * + * Returns: positive integer on success, -ENOMEM on failure + */ +int fast_list_add(struct fast_list *l, void *item) +{ + int idx = fast_list_get_idx(l); + if (idx < 0) + return idx; + + *genradix_ptr_inlined(&l->items, idx) = item; + return idx; +} + +/** + * fast_list_remove - remove an item from a fast_list + * @l: list + * @idx: item's slot index + * + * Zeroes out the slot in the radix tree and frees the slot for future + * fast_list_add() operations. 
+ */ +void fast_list_remove(struct fast_list *l, unsigned idx) +{ + u32 entries[16], nr = 0; + unsigned long flags; + + if (!idx) + return; + + *genradix_ptr_inlined(&l->items, idx) = NULL; + + local_irq_save(flags); + struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer); + + if (unlikely(lp->nr == ARRAY_SIZE(lp->entries))) + while (nr < ARRAY_SIZE(entries)) + entries[nr++] = lp->entries[--lp->nr]; + + lp->entries[lp->nr++] = idx; + local_irq_restore(flags); + + if (unlikely(nr)) + while (nr) + ida_free(&l->slots_allocated, entries[--nr]); +} + +void fast_list_exit(struct fast_list *l) +{ + /* XXX: warn if list isn't empty */ + free_percpu(l->buffer); + ida_destroy(&l->slots_allocated); + genradix_free(&l->items); +} + +int fast_list_init(struct fast_list *l) +{ + genradix_init(&l->items); + ida_init(&l->slots_allocated); + l->buffer = alloc_percpu(*l->buffer); + if (!l->buffer) + return -ENOMEM; + return 0; +} diff --git a/fs/bcachefs/fast_list.h b/fs/bcachefs/fast_list.h new file mode 100644 index 000000000000..73c9bf591fd6 --- /dev/null +++ b/fs/bcachefs/fast_list.h @@ -0,0 +1,41 @@ +#ifndef _LINUX_FAST_LIST_H +#define _LINUX_FAST_LIST_H + +#include <linux/generic-radix-tree.h> +#include <linux/idr.h> +#include <linux/percpu.h> + +struct fast_list_pcpu; + +struct fast_list { + GENRADIX(void *) items; + struct ida slots_allocated;; + struct fast_list_pcpu __percpu + *buffer; +}; + +static inline void *fast_list_iter_peek(struct genradix_iter *iter, + struct fast_list *list) +{ + void **p; + while ((p = genradix_iter_peek(iter, &list->items)) && !*p) + genradix_iter_advance(iter, &list->items); + + return p ? *p : NULL; +} + +#define fast_list_for_each_from(_list, _iter, _i, _start) \ + for (_iter = genradix_iter_init(&(_list)->items, _start); \ + (_i = fast_list_iter_peek(&(_iter), _list)) != NULL; \ + genradix_iter_advance(&(_iter), &(_list)->items)) + +#define fast_list_for_each(_list, _iter, _i) \ + fast_list_for_each_from(_list, _iter, _i, 0) + +int fast_list_get_idx(struct fast_list *l); +int fast_list_add(struct fast_list *l, void *item); +void fast_list_remove(struct fast_list *l, unsigned idx); +void fast_list_exit(struct fast_list *l); +int fast_list_init(struct fast_list *l); + +#endif /* _LINUX_FAST_LIST_H */ diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index e3a75dcca60c..66bacdd49f78 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -394,17 +394,9 @@ struct bch_writepage_state { struct bch_io_opts opts; struct bch_folio_sector *tmp; unsigned tmp_sectors; + struct blk_plug plug; }; -static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, - struct bch_inode_info *inode) -{ - struct bch_writepage_state ret = { 0 }; - - bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode); - return ret; -} - /* * Determine when a writepage io is full. We have to limit writepage bios to a * single page per bvec (i.e. 
1MB with 4k pages) because that is the limit to @@ -666,17 +658,17 @@ do_io: int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct bch_fs *c = mapping->host->i_sb->s_fs_info; - struct bch_writepage_state w = - bch_writepage_state_init(c, to_bch_ei(mapping->host)); - struct blk_plug plug; - int ret; + struct bch_writepage_state *w = kzalloc(sizeof(*w), GFP_NOFS|__GFP_NOFAIL); - blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); - if (w.io) - bch2_writepage_do_io(&w); - blk_finish_plug(&plug); - kfree(w.tmp); + bch2_inode_opts_get(&w->opts, c, &to_bch_ei(mapping->host)->ei_inode); + + blk_start_plug(&w->plug); + int ret = write_cache_pages(mapping, wbc, __bch2_writepage, w); + if (w->io) + bch2_writepage_do_io(w); + blk_finish_plug(&w->plug); + kfree(w->tmp); + kfree(w); return bch2_err_class(ret); } diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 535bc5fcbcc0..1f5154d9676b 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "enumerated_ref.h" #include "fs.h" #include "fs-io.h" #include "fs-io-direct.h" @@ -401,7 +402,7 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio) ret = dio->op.error ?: ((long) dio->written << 9); bio_put(&dio->op.wbio.bio); - bch2_write_ref_put(c, BCH_WRITE_REF_dio_write); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write); /* inode->i_dio_count is our ref on inode and thus bch_fs */ inode_dio_end(&inode->v); @@ -606,7 +607,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) prefetch(&inode->ei_inode); prefetch((void *) &inode->ei_inode + 64); - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_dio_write)) return -EROFS; inode_lock(&inode->v); @@ -675,7 +676,7 @@ err_put_bio: bio_put(bio); inode_dio_end(&inode->v); err_put_write_ref: - bch2_write_ref_put(c, BCH_WRITE_REF_dio_write); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write); goto out; } diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index fbae9c1de746..c2cc405822f2 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -447,7 +447,7 @@ static int __bch2_folio_reservation_get(struct bch_fs *c, if (!reserved) { bch2_disk_reservation_put(c, &disk_res); - return -BCH_ERR_ENOSPC_disk_reservation; + return bch_err_throw(c, ENOSPC_disk_reservation); } break; } diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 9657144666b8..a233f45875e9 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -7,6 +7,7 @@ #include "btree_update.h" #include "buckets.h" #include "clock.h" +#include "enumerated_ref.h" #include "error.h" #include "extents.h" #include "extent_update.h" @@ -48,7 +49,8 @@ static void nocow_flush_endio(struct bio *_bio) struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); closure_put(bio->cl); - percpu_ref_put(&bio->ca->io_ref[WRITE]); + enumerated_ref_put(&bio->ca->io_ref[WRITE], + BCH_DEV_WRITE_REF_nocow_flush); bio_put(&bio->bio); } @@ -69,11 +71,12 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { - rcu_read_lock(); - ca = rcu_dereference(c->devs[dev]); - if (ca && !percpu_ref_tryget(&ca->io_ref[WRITE])) - ca = NULL; - rcu_read_unlock(); + scoped_guard(rcu) { + ca = 
rcu_dereference(c->devs[dev]); + if (ca && !enumerated_ref_tryget(&ca->io_ref[WRITE], + BCH_DEV_WRITE_REF_nocow_flush)) + ca = NULL; + } if (!ca) continue; @@ -151,10 +154,9 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, inode->v.i_ino, (u64) inode->v.i_blocks, sectors, inode->ei_inode.bi_sectors); - bool repeat = false, print = false, suppress = false; - bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, buf.buf, &repeat, &print, &suppress); + bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, &buf); if (print) - bch2_print_str(c, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); if (sectors < 0) @@ -220,7 +222,7 @@ static int bch2_flush_inode(struct bch_fs *c, if (c->opts.journal_flush_disabled) return 0; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fsync)) return -EROFS; u64 seq; @@ -228,7 +230,7 @@ static int bch2_flush_inode(struct bch_fs *c, bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?: bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?: bch2_inode_flush_nocow_writes(c, inode); - bch2_write_ref_put(c, BCH_WRITE_REF_fsync); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_fsync); return ret; } @@ -526,11 +528,9 @@ int bchfs_truncate(struct mnt_idmap *idmap, inode->v.i_ino, (u64) inode->v.i_blocks, inode->ei_inode.bi_sectors); - bool repeat = false, print = false, suppress = false; - bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, buf.buf, - &repeat, &print, &suppress); + bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, &buf); if (print) - bch2_print_str(c, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); } @@ -821,7 +821,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, struct bch_fs *c = inode->v.i_sb->s_fs_info; long ret; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fallocate)) return -EROFS; inode_lock(&inode->v); @@ -845,7 +845,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, err: bch2_pagecache_block_put(inode); inode_unlock(&inode->v); - bch2_write_ref_put(c, BCH_WRITE_REF_fallocate); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_fallocate); return bch2_err_class(ret); } diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index a82dfce9e4ad..4e72e654da96 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -172,7 +172,10 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) if (get_user(flags, arg)) return -EFAULT; - bch_notice(c, "shutdown by ioctl type %u", flags); + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "shutdown by ioctl type %u", flags); switch (flags) { case FSOP_GOING_FLAGS_DEFAULT: @@ -180,20 +183,23 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) if (ret) break; bch2_journal_flush(&c->journal); - bch2_fs_emergency_read_only(c); + bch2_fs_emergency_read_only2(c, &buf); bdev_thaw(c->vfs_sb->s_bdev); break; case FSOP_GOING_FLAGS_LOGFLUSH: bch2_journal_flush(&c->journal); fallthrough; case FSOP_GOING_FLAGS_NOLOGFLUSH: - bch2_fs_emergency_read_only(c); + bch2_fs_emergency_read_only2(c, &buf); break; default: ret = -EINVAL; - break; + goto noprint; } + bch2_print_str(c, KERN_ERR, buf.buf); +noprint: + printbuf_exit(&buf); return ret; } @@ -262,13 +268,13 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, } if 
(dst_dentry->d_inode) { - error = -BCH_ERR_EEXIST_subvolume_create; + error = bch_err_throw(c, EEXIST_subvolume_create); goto err3; } dir = dst_path.dentry->d_inode; if (IS_DEADDIR(dir)) { - error = -BCH_ERR_ENOENT_directory_dead; + error = bch_err_throw(c, ENOENT_directory_dead); goto err3; } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 47f1a64c5c8d..85d13f800165 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -124,8 +124,9 @@ retry: goto err; struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u); + bool rebalance_changed = memcmp(&old_r, &new_r, sizeof(new_r)); - if (memcmp(&old_r, &new_r, sizeof(new_r))) { + if (rebalance_changed) { ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum); if (ret) goto err; @@ -146,6 +147,9 @@ err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; + if (rebalance_changed) + bch2_rebalance_wakeup(c); + bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, "%s: inode %llu:%llu not found when updating", bch2_err_str(ret), @@ -191,11 +195,6 @@ int bch2_fs_quota_transfer(struct bch_fs *c, return ret; } -static bool subvol_inum_eq(subvol_inum a, subvol_inum b) -{ - return a.subvol == b.subvol && a.inum == b.inum; -} - static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) { const subvol_inum *inum = data; @@ -352,9 +351,8 @@ repeat: if (!trans) { __wait_on_freeing_inode(c, inode, inum); } else { - bch2_trans_unlock(trans); - __wait_on_freeing_inode(c, inode, inum); - int ret = bch2_trans_relock(trans); + int ret = drop_locks_do(trans, + (__wait_on_freeing_inode(c, inode, inum), 0)); if (ret) return ERR_PTR(ret); } @@ -1575,11 +1573,12 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); if (!dir_emit_dots(file, ctx)) return 0; - int ret = bch2_readdir(c, inode_inum(inode), ctx); + int ret = bch2_readdir(c, inode_inum(inode), &hash, ctx); bch_err_fn(c, ret); return bch2_err_class(ret); @@ -2008,14 +2007,14 @@ retry: goto err; if (k.k->type != KEY_TYPE_dirent) { - ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; + ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode); goto err; } d = bkey_s_c_to_dirent(k); ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); if (ret > 0) - ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; + ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode); if (ret) goto err; @@ -2181,7 +2180,13 @@ static void bch2_evict_inode(struct inode *vinode) KEY_TYPE_QUOTA_WARN); bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, KEY_TYPE_QUOTA_WARN); - bch2_inode_rm(c, inode_inum(inode)); + int ret = bch2_inode_rm(c, inode_inum(inode)); + if (ret && !bch2_err_matches(ret, EROFS)) { + bch_err_msg(c, ret, "VFS incorrectly tried to delete inode %llu:%llu", + inode->ei_inum.subvol, + inode->ei_inum.inum); + bch2_sb_error_count(c, BCH_FSCK_ERR_vfs_bad_inode_rm); + } /* * If we are deleting, we need it present in the vfs hash table @@ -2328,7 +2333,8 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root) struct bch_fs *c = root->d_sb->s_fs_info; bool first = true; - for_each_online_member(c, ca) { + guard(rcu)(); + for_each_online_member_rcu(c, ca) { if (!first) seq_putc(seq, ':'); first = false; @@ -2440,7 +2446,7 @@ static int bch2_fs_get_tree(struct fs_context *fc) struct inode *vinode; struct bch2_opts_parse *opts_parse = fc->fs_private; 
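One pattern worth calling out in the fs.c hunks around here: open-coded rcu_read_lock()/rcu_read_unlock() pairs around member-device iteration are replaced with scope-based guards, and for_each_online_member() (which takes and later drops a per-device io_ref) becomes for_each_online_member_rcu() under the guard. A condensed sketch of the new shape follows; guard(rcu)() and for_each_online_member_rcu() appear in this patch, while the function name, loop body, and ca->name are illustrative:

	static void show_online_members(struct bch_fs *c)
	{
		/*
		 * guard(rcu)() holds rcu_read_lock() for the rest of the scope, so
		 * there is no unlock call to forget and no io_ref to put:
		 */
		guard(rcu)();

		for_each_online_member_rcu(c, ca)
			pr_info("device %s is online\n", ca->name);
	}
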
struct bch_opts opts = opts_parse->opts; - darray_str devs; + darray_const_str devs; darray_fs devs_to_fs = {}; int ret; @@ -2464,7 +2470,7 @@ static int bch2_fs_get_tree(struct fs_context *fc) if (!IS_ERR(sb)) goto got_sb; - c = bch2_fs_open(devs.data, devs.nr, opts); + c = bch2_fs_open(&devs, &opts); ret = PTR_ERR_OR_ZERO(c); if (ret) goto err; @@ -2514,7 +2520,12 @@ got_sb: sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid)); - super_set_sysfs_name_uuid(sb); + + if (c->sb.multi_device) + super_set_sysfs_name_uuid(sb); + else + strscpy(sb->s_sysfs_name, c->name, sizeof(sb->s_sysfs_name)); + sb->s_shrink->seeks = 0; c->vfs_sb = sb; strscpy(sb->s_id, c->name, sizeof(sb->s_id)); @@ -2525,14 +2536,15 @@ got_sb: sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; - for_each_online_member(c, ca) { - struct block_device *bdev = ca->disk_sb.bdev; + scoped_guard(rcu) { + for_each_online_member_rcu(c, ca) { + struct block_device *bdev = ca->disk_sb.bdev; - /* XXX: create an anonymous device for multi device filesystems */ - sb->s_bdev = bdev; - sb->s_dev = bdev->bd_dev; - percpu_ref_put(&ca->io_ref[READ]); - break; + /* XXX: create an anonymous device for multi device filesystems */ + sb->s_bdev = bdev; + sb->s_dev = bdev->bd_dev; + break; + } } c->dev = sb->s_dev; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index aaf187085276..68ed69a255e1 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -23,14 +23,15 @@ #include <linux/bsearch.h> #include <linux/dcache.h> /* struct qstr */ -static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d, +static int dirent_points_to_inode_nowarn(struct bch_fs *c, + struct bkey_s_c_dirent d, struct bch_inode_unpacked *inode) { if (d.v->d_type == DT_SUBVOL ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol : le64_to_cpu(d.v->d_inum) == inode->bi_inum) return 0; - return -BCH_ERR_ENOENT_dirent_doesnt_match_inode; + return bch_err_throw(c, ENOENT_dirent_doesnt_match_inode); } static void dirent_inode_mismatch_msg(struct printbuf *out, @@ -49,7 +50,7 @@ static int dirent_points_to_inode(struct bch_fs *c, struct bkey_s_c_dirent dirent, struct bch_inode_unpacked *inode) { - int ret = dirent_points_to_inode_nowarn(dirent, inode); + int ret = dirent_points_to_inode_nowarn(c, dirent, inode); if (ret) { struct printbuf buf = PRINTBUF; dirent_inode_mismatch_msg(&buf, c, dirent, inode); @@ -109,27 +110,6 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, return ret; } -static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, inode_nr, snapshot), 0); - ret = bkey_err(k); - if (ret) - goto err; - - ret = bkey_is_inode(k.k) - ? 
bch2_inode_unpack(k, inode) - : -BCH_ERR_ENOENT_inode; -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static int lookup_dirent_in_snapshot(struct btree_trans *trans, struct bch_hash_info hash_info, subvol_inum dir, struct qstr *name, @@ -173,7 +153,7 @@ static int find_snapshot_tree_subvol(struct btree_trans *trans, goto found; } } - ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol; + ret = bch_err_throw(trans->c, ENOENT_no_snapshot_tree_subvol); found: bch2_trans_iter_exit(trans, &iter); return ret; @@ -231,7 +211,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, struct bch_inode_unpacked root_inode; struct bch_hash_info root_hash_info; - ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode); + ret = bch2_inode_find_by_inum_snapshot(trans, root_inum.inum, snapshot, &root_inode, 0); bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", root_inum.inum, subvolid); if (ret) @@ -250,14 +230,14 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, if (d_type != DT_DIR) { bch_err(c, "error looking up lost+found: not a directory"); - return -BCH_ERR_ENOENT_not_directory; + return bch_err_throw(c, ENOENT_not_directory); } /* * The bch2_check_dirents pass has already run, dangling dirents * shouldn't exist here: */ - ret = lookup_inode(trans, inum, snapshot, lostfound); + ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, lostfound, 0); bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)", inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot)); return ret; @@ -285,7 +265,7 @@ create_lostfound: u64 cpu = raw_smp_processor_id(); bch2_inode_init_early(c, lostfound); - bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode); + bch2_inode_init_late(c, lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode); lostfound->bi_dir = root_inode.bi_inum; lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot); @@ -552,7 +532,7 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub if (!bch2_snapshot_is_leaf(c, snapshotid)) { bch_err(c, "need to reconstruct subvol, but have interior node snapshot"); - return -BCH_ERR_fsck_repair_unimplemented; + return bch_err_throw(c, fsck_repair_unimplemented); } /* @@ -566,7 +546,7 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub u64 cpu = raw_smp_processor_id(); bch2_inode_init_early(c, &new_inode); - bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL); + bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL); new_inode.bi_subvol = subvolid; @@ -656,7 +636,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 struct bch_inode_unpacked new_inode; bch2_inode_init_early(c, &new_inode); - bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL); + bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL); new_inode.bi_size = i_size; new_inode.bi_inum = inum; new_inode.bi_snapshot = snapshot; @@ -664,11 +644,6 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 return __bch2_fsck_write_inode(trans, &new_inode); } -struct snapshots_seen { - struct bpos pos; - snapshot_id_list ids; -}; - static inline void snapshots_seen_exit(struct snapshots_seen *s) { darray_exit(&s->ids); @@ -787,12 +762,12 @@ static int ref_visible2(struct bch_fs *c, #define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ 
for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ - (_i)->snapshot <= (_snapshot); _i++) \ - if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) + (_i)->inode.bi_snapshot <= (_snapshot); _i++) \ + if (key_visible_in_snapshot(_c, _s, _i->inode.bi_snapshot, _snapshot)) struct inode_walker_entry { struct bch_inode_unpacked inode; - u32 snapshot; + bool whiteout; u64 count; u64 i_size; }; @@ -821,13 +796,20 @@ static struct inode_walker inode_walker_init(void) static int add_inode(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c inode) { - struct bch_inode_unpacked u; - - return bch2_inode_unpack(inode, &u) ?: - darray_push(&w->inodes, ((struct inode_walker_entry) { - .inode = u, - .snapshot = inode.k->p.snapshot, + int ret = darray_push(&w->inodes, ((struct inode_walker_entry) { + .whiteout = !bkey_is_inode(inode.k), })); + if (ret) + return ret; + + struct inode_walker_entry *n = &darray_last(w->inodes); + if (!n->whiteout) { + return bch2_inode_unpack(inode, &n->inode); + } else { + n->inode.bi_inum = inode.k->p.inode; + n->inode.bi_snapshot = inode.k->p.snapshot; + return 0; + } } static int get_inodes_all_snapshots(struct btree_trans *trans, @@ -847,13 +829,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, w->recalculate_sums = false; w->inodes.nr = 0; - for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) + for_each_btree_key_max_norestart(trans, iter, + BTREE_ID_inodes, POS(0, inum), SPOS(0, inum, U32_MAX), + BTREE_ITER_all_snapshots, k, ret) { + ret = add_inode(c, w, k); + if (ret) break; - - if (bkey_is_inode(k.k)) - add_inode(c, w, k); } bch2_trans_iter_exit(trans, &iter); @@ -865,65 +846,6 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, return 0; } -static struct inode_walker_entry * -lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k) -{ - bool is_whiteout = k.k->type == KEY_TYPE_whiteout; - - struct inode_walker_entry *i; - __darray_for_each(w->inodes, i) - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot)) - goto found; - - return NULL; -found: - BUG_ON(k.k->p.snapshot > i->snapshot); - - if (k.k->p.snapshot != i->snapshot && !is_whiteout) { - struct inode_walker_entry new = *i; - - new.snapshot = k.k->p.snapshot; - new.count = 0; - new.i_size = 0; - - struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, k); - - bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" - "unexpected because we should always update the inode when we update a key in that inode\n" - "%s", - w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf); - printbuf_exit(&buf); - - while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot) - --i; - - size_t pos = i - w->inodes.data; - int ret = darray_insert_item(&w->inodes, pos, new); - if (ret) - return ERR_PTR(ret); - - i = w->inodes.data + pos; - } - - return i; -} - -static struct inode_walker_entry *walk_inode(struct btree_trans *trans, - struct inode_walker *w, - struct bkey_s_c k) -{ - if (w->last_pos.inode != k.k->p.inode) { - int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode); - if (ret) - return ERR_PTR(ret); - } - - w->last_pos = k.k->p; - - return lookup_inode_for_snapshot(trans->c, w, k); -} - static int get_visible_inodes(struct btree_trans *trans, struct inode_walker *w, struct snapshots_seen *s, @@ -959,6 +881,91 @@ static int get_visible_inodes(struct btree_trans *trans, return 
ret; } +static struct inode_walker_entry * +lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + + struct inode_walker_entry *i = darray_find_p(w->inodes, i, + bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot)); + + if (!i) + return NULL; + + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot, + trans, snapshot_key_missing_inode_snapshot, + "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" + "unexpected because we should always update the inode when we update a key in that inode\n" + "%s", + w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot, + (bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) { + struct bch_inode_unpacked new = i->inode; + struct bkey_i whiteout; + + new.bi_snapshot = k.k->p.snapshot; + + if (!i->whiteout) { + ret = __bch2_fsck_write_inode(trans, &new); + } else { + bkey_init(&whiteout.k); + whiteout.k.type = KEY_TYPE_whiteout; + whiteout.k.p = SPOS(0, i->inode.bi_inum, i->inode.bi_snapshot); + ret = bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, + &whiteout, + BTREE_UPDATE_internal_snapshot_node); + } + + if (ret) + goto fsck_err; + + ret = bch2_trans_commit(trans, NULL, NULL, 0); + if (ret) + goto fsck_err; + + struct inode_walker_entry new_entry = *i; + + new_entry.inode.bi_snapshot = k.k->p.snapshot; + new_entry.count = 0; + new_entry.i_size = 0; + + while (i > w->inodes.data && i[-1].inode.bi_snapshot > k.k->p.snapshot) + --i; + + size_t pos = i - w->inodes.data; + ret = darray_insert_item(&w->inodes, pos, new_entry); + if (ret) + goto fsck_err; + + ret = bch_err_throw(c, transaction_restart_nested); + goto fsck_err; + } + + printbuf_exit(&buf); + return i; +fsck_err: + printbuf_exit(&buf); + return ERR_PTR(ret); +} + +static struct inode_walker_entry *walk_inode(struct btree_trans *trans, + struct inode_walker *w, + struct bkey_s_c k) +{ + if (w->last_pos.inode != k.k->p.inode) { + int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode); + if (ret) + return ERR_PTR(ret); + } + + w->last_pos = k.k->p; + + return lookup_inode_for_snapshot(trans, w, k); +} + /* * Prefer to delete the first one, since that will be the one at the wrong * offset: @@ -978,7 +985,8 @@ int bch2_fsck_update_backpointers(struct btree_trans *trans, int ret = 0; if (d->v.d_type == DT_SUBVOL) { - BUG(); + bch_err(trans->c, "%s does not support DT_SUBVOL", __func__); + ret = -BCH_ERR_fsck_repair_unimplemented; } else { ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum)); if (ret) @@ -1034,7 +1042,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans, if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - if ((ret || dirent_points_to_inode_nowarn(d, inode)) && + if ((ret || dirent_points_to_inode_nowarn(c, d, inode)) && inode->bi_subvol && (inode->bi_flags & BCH_INODE_has_child_snapshot)) { /* Older version of a renamed subvolume root: we won't have a @@ -1055,7 +1063,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans, trans, inode_points_to_missing_dirent, "inode points to missing dirent\n%s", (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) || - fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode), + fsck_err_on(!ret && dirent_points_to_inode_nowarn(c, d, inode), trans, inode_points_to_wrong_dirent, "%s", (printbuf_reset(&buf), @@ -1080,32 +1088,6 @@ fsck_err: return ret; } -static int get_snapshot_root_inode(struct btree_trans *trans, - struct 
bch_inode_unpacked *root, - u64 inum) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, - SPOS(0, inum, U32_MAX), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) - break; - if (bkey_is_inode(k.k)) - goto found_root; - } - if (ret) - goto err; - BUG(); -found_root: - ret = bch2_inode_unpack(k, root); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -1136,20 +1118,23 @@ static int check_inode(struct btree_trans *trans, goto err; if (snapshot_root->bi_inum != u.bi_inum) { - ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum); + ret = bch2_inode_find_snapshot_root(trans, u.bi_inum, snapshot_root); if (ret) goto err; } - if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed || - INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root), - trans, inode_snapshot_mismatch, - "inode hash info in different snapshots don't match")) { - u.bi_hash_seed = snapshot_root->bi_hash_seed; - SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root)); - do_update = true; + if (u.bi_hash_seed != snapshot_root->bi_hash_seed || + INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root)) { + ret = bch2_repair_inode_hash_info(trans, snapshot_root); + BUG_ON(ret == -BCH_ERR_fsck_repair_unimplemented); + if (ret) + goto err; } + ret = bch2_check_inode_has_case_insensitive(trans, &u, &s->ids, &do_update); + if (ret) + goto err; + if (u.bi_dir || u.bi_dir_offset) { ret = check_inode_dirent_inode(trans, &u, &do_update); if (ret) @@ -1183,6 +1168,14 @@ static int check_inode(struct btree_trans *trans, ret = 0; } + if (fsck_err_on(S_ISDIR(u.bi_mode) && u.bi_size, + trans, inode_dir_has_nonzero_i_size, + "directory %llu:%u with nonzero i_size %lli", + u.bi_inum, u.bi_snapshot, u.bi_size)) { + u.bi_size = 0; + do_update = true; + } + ret = bch2_inode_has_child_snapshots(trans, k.k->p); if (ret < 0) goto err; @@ -1452,25 +1445,27 @@ static int check_key_has_inode(struct btree_trans *trans, if (k.k->type == KEY_TYPE_whiteout) goto out; - if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { + bool have_inode = i && !i->whiteout; + + if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret) goto err; inode->last_pos.inode--; - ret = -BCH_ERR_transaction_restart_nested; + ret = bch_err_throw(c, transaction_restart_nested); goto err; } - if (fsck_err_on(!i, + if (fsck_err_on(!have_inode, trans, key_in_missing_inode, "key in missing inode:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) goto delete; - if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), + if (fsck_err_on(have_inode && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), trans, key_in_wrong_inode_type, "key for wrong inode mode %o:\n%s", i->inode.bi_mode, @@ -1498,21 +1493,21 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal if (i->inode.bi_sectors == i->count) continue; - count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot); + count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->inode.bi_snapshot); if (w->recalculate_sums) i->count = count2; if (i->count != count2) { bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu 
should be %llu", - w->last_pos.inode, i->snapshot, i->count, count2); + w->last_pos.inode, i->inode.bi_snapshot, i->count, count2); i->count = count2; } if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), trans, inode_i_sectors_wrong, "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", - w->last_pos.inode, i->snapshot, + w->last_pos.inode, i->inode.bi_snapshot, i->inode.bi_sectors, i->count)) { i->inode.bi_sectors = i->count; ret = bch2_fsck_write_inode(trans, &i->inode); @@ -1576,7 +1571,7 @@ static int extent_ends_at(struct bch_fs *c, sizeof(seen->ids.data[0]) * seen->ids.size, GFP_KERNEL); if (!n.seen.ids.data) - return -BCH_ERR_ENOMEM_fsck_extent_ends_at; + return bch_err_throw(c, ENOMEM_fsck_extent_ends_at); __darray_for_each(extent_ends->e, i) { if (i->snapshot == k.k->p.snapshot) { @@ -1626,7 +1621,7 @@ static int overlapping_extents_found(struct btree_trans *trans, bch_err(c, "%s: error finding first overlapping extent when repairing, got%s", __func__, buf.buf); - ret = -BCH_ERR_internal_fsck_err; + ret = bch_err_throw(c, internal_fsck_err); goto err; } @@ -1651,7 +1646,7 @@ static int overlapping_extents_found(struct btree_trans *trans, pos2.size != k2.k->size) { bch_err(c, "%s: error finding seconding overlapping extent when repairing%s", __func__, buf.buf); - ret = -BCH_ERR_internal_fsck_err; + ret = bch_err_throw(c, internal_fsck_err); goto err; } @@ -1699,7 +1694,7 @@ static int overlapping_extents_found(struct btree_trans *trans, * We overwrote the second extent - restart * check_extent() from the top: */ - ret = -BCH_ERR_transaction_restart_nested; + ret = bch_err_throw(c, transaction_restart_nested); } } fsck_err: @@ -1823,20 +1818,20 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); inode->inodes.data && i >= inode->inodes.data; --i) { - if (i->snapshot > k.k->p.snapshot || - !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) + if (i->inode.bi_snapshot > k.k->p.snapshot || + !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) continue; if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && !bkey_extent_is_reservation(k), trans, extent_past_end_of_inode, "extent type past end of inode %llu:%u, i_size %llu\n%s", - i->inode.bi_inum, i->snapshot, i->inode.bi_size, + i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { struct btree_iter iter2; bch2_trans_copy_iter(trans, &iter2, iter); - bch2_btree_iter_set_snapshot(trans, &iter2, i->snapshot); + bch2_btree_iter_set_snapshot(trans, &iter2, i->inode.bi_snapshot); ret = bch2_btree_iter_traverse(trans, &iter2) ?: bch2_btree_delete_at(trans, &iter2, BTREE_UPDATE_internal_snapshot_node); @@ -1858,8 +1853,9 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); inode->inodes.data && i >= inode->inodes.data; --i) { - if (i->snapshot > k.k->p.snapshot || - !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) + if (i->whiteout || + i->inode.bi_snapshot > k.k->p.snapshot || + !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) continue; i->count += k.k->size; @@ -1941,13 +1937,13 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ if (i->inode.bi_nlink == i->count) continue; - count2 = bch2_count_subdirs(trans, w->last_pos.inode, 
i->snapshot); + count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->inode.bi_snapshot); if (count2 < 0) return count2; if (i->count != count2) { bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu", - w->last_pos.inode, i->snapshot, i->count, count2); + w->last_pos.inode, i->inode.bi_snapshot, i->count, count2); i->count = count2; if (i->inode.bi_nlink == i->count) continue; @@ -1956,7 +1952,7 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ if (fsck_err_on(i->inode.bi_nlink != i->count, trans, inode_dir_wrong_nlink, "directory %llu:%u with wrong i_nlink: got %u, should be %llu", - w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { + w->last_pos.inode, i->inode.bi_snapshot, i->inode.bi_nlink, i->count)) { i->inode.bi_nlink = i->count; ret = bch2_fsck_write_inode(trans, &i->inode); if (ret) @@ -2051,7 +2047,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { if (!new_parent_subvol) { bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot); - return -BCH_ERR_fsck_repair_unimplemented; + return bch_err_throw(c, fsck_repair_unimplemented); } struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); @@ -2068,7 +2064,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * 0, subvolume); ret = bkey_err(s.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; + goto err; if (ret) { if (fsck_err(trans, dirent_to_missing_subvol, @@ -2079,30 +2075,41 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * goto out; } - if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol, - trans, subvol_fs_path_parent_wrong, - "subvol with wrong fs_path_parent, should be be %u\n%s", - parent_subvol, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - struct bkey_i_subvolume *n = - bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); - ret = PTR_ERR_OR_ZERO(n); + if (le32_to_cpu(s.v->fs_path_parent) != parent_subvol) { + printbuf_reset(&buf); + + prt_printf(&buf, "subvol with wrong fs_path_parent, should be be %u\n", + parent_subvol); + + ret = bch2_inum_to_path(trans, (subvol_inum) { s.k->p.offset, + le64_to_cpu(s.v->inode) }, &buf); if (ret) goto err; + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, s.s_c); + + if (fsck_err(trans, subvol_fs_path_parent_wrong, "%s", buf.buf)) { + struct bkey_i_subvolume *n = + bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; - n->v.fs_path_parent = cpu_to_le32(parent_subvol); + n->v.fs_path_parent = cpu_to_le32(parent_subvol); + } } u64 target_inum = le64_to_cpu(s.v->inode); u32 target_snapshot = le32_to_cpu(s.v->snapshot); - ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root); + ret = bch2_inode_find_by_inum_snapshot(trans, target_inum, target_snapshot, + &subvol_root, 0); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; if (ret) { bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); - ret = -BCH_ERR_fsck_repair_unimplemented; + ret = bch_err_throw(c, fsck_repair_unimplemented); goto err; } @@ -2134,7 +2141,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bch_hash_info *hash_info, struct inode_walker *dir, struct inode_walker *target, - struct snapshots_seen *s) + struct 
snapshots_seen *s, + bool *need_second_pass) { struct bch_fs *c = trans->c; struct inode_walker_entry *i; @@ -2169,14 +2177,19 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; - if (!i) + if (!i || i->whiteout) goto out; if (dir->first_this_inode) *hash_info = bch2_hash_info_init(c, &i->inode); dir->first_this_inode = false; - ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k); +#ifdef CONFIG_UNICODE + hash_info->cf_encoding = bch2_inode_casefold(c, &i->inode) ? c->cf_encoding : NULL; +#endif + + ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, + iter, k, need_second_pass); if (ret < 0) goto err; if (ret) { @@ -2197,31 +2210,34 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - struct qstr name = bch2_dirent_get_name(d); - u32 subvol = d.v->d_type == DT_SUBVOL - ? d.v->d_parent_subvol - : 0; + subvol_inum dir_inum = { .subvol = d.v->d_type == DT_SUBVOL + ? le32_to_cpu(d.v->d_parent_subvol) + : 0, + }; u64 target = d.v->d_type == DT_SUBVOL - ? d.v->d_child_subvol - : d.v->d_inum; - u64 dir_offset; + ? le32_to_cpu(d.v->d_child_subvol) + : le64_to_cpu(d.v->d_inum); + struct qstr name = bch2_dirent_get_name(d); - ret = bch2_hash_delete_at(trans, + struct bkey_i_dirent *new_d = + bch2_dirent_create_key(trans, hash_info, dir_inum, + d.v->d_type, &name, NULL, target); + ret = PTR_ERR_OR_ZERO(new_d); + if (ret) + goto out; + + new_d->k.p.inode = d.k->p.inode; + new_d->k.p.snapshot = d.k->p.snapshot; + + struct btree_iter dup_iter = {}; + ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, hash_info, iter, BTREE_UPDATE_internal_snapshot_node) ?: - bch2_dirent_create_snapshot(trans, subvol, - d.k->p.inode, d.k->p.snapshot, - hash_info, - d.v->d_type, - &name, - target, - &dir_offset, - BTREE_ITER_with_updates| - BTREE_UPDATE_internal_snapshot_node| - STR_HASH_must_create) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - - /* might need another check_dirents pass */ + bch2_str_hash_repair_key(trans, s, + &bch2_dirent_hash_desc, hash_info, + iter, bkey_i_to_s_c(&new_d->k_i), + &dup_iter, bkey_s_c_null, + need_second_pass); goto out; } @@ -2289,7 +2305,6 @@ out: err: fsck_err: printbuf_exit(&buf); - bch_err_fn(c, ret); return ret; } @@ -2303,16 +2318,31 @@ int bch2_check_dirents(struct bch_fs *c) struct inode_walker target = inode_walker_init(); struct snapshots_seen s; struct bch_hash_info hash_info; + bool need_second_pass = false, did_second_pass = false; + int ret; snapshots_seen_init(&s); - - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_dirents, +again: + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?: + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s, + &need_second_pass)) ?: check_subdir_count_notnested(trans, &dir)); + if (!ret && need_second_pass && !did_second_pass) { + bch_info(c, "check_dirents requires second pass"); + swap(did_second_pass, need_second_pass); + goto again; + } + + if (!ret && need_second_pass) { + bch_err(c, "dirents not repairing"); + ret = -EINVAL; + } + snapshots_seen_exit(&s); inode_walker_exit(&dir); inode_walker_exit(&target); @@ -2326,16 +2356,14 @@ static int check_xattr(struct 
btree_trans *trans, struct btree_iter *iter, struct inode_walker *inode) { struct bch_fs *c = trans->c; - struct inode_walker_entry *i; - int ret; - ret = bch2_check_key_has_snapshot(trans, iter, k); + int ret = bch2_check_key_has_snapshot(trans, iter, k); if (ret < 0) return ret; if (ret) return 0; - i = walk_inode(trans, inode, k); + struct inode_walker_entry *i = walk_inode(trans, inode, k); ret = PTR_ERR_OR_ZERO(i); if (ret) return ret; @@ -2344,16 +2372,16 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, if (ret) return ret; - if (!i) + if (!i || i->whiteout) return 0; if (inode->first_this_inode) *hash_info = bch2_hash_info_init(c, &i->inode); inode->first_this_inode = false; - ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k); - bch_err_fn(c, ret); - return ret; + bool need_second_pass = false; + return bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, + iter, k, &need_second_pass); } /* @@ -2413,7 +2441,8 @@ static int check_root_trans(struct btree_trans *trans) goto err; } - ret = lookup_inode(trans, BCACHEFS_ROOT_INO, snapshot, &root_inode); + ret = bch2_inode_find_by_inum_snapshot(trans, BCACHEFS_ROOT_INO, snapshot, + &root_inode, 0); if (ret && !bch2_err_matches(ret, ENOENT)) return ret; @@ -2445,8 +2474,6 @@ int bch2_check_root(struct bch_fs *c) return ret; } -typedef DARRAY(u32) darray_u32; - static bool darray_u32_has(darray_u32 *d, u32 v) { darray_for_each(*d, i) @@ -2483,7 +2510,14 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, u32 parent = le32_to_cpu(s.v->fs_path_parent); if (darray_u32_has(&subvol_path, parent)) { - if (fsck_err(trans, subvol_loop, "subvolume loop")) + printbuf_reset(&buf); + prt_printf(&buf, "subvolume loop:\n"); + + darray_for_each_reverse(subvol_path, i) + prt_printf(&buf, "%u ", *i); + prt_printf(&buf, "%u", parent); + + if (fsck_err(trans, subvol_loop, "%s", buf.buf)) ret = reattach_subvol(trans, s); break; } @@ -2499,7 +2533,8 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, if (fsck_err_on(k.k->type != KEY_TYPE_subvolume, trans, subvol_unreachable, "unreachable subvolume %s", - (bch2_bkey_val_to_text(&buf, c, s.s_c), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = reattach_subvol(trans, s); break; @@ -2655,14 +2690,13 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) redo_bi_depth = true; if (path_is_dup(&path, inode.bi_inum, snapshot)) { - /* XXX print path */ - bch_err(c, "directory structure loop"); - - darray_for_each(path, i) - pr_err("%llu:%u", i->inum, i->snapshot); - pr_err("%llu:%u", inode.bi_inum, snapshot); + printbuf_reset(&buf); + prt_printf(&buf, "directory structure loop:\n"); + darray_for_each_reverse(path, i) + prt_printf(&buf, "%llu:%u ", i->inum, i->snapshot); + prt_printf(&buf, "%llu:%u", inode.bi_inum, snapshot); - if (fsck_err(trans, dir_loop, "directory structure loop")) { + if (fsck_err(trans, dir_loop, "%s", buf.buf)) { ret = remove_backpointer(trans, &inode); bch_err_msg(c, ret, "removing dirent"); if (ret) @@ -2736,7 +2770,7 @@ static int add_nlink(struct bch_fs *c, struct nlink_table *t, if (!d) { bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", new_size); - return -BCH_ERR_ENOMEM_fsck_add_nlink; + return bch_err_throw(c, ENOMEM_fsck_add_nlink); } if (t->d) @@ -3061,7 +3095,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) { struct 
bch_ioctl_fsck_offline arg; struct fsck_thread *thr = NULL; - darray_str(devs) = {}; + darray_const_str devs = {}; long ret = 0; if (copy_from_user(&arg, user_arg, sizeof(arg))) @@ -3119,7 +3153,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops); - thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts); + thr->c = bch2_fs_open(&devs, &thr->opts); if (!IS_ERR(thr->c) && thr->c->opts.errors == BCH_ON_ERROR_panic) @@ -3156,19 +3190,18 @@ static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) c->opts.fix_errors = FSCK_FIX_ask; c->opts.fsck = true; - set_bit(BCH_FS_fsck_running, &c->flags); + set_bit(BCH_FS_in_fsck, &c->flags); - c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; - int ret = bch2_run_online_recovery_passes(c); + int ret = bch2_run_online_recovery_passes(c, ~0ULL); - clear_bit(BCH_FS_fsck_running, &c->flags); + clear_bit(BCH_FS_in_fsck, &c->flags); bch_err_fn(c, ret); c->stdio = NULL; c->stdio_filter = NULL; c->opts.fix_errors = old_fix_errors; - up(&c->online_fsck_mutex); + up(&c->recovery.run_lock); bch2_ro_ref_put(c); return ret; } @@ -3192,7 +3225,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) if (!bch2_ro_ref_tryget(c)) return -EROFS; - if (down_trylock(&c->online_fsck_mutex)) { + if (down_trylock(&c->recovery.run_lock)) { bch2_ro_ref_put(c); return -EAGAIN; } @@ -3224,7 +3257,7 @@ err: bch_err_fn(c, ret); if (thr) bch2_fsck_thread_exit(&thr->thr); - up(&c->online_fsck_mutex); + up(&c->recovery.run_lock); bch2_ro_ref_put(c); } return ret; diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index 574948278cd4..e5fe7cf7b251 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -4,6 +4,12 @@ #include "str_hash.h" +/* recoverds snapshot IDs of overwrites at @pos */ +struct snapshots_seen { + struct bpos pos; + snapshot_id_list ids; +}; + int bch2_fsck_update_backpointers(struct btree_trans *, struct snapshots_seen *, const struct bch_hash_desc, diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 490b85841de9..53e5dc1f6ac1 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -38,6 +38,7 @@ static const char * const bch2_inode_flag_strs[] = { #undef x static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos); +static int may_delete_deleted_inum(struct btree_trans *, subvol_inum); static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; @@ -241,6 +242,7 @@ static int bch2_inode_unpack_v3(struct bkey_s_c k, u64 v[2]; unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_snapshot = inode.k->p.snapshot; unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); unpacked->bi_hash_seed = inode.v->bi_hash_seed; unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); @@ -285,13 +287,12 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, { memset(unpacked, 0, sizeof(*unpacked)); - unpacked->bi_snapshot = k.k->p.snapshot; - switch (k.k->type) { case KEY_TYPE_inode: { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_snapshot = inode.k->p.snapshot; unpacked->bi_journal_seq= 0; unpacked->bi_hash_seed = inode.v->bi_hash_seed; unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); @@ -310,6 +311,7 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_snapshot = inode.k->p.snapshot; 
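Stepping back to the fsck.c hunks above: the per-filesystem online_fsck_mutex and the BCH_FS_fsck_running flag are replaced by c->recovery.run_lock and BCH_FS_in_fsck, and online fsck now requests all recovery passes explicitly via bch2_run_online_recovery_passes(c, ~0ULL). Stitched together from bch2_ioctl_fsck_online() and bch2_fsck_online_thread_fn() as shown above (stdio plumbing, option save/restore, and most error handling trimmed), the gating sequence looks roughly like this:

	static int example_online_fsck(struct bch_fs *c)
	{
		int ret;

		if (!bch2_ro_ref_tryget(c))
			return -EROFS;

		if (down_trylock(&c->recovery.run_lock)) {
			/* recovery or another fsck instance already owns the lock */
			bch2_ro_ref_put(c);
			return -EAGAIN;
		}

		set_bit(BCH_FS_in_fsck, &c->flags);
		ret = bch2_run_online_recovery_passes(c, ~0ULL);	/* run all passes */
		clear_bit(BCH_FS_in_fsck, &c->flags);

		up(&c->recovery.run_lock);
		bch2_ro_ref_put(c);
		return ret;
	}
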
unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); unpacked->bi_hash_seed = inode.v->bi_hash_seed; unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); @@ -327,8 +329,6 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, int bch2_inode_unpack(struct bkey_s_c k, struct bch_inode_unpacked *unpacked) { - unpacked->bi_snapshot = k.k->p.snapshot; - return likely(k.k->type == KEY_TYPE_inode_v3) ? bch2_inode_unpack_v3(k, unpacked) : bch2_inode_unpack_slowpath(k, unpacked); @@ -368,6 +368,82 @@ err: return ret; } +int bch2_inode_find_by_inum_snapshot(struct btree_trans *trans, + u64 inode_nr, u32 snapshot, + struct bch_inode_unpacked *inode, + unsigned flags) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, inode_nr, snapshot), flags); + int ret = bkey_err(k); + if (ret) + goto err; + + ret = bkey_is_inode(k.k) + ? bch2_inode_unpack(k, inode) + : -BCH_ERR_ENOENT_inode; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans, + subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + int ret; + + ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0); + if (!ret) + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_inode_find_by_inum_trans(struct btree_trans *trans, + subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + int ret; + + ret = bch2_inode_peek(trans, &iter, inode, inum, 0); + if (!ret) + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode)); +} + +int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, + struct bch_inode_unpacked *root) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, + SPOS(0, inum, U32_MAX), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inum) + break; + if (bkey_is_inode(k.k)) { + ret = bch2_inode_unpack(k, root); + goto out; + } + } + /* We're only called when we know we have an inode for @inum */ + BUG_ON(!ret); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + int bch2_inode_write_flags(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, @@ -833,7 +909,8 @@ void bch2_inode_init_early(struct bch_fs *c, get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed)); } -void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, +void bch2_inode_init_late(struct bch_fs *c, + struct bch_inode_unpacked *inode_u, u64 now, uid_t uid, gid_t gid, umode_t mode, dev_t rdev, struct bch_inode_unpacked *parent) { @@ -857,6 +934,12 @@ void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, BCH_INODE_OPTS() #undef x } + + if (!S_ISDIR(mode)) + inode_u->bi_casefold = 0; + + if (bch2_inode_casefold(c, inode_u)) + inode_u->bi_flags |= BCH_INODE_has_case_insensitive; } void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, @@ -864,7 +947,7 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, struct bch_inode_unpacked *parent) { bch2_inode_init_early(c, inode_u); - bch2_inode_init_late(inode_u, bch2_current_time(c), + bch2_inode_init_late(c, inode_u, bch2_current_time(c), uid, gid, mode, rdev, parent); } @@ -959,7 +1042,7 @@ 
again: goto found_slot; if (!ret && start == min) - ret = -BCH_ERR_ENOSPC_inode_create; + ret = bch_err_throw(trans->c, ENOSPC_inode_create); if (ret) { bch2_trans_iter_exit(trans, iter); @@ -1048,19 +1131,23 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) u32 snapshot; int ret; + ret = lockrestart_do(trans, may_delete_deleted_inum(trans, inum)); + if (ret) + goto err2; + /* * If this was a directory, there shouldn't be any real dirents left - * but there could be whiteouts (from hash collisions) that we should * delete: * - * XXX: the dirent could ideally would delete whiteouts when they're no + * XXX: the dirent code ideally would delete whiteouts when they're no * longer needed */ ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?: bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?: bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents); if (ret) - goto err; + goto err2; retry: bch2_trans_begin(trans); @@ -1079,7 +1166,7 @@ retry: bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum.inum, snapshot); - ret = -BCH_ERR_ENOENT_inode; + ret = bch_err_throw(c, ENOENT_inode); goto err; } @@ -1100,38 +1187,6 @@ err2: return ret; } -int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans, - subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - int ret; - - ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0); - if (!ret) - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_inode_find_by_inum_trans(struct btree_trans *trans, - subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - int ret; - - ret = bch2_inode_peek(trans, &iter, inode, inum, 0); - if (!ret) - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode)); -} - int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) { if (bi->bi_flags & BCH_INODE_unlinked) @@ -1233,7 +1288,7 @@ int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum, bi->bi_casefold = v + 1; bi->bi_fields_set |= BIT(Inode_opt_casefold); - return 0; + return bch2_maybe_propagate_has_case_insensitive(trans, inum, bi); #else bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE"); return -EOPNOTSUPP; @@ -1278,7 +1333,7 @@ retry: bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum, snapshot); - ret = -BCH_ERR_ENOENT_inode; + ret = bch_err_throw(c, ENOENT_inode); goto err; } @@ -1342,10 +1397,8 @@ int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot)); } -static int may_delete_deleted_inode(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos pos, - bool *need_another_pass) +static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos, + bool from_deleted_inodes) { struct bch_fs *c = trans->c; struct btree_iter inode_iter; @@ -1359,12 +1412,14 @@ static int may_delete_deleted_inode(struct btree_trans *trans, if (ret) return ret; - ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; - if (fsck_err_on(!bkey_is_inode(k.k), + ret = bkey_is_inode(k.k) ? 
0 : bch_err_throw(c, ENOENT_inode); + if (fsck_err_on(from_deleted_inodes && ret, trans, deleted_inode_missing, "nonexistent inode %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; + if (ret) + goto out; ret = bch2_inode_unpack(k, &inode); if (ret) @@ -1372,7 +1427,8 @@ static int may_delete_deleted_inode(struct btree_trans *trans, if (S_ISDIR(inode.bi_mode)) { ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot); - if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY), + if (fsck_err_on(from_deleted_inodes && + bch2_err_matches(ret, ENOTEMPTY), trans, deleted_inode_is_dir, "non empty directory %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) @@ -1381,17 +1437,25 @@ static int may_delete_deleted_inode(struct btree_trans *trans, goto out; } - if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), + ret = inode.bi_flags & BCH_INODE_unlinked ? 0 : bch_err_throw(c, inode_not_unlinked); + if (fsck_err_on(from_deleted_inodes && ret, trans, deleted_inode_not_unlinked, "non-deleted inode %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; + if (ret) + goto out; - if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot, + ret = !(inode.bi_flags & BCH_INODE_has_child_snapshot) + ? 0 : bch_err_throw(c, inode_has_child_snapshot); + + if (fsck_err_on(from_deleted_inodes && ret, trans, deleted_inode_has_child_snapshots, "inode with child snapshots %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; + if (ret) + goto out; ret = bch2_inode_has_child_snapshots(trans, k.k->p); if (ret < 0) @@ -1408,19 +1472,28 @@ static int may_delete_deleted_inode(struct btree_trans *trans, if (ret) goto out; } + + if (!from_deleted_inodes) { + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + bch_err_throw(c, inode_has_child_snapshot); + goto out; + } + goto delete; } - if (test_bit(BCH_FS_clean_recovery, &c->flags) && - !fsck_err(trans, deleted_inode_but_clean, - "filesystem marked as clean but have deleted inode %llu:%u", - pos.offset, pos.snapshot)) { - ret = 0; - goto out; - } + if (from_deleted_inodes) { + if (test_bit(BCH_FS_clean_recovery, &c->flags) && + !fsck_err(trans, deleted_inode_but_clean, + "filesystem marked as clean but have deleted inode %llu:%u", + pos.offset, pos.snapshot)) { + ret = 0; + goto out; + } - ret = 1; + ret = 1; + } out: fsck_err: bch2_trans_iter_exit(trans, &inode_iter); @@ -1431,12 +1504,19 @@ delete: goto out; } +static int may_delete_deleted_inum(struct btree_trans *trans, subvol_inum inum) +{ + u32 snapshot; + + return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: + may_delete_deleted_inode(trans, SPOS(0, inum.inum, snapshot), false); +} + int bch2_delete_dead_inodes(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - bool need_another_pass; int ret; -again: + /* * if we ran check_inodes() unlinked inodes will have already been * cleaned up but the write buffer will be out of sync; therefore we @@ -1446,8 +1526,6 @@ again: if (ret) goto err; - need_another_pass = false; - /* * Weird transaction restart handling here because on successful delete, * bch2_inode_rm_snapshot() will return a nested transaction restart, @@ -1457,7 +1535,7 @@ again: ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass); + ret = may_delete_deleted_inode(trans, k.k->p, 
true); if (ret > 0) { bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot); @@ -1478,10 +1556,8 @@ again: ret; })); - - if (!ret && need_another_pass) - goto again; err: bch2_trans_put(trans); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 5cfba9e98966..82cec2836cbd 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -134,10 +134,21 @@ static inline int bch2_inode_peek(struct btree_trans *trans, subvol_inum inum, unsigned flags) { return __bch2_inode_peek(trans, iter, inode, inum, flags, true); - int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags); - return ret; } +int bch2_inode_find_by_inum_snapshot(struct btree_trans *, u64, u32, + struct bch_inode_unpacked *, unsigned); +int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *, + subvol_inum, + struct bch_inode_unpacked *); +int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *); +int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, + struct bch_inode_unpacked *); + +int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, + struct bch_inode_unpacked *root); + int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags); @@ -153,7 +164,7 @@ int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *); void bch2_inode_init_early(struct bch_fs *, struct bch_inode_unpacked *); -void bch2_inode_init_late(struct bch_inode_unpacked *, u64, +void bch2_inode_init_late(struct bch_fs *, struct bch_inode_unpacked *, u64, uid_t, gid_t, umode_t, dev_t, struct bch_inode_unpacked *); void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, @@ -165,14 +176,6 @@ int bch2_inode_create(struct btree_trans *, struct btree_iter *, int bch2_inode_rm(struct bch_fs *, subvol_inum); -int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *, - subvol_inum, - struct bch_inode_unpacked *); -int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *); -int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, - struct bch_inode_unpacked *); - #define inode_opt_get(_c, _inode, _name) \ ((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name) @@ -245,7 +248,7 @@ static inline unsigned bkey_inode_mode(struct bkey_s_c k) static inline bool bch2_inode_casefold(struct bch_fs *c, const struct bch_inode_unpacked *bi) { - /* inode apts are stored with a +1 bias: 0 means "unset, use fs opt" */ + /* inode opts are stored with a +1 bias: 0 means "unset, use fs opt" */ return bi->bi_casefold ? 
bi->bi_casefold - 1 : c->opts.casefold; @@ -280,15 +283,6 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, int bch2_inode_nlink_inc(struct bch_inode_unpacked *); void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); -static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *inode) -{ - bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset; - - return S_ISDIR(inode->bi_mode) || - inode->bi_subvol || - (!inode->bi_nlink && inode_has_bp); -} - struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, struct bch_inode_unpacked *); @@ -306,6 +300,14 @@ bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode return io_opts_to_rebalance_opts(c, &io_opts); } +#define BCACHEFS_ROOT_SUBVOL_INUM \ + ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) + +static inline bool subvol_inum_eq(subvol_inum a, subvol_inum b) +{ + return a.subvol == b.subvol && a.inum == b.inum; +} + int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); int bch2_delete_dead_inodes(struct bch_fs *); diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h index 87e193e8ed25..1f00938b1bdc 100644 --- a/fs/bcachefs/inode_format.h +++ b/fs/bcachefs/inode_format.h @@ -129,6 +129,10 @@ enum inode_opt_id { Inode_opt_nr, }; +/* + * BCH_INODE_has_case_insensitive is set if any descendent is case insensitive - + * for overlayfs + */ #define BCH_INODE_FLAGS() \ x(sync, 0) \ x(immutable, 1) \ @@ -139,7 +143,8 @@ enum inode_opt_id { x(i_sectors_dirty, 6) \ x(unlinked, 7) \ x(backptr_untrusted, 8) \ - x(has_child_snapshot, 9) + x(has_child_snapshot, 9) \ + x(has_case_insensitive, 10) /* bits 20+ reserved for packed fields below: */ diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index cc07729a4b62..bf72b1d2e2cb 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -91,7 +91,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, opts.data_replicas, BCH_WATERMARK_normal, 0, &cl, &wp); if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) - ret = -BCH_ERR_transaction_restart_nested; + ret = bch_err_throw(c, transaction_restart_nested); if (ret) goto err; diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index def4a26a3b45..a77779afad01 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -9,6 +9,7 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "async_objs.h" #include "btree_update.h" #include "buckets.h" #include "checksum.h" @@ -17,6 +18,7 @@ #include "data_update.h" #include "disk_groups.h" #include "ec.h" +#include "enumerated_ref.h" #include "error.h" #include "io_read.h" #include "io_misc.h" @@ -25,6 +27,7 @@ #include "subvolume.h" #include "trace.h" +#include <linux/moduleparam.h> #include <linux/random.h> #include <linux/sched/mm.h> @@ -34,6 +37,12 @@ module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644); MODULE_PARM_DESC(read_corrupt_ratio, ""); #endif +static bool bch2_poison_extents_on_checksum_error; +module_param_named(poison_extents_on_checksum_error, + bch2_poison_extents_on_checksum_error, bool, 0644); +MODULE_PARM_DESC(poison_extents_on_checksum_error, + "Extents with checksum errors are marked as poisoned - unsafe without read fua support"); + #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT static bool bch2_target_congested(struct bch_fs *c, u16 target) @@ -47,7 +56,7 @@ static bool bch2_target_congested(struct bch_fs *c, 
u16 target) if (!target) return false; - rcu_read_lock(); + guard(rcu)(); devs = bch2_target_to_mask(c, target) ?: &c->rw_devs[BCH_DATA_user]; @@ -64,7 +73,6 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target) total += max(congested, 0LL); nr++; } - rcu_read_unlock(); return get_random_u32_below(nr * CONGESTED_MAX) < total; } @@ -80,18 +88,6 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target) /* Cache promotion on read */ -struct promote_op { - struct rcu_head rcu; - u64 start_time; - - struct rhash_head hash; - struct bpos pos; - - struct work_struct work; - struct data_update write; - struct bio_vec bi_inline_vecs[]; /* must be last */ -}; - static const struct rhashtable_params bch_promote_params = { .head_offset = offsetof(struct promote_op, hash), .key_offset = offsetof(struct promote_op, pos), @@ -141,21 +137,21 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, BUG_ON(!opts.promote_target); if (!(flags & BCH_READ_may_promote)) - return -BCH_ERR_nopromote_may_not; + return bch_err_throw(c, nopromote_may_not); if (bch2_bkey_has_target(c, k, opts.promote_target)) - return -BCH_ERR_nopromote_already_promoted; + return bch_err_throw(c, nopromote_already_promoted); if (bkey_extent_is_unwritten(k)) - return -BCH_ERR_nopromote_unwritten; + return bch_err_throw(c, nopromote_unwritten); if (bch2_target_congested(c, opts.promote_target)) - return -BCH_ERR_nopromote_congested; + return bch_err_throw(c, nopromote_congested); } if (rhashtable_lookup_fast(&c->promote_table, &pos, bch_promote_params)) - return -BCH_ERR_nopromote_in_flight; + return bch_err_throw(c, nopromote_in_flight); return 0; } @@ -169,9 +165,11 @@ static noinline void promote_free(struct bch_read_bio *rbio) bch_promote_params); BUG_ON(ret); + async_object_list_del(c, promote, op->list_idx); + bch2_data_update_exit(&op->write); - bch2_write_ref_put(c, BCH_WRITE_REF_promote); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote); kfree_rcu(op, rcu); } @@ -236,12 +234,12 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, return NULL; } - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_promote)) return ERR_PTR(-BCH_ERR_nopromote_no_writes); struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL); if (!op) { - ret = -BCH_ERR_nopromote_enomem; + ret = bch_err_throw(c, nopromote_enomem); goto err_put; } @@ -250,10 +248,14 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, bch_promote_params)) { - ret = -BCH_ERR_nopromote_in_flight; + ret = bch_err_throw(c, nopromote_in_flight); goto err; } + ret = async_object_list_add(c, promote, op, &op->list_idx); + if (ret < 0) + goto err_remove_hash; + ret = bch2_data_update_init(trans, NULL, NULL, &op->write, writepoint_hashed((unsigned long) current), &orig->opts, @@ -265,7 +267,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, * -BCH_ERR_ENOSPC_disk_reservation: */ if (ret) - goto err_remove_hash; + goto err_remove_list; rbio_init_fragment(&op->write.rbio.bio, orig); op->write.rbio.bounce = true; @@ -273,6 +275,8 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, op->write.op.end_io = promote_done; return &op->write.rbio; +err_remove_list: + async_object_list_del(c, promote, op->list_idx); err_remove_hash: BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, bch_promote_params)); @@ -281,7 +285,7 @@ err: /* We may 
have added to the rhashtable and thus need rcu freeing: */ kfree_rcu(op, rcu); err_put: - bch2_write_ref_put(c, BCH_WRITE_REF_promote); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote); return ERR_PTR(ret); } @@ -296,6 +300,13 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans, bool *read_full, struct bch_io_failures *failed) { + /* + * We're in the retry path, but we don't know what to repair yet, and we + * don't want to do a promote here: + */ + if (failed && !failed->nr) + return NULL; + struct bch_fs *c = trans->c; /* * if failed != NULL we're not actually doing a promote, we're @@ -338,6 +349,18 @@ nopromote: return NULL; } +void bch2_promote_op_to_text(struct printbuf *out, struct promote_op *op) +{ + if (!op->write.read_done) { + prt_printf(out, "parent read: %px\n", op->write.rbio.parent); + printbuf_indent_add(out, 2); + bch2_read_bio_to_text(out, op->write.rbio.parent); + printbuf_indent_sub(out, 2); + } + + bch2_data_update_to_text(out, &op->write); +} + /* Read */ static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, @@ -394,7 +417,7 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) if (rbio->have_ioref) { struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read); } if (rbio->split) { @@ -406,6 +429,8 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) else promote_free(rbio); } else { + async_object_list_del(rbio->c, rbio, rbio->list_idx); + if (rbio->bounce) bch2_bio_free_pages_pool(rbio->c, &rbio->bio); @@ -430,6 +455,74 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) bio_endio(&rbio->bio); } +static void get_rbio_extent(struct btree_trans *trans, + struct bch_read_bio *rbio, + struct bkey_buf *sk) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = lockrestart_do(trans, + bkey_err(k = bch2_bkey_get_iter(trans, &iter, + rbio->data_btree, rbio->data_pos, 0))); + if (ret) + return; + + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr(ptrs, ptr) + if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) { + bch2_bkey_buf_reassemble(sk, trans->c, k); + break; + } + + bch2_trans_iter_exit(trans, &iter); +} + +static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio, + enum btree_id btree, struct bkey_s_c read_k) +{ + if (!bch2_poison_extents_on_checksum_error) + return 0; + + struct bch_fs *c = trans->c; + + struct data_update *u = rbio_data_update(rbio); + if (u) + read_k = bkey_i_to_s_c(u->k.k); + + u64 flags = bch2_bkey_extent_flags(read_k); + if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return 0; + + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(read_k.k), + BTREE_ITER_intent); + int ret = bkey_err(k); + if (ret) + return ret; + + if (!bkey_and_val_eq(k, read_k)) + goto out; + + struct bkey_i *new = bch2_trans_kmalloc(trans, + bkey_bytes(k.k) + sizeof(struct bch_extent_flags)); + ret = PTR_ERR_OR_ZERO(new) ?: + (bkey_reassemble(new, k), 0) ?: + bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?: + bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + + /* + * Propagate key change back to data update path, in particular so it + * knows the extent has been poisoned and it's safe to change the + * checksum + */ + if (u && !ret) + 
bch2_bkey_buf_copy(&u->k, c, new); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, @@ -451,7 +544,7 @@ retry: if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { /* extent we wanted to read no longer exists: */ - rbio->ret = -BCH_ERR_data_read_key_overwritten; + rbio->ret = bch_err_throw(trans->c, data_read_key_overwritten); goto err; } @@ -463,7 +556,8 @@ retry: err: bch2_trans_iter_exit(trans, &iter); - if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + bch2_err_matches(ret, BCH_ERR_data_read_retry)) goto retry; if (ret) { @@ -487,15 +581,21 @@ static void bch2_rbio_retry(struct work_struct *work) .inum = rbio->read_pos.inode, }; struct bch_io_failures failed = { .nr = 0 }; - int orig_error = rbio->ret; struct btree_trans *trans = bch2_trans_get(c); + struct bkey_buf sk; + bch2_bkey_buf_init(&sk); + bkey_init(&sk.k->k); + trace_io_read_retry(&rbio->bio); this_cpu_add(c->counters[BCH_COUNTER_io_read_retry], bvec_iter_sectors(rbio->bvec_iter)); - if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) + get_rbio_extent(trans, rbio, &sk); + + if (!bkey_deleted(&sk.k->k) && + bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) bch2_mark_io_failure(&failed, &rbio->pick, rbio->ret == -BCH_ERR_data_read_retry_csum_err); @@ -516,15 +616,16 @@ static void bch2_rbio_retry(struct work_struct *work) int ret = rbio->data_update ? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags) - : __bch2_read(trans, rbio, iter, inum, &failed, flags); + : __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags); if (ret) { rbio->ret = ret; rbio->bio.bi_status = BLK_STS_IOERR; - } else if (orig_error != -BCH_ERR_data_read_retry_csum_err_maybe_userspace && - orig_error != -BCH_ERR_data_read_ptr_stale_race && - !failed.nr) { + } + + if (failed.nr || ret) { struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, &buf, @@ -532,13 +633,27 @@ static void bch2_rbio_retry(struct work_struct *work) read_pos.offset << 9)); if (rbio->data_update) prt_str(&buf, "(internal move) "); - prt_str(&buf, "successful retry"); - bch_err_ratelimited(c, "%s", buf.buf); + prt_str(&buf, "data read error, "); + if (!ret) + prt_str(&buf, "successful retry"); + else + prt_str(&buf, bch2_err_str(ret)); + prt_newline(&buf); + + if (!bkey_deleted(&sk.k->k)) { + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k)); + prt_newline(&buf); + } + + bch2_io_failures_to_text(&buf, c, &failed); + + bch2_print_str_ratelimited(c, KERN_ERR, buf.buf); printbuf_exit(&buf); } bch2_rbio_done(rbio); + bch2_bkey_buf_exit(&sk, c); bch2_trans_put(trans); } @@ -568,27 +683,6 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, } } -static void bch2_read_io_err(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bio *bio = &rbio->bio; - struct bch_fs *c = rbio->c; - struct bch_dev *ca = rbio->have_ioref ? 
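/*
 * Note: the poison-on-checksum-error behaviour introduced here
 * (maybe_poison_extent() plus the BCH_EXTENT_FLAG_poisoned check added to
 * __bch2_read_extent()) is opt-in via the new module parameter declared at
 * the top of io_read.c. Hypothetical ways to enable it, not part of this
 * patch:
 *
 *	modprobe bcachefs poison_extents_on_checksum_error=1
 *
 * or, since the parameter is declared with mode 0644, at runtime through
 * /sys/module/bcachefs/parameters/poison_extents_on_checksum_error.
 */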
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - struct printbuf buf = PRINTBUF; - - bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); - prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status)); - - if (ca) - bch_err_ratelimited(ca, "%s", buf.buf); - else - bch_err_ratelimited(c, "%s", buf.buf); - - printbuf_exit(&buf); - bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status); -} - static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, struct bch_read_bio *rbio) { @@ -652,31 +746,6 @@ static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) __bch2_rbio_narrow_crcs(trans, rbio)); } -static void bch2_read_csum_err(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bio *src = &rbio->bio; - struct bch_extent_crc_unpacked crc = rbio->pick.crc; - struct nonce nonce = extent_nonce(rbio->version, crc); - struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); - struct printbuf buf = PRINTBUF; - - bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); - prt_str(&buf, "data "); - bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); - - struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - if (ca) - bch_err_ratelimited(ca, "%s", buf.buf); - else - bch_err_ratelimited(c, "%s", buf.buf); - - bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR); - printbuf_exit(&buf); -} - static void bch2_read_decompress_err(struct work_struct *work) { struct bch_read_bio *rbio = @@ -837,7 +906,7 @@ out: memalloc_nofs_restore(nofs_flags); return; csum_err: - bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR); goto out; decompression_err: bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); @@ -863,7 +932,7 @@ static void bch2_read_endio(struct bio *bio) rbio->bio.bi_end_io = rbio->end_io; if (unlikely(bio->bi_status)) { - bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status); return; } @@ -963,6 +1032,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, bvec_iter_sectors(iter)); goto out_read_done; } + + if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) && + !orig->data_update) + return bch_err_throw(c, extent_poisoned); retry_pick: ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); @@ -971,6 +1044,16 @@ retry_pick: goto hole; if (unlikely(ret < 0)) { + if (ret == -BCH_ERR_data_read_csum_err) { + int ret2 = maybe_poison_extent(trans, orig, data_btree, k); + if (ret2) { + ret = ret2; + goto err; + } + + trace_and_count(c, io_read_fail_and_poison, &orig->bio); + } + struct printbuf buf = PRINTBUF; bch2_read_err_msg_trans(trans, &buf, orig, read_pos); prt_printf(&buf, "%s\n ", bch2_err_str(ret)); @@ -990,11 +1073,12 @@ retry_pick: bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); - ret = -BCH_ERR_data_read_no_encryption_key; + ret = bch_err_throw(c, data_read_no_encryption_key); goto err; } - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, + BCH_DEV_READ_REF_io_read); /* * Stale dirty pointers are treated as IO errors, but @failed isn't @@ -1008,7 +1092,7 @@ retry_pick: unlikely(dev_ptr_stale(ca, 
&pick.ptr))) { read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); bch2_mark_io_failure(failed, &pick, false); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read); goto retry_pick; } @@ -1041,8 +1125,9 @@ retry_pick: */ if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { if (ca) - percpu_ref_put(&ca->io_ref[READ]); - rbio->ret = -BCH_ERR_data_read_buffer_too_small; + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_io_read); + rbio->ret = bch_err_throw(c, data_read_buffer_too_small); goto out_read_done; } @@ -1138,6 +1223,8 @@ retry_pick: rbio->bio.bi_iter.bi_sector = pick.ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; + async_object_list_add(c, rbio, rbio, &rbio->list_idx); + if (rbio->bounce) trace_and_count(c, io_read_bounce, &rbio->bio); @@ -1171,14 +1258,6 @@ retry_pick: if (likely(!rbio->pick.do_ec_reconstruct)) { if (unlikely(!rbio->have_ioref)) { - struct printbuf buf = PRINTBUF; - bch2_read_err_msg_trans(trans, &buf, rbio, read_pos); - prt_printf(&buf, "no device to read from:\n "); - bch2_bkey_val_to_text(&buf, c, k); - - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_device_offline, BLK_STS_IOERR); @@ -1253,7 +1332,7 @@ hole: * have to signal that: */ if (u) - orig->ret = -BCH_ERR_data_read_key_overwritten; + orig->ret = bch_err_throw(c, data_read_key_overwritten); zero_fill_bio_iter(&orig->bio, iter); out_read_done: @@ -1265,12 +1344,15 @@ out_read_done: int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, subvol_inum inum, - struct bch_io_failures *failed, unsigned flags) + struct bch_io_failures *failed, + struct bkey_buf *prev_read, + unsigned flags) { struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_buf sk; struct bkey_s_c k; + enum btree_id data_btree; int ret; EBUG_ON(rbio->data_update); @@ -1281,7 +1363,7 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, BTREE_ITER_slots); while (1) { - enum btree_id data_btree = BTREE_ID_extents; + data_btree = BTREE_ID_extents; bch2_trans_begin(trans); @@ -1313,6 +1395,12 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, k = bkey_i_to_s_c(sk.k); + if (unlikely(flags & BCH_READ_in_retry)) { + if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k))) + failed->nr = 0; + bch2_bkey_buf_copy(prev_read, c, sk.k); + } + /* * With indirect extents, the amount of data to read is the min * of the original extent and the indirect extent: @@ -1347,8 +1435,6 @@ err: break; } - bch2_trans_iter_exit(trans, &iter); - if (unlikely(ret)) { if (ret != -BCH_ERR_extent_poisoned) { struct printbuf buf = PRINTBUF; @@ -1367,30 +1453,74 @@ err: bch2_rbio_done(rbio); } + bch2_trans_iter_exit(trans, &iter); bch2_bkey_buf_exit(&sk, c); return ret; } +static const char * const bch2_read_bio_flags[] = { +#define x(n) #n, + BCH_READ_FLAGS() +#undef x + NULL +}; + +void bch2_read_bio_to_text(struct printbuf *out, struct bch_read_bio *rbio) +{ + u64 now = local_clock(); + prt_printf(out, "start_time:\t%llu\n", rbio->start_time ? now - rbio->start_time : 0); + prt_printf(out, "submit_time:\t%llu\n", rbio->submit_time ? 
now - rbio->submit_time : 0); + + if (!rbio->split) + prt_printf(out, "end_io:\t%ps\n", rbio->end_io); + else + prt_printf(out, "parent:\t%px\n", rbio->parent); + + prt_printf(out, "bi_end_io:\t%ps\n", rbio->bio.bi_end_io); + + prt_printf(out, "promote:\t%u\n", rbio->promote); + prt_printf(out, "bounce:\t%u\n", rbio->bounce); + prt_printf(out, "split:\t%u\n", rbio->split); + prt_printf(out, "have_ioref:\t%u\n", rbio->have_ioref); + prt_printf(out, "narrow_crcs:\t%u\n", rbio->narrow_crcs); + prt_printf(out, "context:\t%u\n", rbio->context); + prt_printf(out, "ret:\t%s\n", bch2_err_str(rbio->ret)); + + prt_printf(out, "flags:\t"); + bch2_prt_bitflags(out, bch2_read_bio_flags, rbio->flags); + prt_newline(out); + + bch2_bio_to_text(out, &rbio->bio); +} + void bch2_fs_io_read_exit(struct bch_fs *c) { if (c->promote_table.tbl) rhashtable_destroy(&c->promote_table); bioset_exit(&c->bio_read_split); bioset_exit(&c->bio_read); + mempool_exit(&c->bio_bounce_pages); } int bch2_fs_io_read_init(struct bch_fs *c) { + if (mempool_init_page_pool(&c->bio_bounce_pages, + max_t(unsigned, + c->opts.btree_node_size, + c->opts.encoded_extent_max) / + PAGE_SIZE, 0)) + return bch_err_throw(c, ENOMEM_bio_bounce_pages_init); + if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_bio_read_init; + return bch_err_throw(c, ENOMEM_bio_read_init); if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_bio_read_split_init; + return bch_err_throw(c, ENOMEM_bio_read_split_init); if (rhashtable_init(&c->promote_table, &bch_promote_params)) - return -BCH_ERR_ENOMEM_promote_table_init; + return bch_err_throw(c, ENOMEM_promote_table_init); return 0; } diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index c78025d863e0..45c959018919 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -4,6 +4,7 @@ #include "bkey_buf.h" #include "btree_iter.h" +#include "extents_types.h" #include "reflink.h" struct bch_read_bio { @@ -48,6 +49,9 @@ struct bch_read_bio { u16 _state; }; s16 ret; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + unsigned list_idx; +#endif struct extent_ptr_decoded pick; @@ -87,6 +91,8 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans, return 0; *data_btree = BTREE_ID_reflink; + + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, offset_into_extent, @@ -98,10 +104,10 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans, if (bkey_deleted(k.k)) { bch2_trans_iter_exit(trans, &iter); - return -BCH_ERR_missing_indirect_extent; + return bch_err_throw(c, missing_indirect_extent); } - bch2_bkey_buf_reassemble(extent, trans->c, k); + bch2_bkey_buf_reassemble(extent, c, k); bch2_trans_iter_exit(trans, &iter); return 0; } @@ -144,7 +150,8 @@ static inline void bch2_read_extent(struct btree_trans *trans, } int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, - subvol_inum, struct bch_io_failures *, unsigned flags); + subvol_inum, + struct bch_io_failures *, struct bkey_buf *, unsigned flags); static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, subvol_inum inum) @@ -154,7 +161,7 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, rbio->subvol = inum.subvol; bch2_trans_run(c, - __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, + __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, NULL, 
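/*
 * The bch2_read_bio_flags[] table above is generated from BCH_READ_FLAGS()
 * with the usual x-macro trick: a single list expands into both the flag
 * enums and a matching name table for bch2_prt_bitflags() (compare the
 * BCH_WRITE_FLAGS() block moved into io_write_types.h further down).
 * A minimal self-contained sketch of the pattern - the EXAMPLE_* names are
 * illustrative only, not bcachefs code:
 */

#define EXAMPLE_FLAGS()		\
	x(retry_if_stale)	\
	x(may_promote)		\
	x(user_mapped)

enum __example_flags {
#define x(n)	__EXAMPLE_##n,
	EXAMPLE_FLAGS()
#undef x
};

enum example_flags {
#define x(n)	EXAMPLE_##n = BIT(__EXAMPLE_##n),
	EXAMPLE_FLAGS()
#undef x
};

static const char * const example_flag_names[] = {
#define x(n)	#n,
	EXAMPLE_FLAGS()
#undef x
	NULL
};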
BCH_READ_retry_if_stale| BCH_READ_may_promote| BCH_READ_user_mapped)); @@ -172,6 +179,9 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, rbio->split = true; rbio->parent = orig; rbio->opts = orig->opts; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + rbio->list_idx = 0; +#endif return rbio; } @@ -189,9 +199,16 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio, rbio->ret = 0; rbio->opts = opts; rbio->bio.bi_end_io = end_io; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + rbio->list_idx = 0; +#endif return rbio; } +struct promote_op; +void bch2_promote_op_to_text(struct printbuf *, struct promote_op *); +void bch2_read_bio_to_text(struct printbuf *, struct bch_read_bio *); + void bch2_fs_io_read_exit(struct bch_fs *); int bch2_fs_io_read_init(struct bch_fs *); diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index c1237da079ed..88b1eec8eff3 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -6,6 +6,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "async_objs.h" #include "bkey_buf.h" #include "bset.h" #include "btree_update.h" @@ -15,6 +16,7 @@ #include "compress.h" #include "debug.h" #include "ec.h" +#include "enumerated_ref.h" #include "error.h" #include "extent_update.h" #include "inode.h" @@ -263,11 +265,9 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, prt_printf(&buf, "inode %llu i_sectors underflow: %lli + %lli < 0", extent_iter->pos.inode, bi_sectors, i_sectors_delta); - bool repeat = false, print = false, suppress = false; - bch2_count_fsck_err(c, inode_i_sectors_underflow, buf.buf, - &repeat, &print, &suppress); + bool print = bch2_count_fsck_err(c, inode_i_sectors_underflow, &buf); if (print) - bch2_print_str(c, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); if (i_sectors_delta < 0) @@ -280,6 +280,12 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, inode_update_flags = 0; } + /* + * extents, dirents and xattrs updates require that an inode update also + * happens - to ensure that if a key exists in one of those btrees with + * a given snapshot ID an inode is also present - so we may have to skip + * the nojournal optimization: + */ if (inode->k.p.snapshot != iter.snapshot) { inode->k.p.snapshot = iter.snapshot; inode_update_flags = 0; @@ -397,8 +403,7 @@ static int bch2_write_index_default(struct bch_write_op *op) bkey_start_pos(&sk.k->k), BTREE_ITER_slots|BTREE_ITER_intent); - ret = bch2_bkey_set_needs_rebalance(c, &op->opts, sk.k) ?: - bch2_extent_update(trans, inum, &iter, sk.k, + ret = bch2_extent_update(trans, inum, &iter, sk.k, &op->res, op->new_i_size, &op->i_sectors_delta, op->flags & BCH_WRITE_check_enospc); @@ -462,9 +467,17 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); struct bch_write_bio *n; + unsigned ref_rw = type == BCH_DATA_btree ? READ : WRITE; + unsigned ref_idx = type == BCH_DATA_btree + ? BCH_DEV_READ_REF_btree_node_write + : BCH_DEV_WRITE_REF_io_write; BUG_ON(c->opts.nochanges); + const struct bch_extent_ptr *last = NULL; + bkey_for_each_ptr(ptrs, ptr) + last = ptr; + bkey_for_each_ptr(ptrs, ptr) { /* * XXX: btree writes should be using io_ref[WRITE], but we @@ -473,9 +486,9 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, */ struct bch_dev *ca = nocow ? bch2_dev_have_ref(c, ptr->dev) - : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? 
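/*
 * On the CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS additions above: the list_idx
 * field in struct bch_read_bio (and in struct bch_write_op, below) only
 * exists under that #ifdef, while the async_object_list_add()/
 * async_object_list_del() calls in io_read.c are unconditional - so they are
 * presumably macros that compile away when the option is off (async_objs.h
 * itself is not shown in this diff). The pattern, as used for read bios:
 *
 *	allocation:	async_object_list_add(c, rbio, rbio, &rbio->list_idx);
 *	teardown:	async_object_list_del(rbio->c, rbio, rbio->list_idx);
 *
 * paired with the new bch2_read_bio_to_text()/bch2_promote_op_to_text()
 * helpers for dumping objects still in flight.
 */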
READ : WRITE); + : bch2_dev_get_ioref(c, ptr->dev, ref_rw, ref_idx); - if (to_entry(ptr + 1) < ptrs.end) { + if (ptr != last) { n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set)); n->bio.bi_end_io = wbio->bio.bi_end_io; @@ -533,17 +546,19 @@ static void bch2_write_done(struct closure *cl) bch2_disk_reservation_put(c, &op->res); if (!(op->flags & BCH_WRITE_move)) - bch2_write_ref_put(c, BCH_WRITE_REF_write); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_write); bch2_keylist_free(&op->insert_keys, op->inline_keys); EBUG_ON(cl->parent); closure_debug_destroy(cl); + async_object_list_del(c, write_op, op->list_idx); if (op->end_io) op->end_io(op); } static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) { + struct bch_fs *c = op->c; struct keylist *keys = &op->insert_keys; struct bkey_i *src, *dst = keys->keys, *n; @@ -555,7 +570,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) test_bit(ptr->dev, op->failed.d)); if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) - return -BCH_ERR_data_write_io; + return bch_err_throw(c, data_write_io); } if (dst != src) @@ -748,7 +763,8 @@ static void bch2_write_endio(struct bio *bio) } if (wbio->have_ioref) - percpu_ref_put(&ca->io_ref[WRITE]); + enumerated_ref_put(&ca->io_ref[WRITE], + BCH_DEV_WRITE_REF_io_write); if (wbio->bounce) bch2_bio_free_pages_pool(c, bio); @@ -784,6 +800,9 @@ static void init_append_extent(struct bch_write_op *op, bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, op->flags & BCH_WRITE_cached); + if (!(op->flags & BCH_WRITE_move)) + bch2_bkey_set_needs_rebalance(op->c, &op->opts, &e->k_i); + bch2_keylist_push(&op->insert_keys); } @@ -958,7 +977,7 @@ csum_err: op->crc.csum_type < BCH_CSUM_NR ? __bch2_csum_types[op->crc.csum_type] : "(unknown)"); - return -BCH_ERR_data_write_csum; + return bch_err_throw(c, data_write_csum); } static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, @@ -1190,16 +1209,13 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op, e = bkey_s_c_to_extent(k); - rcu_read_lock(); + guard(rcu)(); extent_for_each_ptr_decode(e, p, entry) { - if (crc_is_encoded(p.crc) || p.has_ec) { - rcu_read_unlock(); + if (crc_is_encoded(p.crc) || p.has_ec) return false; - } replicas += bch2_extent_ptr_durability(c, &p); } - rcu_read_unlock(); return replicas >= op->opts.data_replicas; } @@ -1272,7 +1288,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) static void __bch2_nocow_write_done(struct bch_write_op *op) { if (unlikely(op->flags & BCH_WRITE_io_error)) { - op->error = -BCH_ERR_data_write_io; + op->error = bch_err_throw(op->c, data_write_io); } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten)) bch2_nocow_write_convert_unwritten(op); } @@ -1345,7 +1361,8 @@ retry: /* Get iorefs before dropping btree locks: */ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); + struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE, + BCH_DEV_WRITE_REF_io_write); if (unlikely(!ca)) goto err_get_ioref; @@ -1447,7 +1464,8 @@ err: return; err_get_ioref: darray_for_each(buckets, i) - percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE]); + enumerated_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE], + BCH_DEV_WRITE_REF_io_write); /* Fall back to COW path: */ goto out; @@ -1463,10 +1481,10 @@ err_bucket_stale: "pointer to invalid bucket in nocow path on device %llu\n %s", 
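/*
 * The reference-counting conversions in this file follow one pattern used
 * throughout the series: bch2_write_ref_tryget()/bch2_write_ref_put() become
 * enumerated_ref_tryget()/enumerated_ref_put() on c->writes, and the
 * percpu_ref get/put on ca->io_ref[rw] now take a BCH_DEV_READ_REF_* /
 * BCH_DEV_WRITE_REF_* index naming the call site (presumably so individual
 * holders can be accounted). Likewise, bare -BCH_ERR_foo returns become
 * bch_err_throw(c, foo), which takes the filesystem so the error can
 * apparently be counted per-fs. A minimal sketch of the resulting shape of a
 * write-path caller - example_start_write() is hypothetical; the identifiers
 * it uses are taken from this file's conversions:
 */
static int example_start_write(struct bch_fs *c)
{
	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_write))
		return bch_err_throw(c, erofs_no_writes);

	/* ... build and submit the write ... */

	enumerated_ref_put(&c->writes, BCH_WRITE_REF_write);
	return 0;
}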
stale_at->b.inode, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = -BCH_ERR_data_write_invalid_ptr; + ret = bch_err_throw(c, data_write_invalid_ptr); } else { /* We can retry this: */ - ret = -BCH_ERR_transaction_restart; + ret = bch_err_throw(c, transaction_restart); } printbuf_exit(&buf); @@ -1661,6 +1679,8 @@ CLOSURE_CALLBACK(bch2_write) BUG_ON(!op->write_point.v); BUG_ON(bkey_eq(op->pos, POS_MAX)); + async_object_list_add(c, write_op, op, &op->list_idx); + if (op->flags & BCH_WRITE_only_specified_devs) op->flags |= BCH_WRITE_alloc_nowait; @@ -1671,18 +1691,18 @@ CLOSURE_CALLBACK(bch2_write) if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { bch2_write_op_error(op, op->pos.offset, "misaligned write"); - op->error = -BCH_ERR_data_write_misaligned; + op->error = bch_err_throw(c, data_write_misaligned); goto err; } if (c->opts.nochanges) { - op->error = -BCH_ERR_erofs_no_writes; + op->error = bch_err_throw(c, erofs_no_writes); goto err; } if (!(op->flags & BCH_WRITE_move) && - !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { - op->error = -BCH_ERR_erofs_no_writes; + !enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_write)) { + op->error = bch_err_throw(c, erofs_no_writes); goto err; } @@ -1705,6 +1725,7 @@ err: bch2_disk_reservation_put(c, &op->res); closure_debug_destroy(&op->cl); + async_object_list_del(c, write_op, op->list_idx); if (op->end_io) op->end_io(op); } @@ -1738,13 +1759,13 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required); prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl)); + prt_printf(out, "ret\t%s\n", bch2_err_str(op->error)); printbuf_indent_sub(out, 2); } void bch2_fs_io_write_exit(struct bch_fs *c) { - mempool_exit(&c->bio_bounce_pages); bioset_exit(&c->replica_set); bioset_exit(&c->bio_write); } @@ -1753,14 +1774,7 @@ int bch2_fs_io_write_init(struct bch_fs *c) { if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) || bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0)) - return -BCH_ERR_ENOMEM_bio_write_init; - - if (mempool_init_page_pool(&c->bio_bounce_pages, - max_t(unsigned, - c->opts.btree_node_size, - c->opts.encoded_extent_max) / - PAGE_SIZE, 0)) - return -BCH_ERR_ENOMEM_bio_bounce_pages_init; + return bch_err_throw(c, ENOMEM_bio_write_init); return 0; } diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h index b8ab19a1e1da..2c0a8f35ee1f 100644 --- a/fs/bcachefs/io_write.h +++ b/fs/bcachefs/io_write.h @@ -17,34 +17,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, __printf(3, 4) void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...); -#define BCH_WRITE_FLAGS() \ - x(alloc_nowait) \ - x(cached) \ - x(data_encoded) \ - x(pages_stable) \ - x(pages_owned) \ - x(only_specified_devs) \ - x(wrote_data_inline) \ - x(check_enospc) \ - x(sync) \ - x(move) \ - x(in_worker) \ - x(submitted) \ - x(io_error) \ - x(convert_unwritten) - -enum __bch_write_flags { -#define x(f) __BCH_WRITE_##f, - BCH_WRITE_FLAGS() -#undef x -}; - -enum bch_write_flags { -#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), - BCH_WRITE_FLAGS() -#undef x -}; - static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { return op->watermark == BCH_WATERMARK_copygc diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h index 3ef6df9145ef..5da4eb8bb6f6 100644 --- a/fs/bcachefs/io_write_types.h +++ 
b/fs/bcachefs/io_write_types.h @@ -13,6 +13,34 @@ #include <linux/llist.h> #include <linux/workqueue.h> +#define BCH_WRITE_FLAGS() \ + x(alloc_nowait) \ + x(cached) \ + x(data_encoded) \ + x(pages_stable) \ + x(pages_owned) \ + x(only_specified_devs) \ + x(wrote_data_inline) \ + x(check_enospc) \ + x(sync) \ + x(move) \ + x(in_worker) \ + x(submitted) \ + x(io_error) \ + x(convert_unwritten) + +enum __bch_write_flags { +#define x(f) __BCH_WRITE_##f, + BCH_WRITE_FLAGS() +#undef x +}; + +enum bch_write_flags { +#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), + BCH_WRITE_FLAGS() +#undef x +}; + struct bch_write_bio { struct_group(wbio, struct bch_fs *c; @@ -43,6 +71,10 @@ struct bch_write_op { void (*end_io)(struct bch_write_op *); u64 start_time; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + unsigned list_idx; +#endif + unsigned written; /* sectors */ u16 flags; s16 error; /* dio write path expects it to hold -ERESTARTSYS... */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index bb45d3634194..dda802a656cf 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -12,6 +12,7 @@ #include "btree_update.h" #include "btree_write_buffer.h" #include "buckets.h" +#include "enumerated_ref.h" #include "error.h" #include "journal.h" #include "journal_io.h" @@ -173,7 +174,7 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) spin_unlock(&j->lock); prt_printf(&buf, bch2_fmt(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)"), bch2_err_str(error)); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_reset(&buf); bch2_journal_pins_to_text(&buf, j); @@ -331,16 +332,6 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq)); } -void bch2_journal_halt(struct journal *j) -{ - spin_lock(&j->lock); - __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true); - if (!j->err_seq) - j->err_seq = journal_cur_seq(j); - journal_wake(j); - spin_unlock(&j->lock); -} - void bch2_journal_halt_locked(struct journal *j) { lockdep_assert_held(&j->lock); @@ -351,6 +342,13 @@ void bch2_journal_halt_locked(struct journal *j) journal_wake(j); } +void bch2_journal_halt(struct journal *j) +{ + spin_lock(&j->lock); + bch2_journal_halt_locked(j); + spin_unlock(&j->lock); +} + static bool journal_entry_want_write(struct journal *j) { bool ret = !journal_entry_is_open(j) || @@ -399,7 +397,7 @@ static int journal_entry_open(struct journal *j) BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); if (j->blocked) - return -BCH_ERR_journal_blocked; + return bch_err_throw(c, journal_blocked); if (j->cur_entry_error) return j->cur_entry_error; @@ -409,23 +407,23 @@ static int journal_entry_open(struct journal *j) return ret; if (!fifo_free(&j->pin)) - return -BCH_ERR_journal_pin_full; + return bch_err_throw(c, journal_pin_full); if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) - return -BCH_ERR_journal_max_in_flight; + return bch_err_throw(c, journal_max_in_flight); if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR) - return -BCH_ERR_journal_max_open; + return bch_err_throw(c, journal_max_open); - if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) { + if (unlikely(journal_cur_seq(j) >= JOURNAL_SEQ_MAX)) { bch_err(c, "cannot start: journal seq overflow"); if (bch2_fs_emergency_read_only_locked(c)) bch_err(c, "fatal error - emergency read only"); - return -BCH_ERR_journal_shutdown; + return bch_err_throw(c, journal_shutdown); } if (!j->free_buf 
&& !buf->data) - return -BCH_ERR_journal_buf_enomem; /* will retry after write completion frees up a buf */ + return bch_err_throw(c, journal_buf_enomem); /* will retry after write completion frees up a buf */ BUG_ON(!j->cur_entry_sectors); @@ -449,7 +447,7 @@ static int journal_entry_open(struct journal *j) u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); if (u64s <= (ssize_t) j->early_journal_entries.nr) - return -BCH_ERR_journal_full; + return bch_err_throw(c, journal_full); if (fifo_empty(&j->pin) && j->reclaim_thread) wake_up_process(j->reclaim_thread); @@ -461,6 +459,14 @@ static int journal_entry_open(struct journal *j) atomic64_inc(&j->seq); journal_pin_list_init(fifo_push_ref(&j->pin), 1); + if (unlikely(bch2_journal_seq_is_blacklisted(c, journal_cur_seq(j), false))) { + bch_err(c, "attempting to open blacklisted journal seq %llu", + journal_cur_seq(j)); + if (bch2_fs_emergency_read_only_locked(c)) + bch_err(c, "fatal error - emergency read only"); + return bch_err_throw(c, journal_shutdown); + } + BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); @@ -591,16 +597,16 @@ retry: return ret; if (j->blocked) - return -BCH_ERR_journal_blocked; + return bch_err_throw(c, journal_blocked); if ((flags & BCH_WATERMARK_MASK) < j->watermark) { - ret = -BCH_ERR_journal_full; + ret = bch_err_throw(c, journal_full); can_discard = j->can_discard; goto out; } if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) { - ret = -BCH_ERR_journal_max_in_flight; + ret = bch_err_throw(c, journal_max_in_flight); goto out; } @@ -641,7 +647,7 @@ out: goto retry; if (journal_error_check_stuck(j, ret, flags)) - ret = -BCH_ERR_journal_stuck; + ret = bch_err_throw(c, journal_stuck); if (ret == -BCH_ERR_journal_max_in_flight && track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) && @@ -702,7 +708,8 @@ static unsigned max_dev_latency(struct bch_fs *c) { u64 nsecs = 0; - for_each_rw_member(c, ca) + guard(rcu)(); + for_each_rw_member_rcu(c, ca) nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration); return nsecs_to_jiffies(nsecs); @@ -746,7 +753,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, struct printbuf buf = PRINTBUF; bch2_journal_debug_to_text(&buf, j); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); prt_printf(&buf, bch2_fmt(c, "Journal stuck? 
Waited for 10 seconds, err %s"), bch2_err_str(ret)); printbuf_exit(&buf); @@ -805,6 +812,7 @@ out: int bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf; int ret = 0; @@ -820,7 +828,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, /* Recheck under lock: */ if (j->err_seq && seq >= j->err_seq) { - ret = -BCH_ERR_journal_flush_err; + ret = bch_err_throw(c, journal_flush_err); goto out; } @@ -990,11 +998,11 @@ int bch2_journal_meta(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal)) - return -BCH_ERR_erofs_no_writes; + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_journal)) + return bch_err_throw(c, erofs_no_writes); int ret = __bch2_journal_meta(j); - bch2_write_ref_put(c, BCH_WRITE_REF_journal); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_journal); return ret; } @@ -1124,7 +1132,7 @@ static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr, new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL); new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL); if (!bu || !ob || !new_buckets || !new_bucket_seq) { - ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets; + ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets); goto err_free; } @@ -1296,13 +1304,83 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, return ret; } +int bch2_dev_journal_bucket_delete(struct bch_dev *ca, u64 b) +{ + struct bch_fs *c = ca->fs; + struct journal *j = &c->journal; + struct journal_device *ja = &ca->journal; + + guard(mutex)(&c->sb_lock); + unsigned pos; + for (pos = 0; pos < ja->nr; pos++) + if (ja->buckets[pos] == b) + break; + + if (pos == ja->nr) { + bch_err(ca, "journal bucket %llu not found when deleting", b); + return -EINVAL; + } + + u64 *new_buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);; + if (!new_buckets) + return bch_err_throw(c, ENOMEM_set_nr_journal_buckets); + + memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); + memmove(&new_buckets[pos], + &new_buckets[pos + 1], + (ja->nr - 1 - pos) * sizeof(new_buckets[0])); + + int ret = bch2_journal_buckets_to_sb(c, ca, ja->buckets, ja->nr - 1) ?: + bch2_write_super(c); + if (ret) { + kfree(new_buckets); + return ret; + } + + scoped_guard(spinlock, &j->lock) { + if (pos < ja->discard_idx) + --ja->discard_idx; + if (pos < ja->dirty_idx_ondisk) + --ja->dirty_idx_ondisk; + if (pos < ja->dirty_idx) + --ja->dirty_idx; + if (pos < ja->cur_idx) + --ja->cur_idx; + + ja->nr--; + + memmove(&ja->buckets[pos], + &ja->buckets[pos + 1], + (ja->nr - pos) * sizeof(ja->buckets[0])); + + memmove(&ja->bucket_seq[pos], + &ja->bucket_seq[pos + 1], + (ja->nr - pos) * sizeof(ja->bucket_seq[0])); + + bch2_journal_space_available(j); + } + + kfree(new_buckets); + return 0; +} + int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) { + struct bch_fs *c = ca->fs; + + if (!(ca->mi.data_allowed & BIT(BCH_DATA_journal))) + return 0; + + if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { + bch_err(c, "cannot allocate journal, filesystem is an unresized image file"); + return bch_err_throw(c, erofs_filesystem_full); + } + unsigned nr; int ret; if (dynamic_fault("bcachefs:add:journal_alloc")) { - ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets; + ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets); goto err; } @@ -1318,7 +1396,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) min(1 << 13, (1 << 24) / 
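/*
 * bch2_dev_journal_bucket_delete() above (like the guard(rcu)() and
 * scoped_guard(spinlock, ...) conversions elsewhere in this diff) uses the
 * kernel's <linux/cleanup.h> scope-based lock guards instead of explicit
 * lock/unlock pairs. A minimal sketch of the semantics - example_update()
 * and its arguments are illustrative, not bcachefs code:
 */
static int example_update(struct mutex *lock, spinlock_t *list_lock, int *nr)
{
	guard(mutex)(lock);		/* mutex_unlock(lock) runs on every return path */

	if (*nr <= 0)
		return -EINVAL;		/* early return still releases the mutex */

	scoped_guard(spinlock, list_lock) {
		/* spin_lock()/spin_unlock() are bound to this block */
		(*nr)--;
	}

	return 0;
}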
ca->mi.bucket_size)); - ret = bch2_set_nr_journal_buckets_loop(ca->fs, ca, nr, new_fs); + ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, new_fs); err: bch_err_fn(ca, ret); return ret; @@ -1326,13 +1404,14 @@ err: int bch2_fs_journal_alloc(struct bch_fs *c) { - for_each_online_member(c, ca) { + for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_journal_alloc) { if (ca->journal.nr) continue; int ret = bch2_dev_journal_alloc(ca, true); if (ret) { - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_fs_journal_alloc); return ret; } } @@ -1404,6 +1483,13 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) bool had_entries = false; u64 last_seq = cur_seq, nr, seq; + /* + * + * XXX pick most recent non blacklisted sequence number + */ + + cur_seq = max(cur_seq, bch2_journal_last_blacklisted_seq(c)); + if (cur_seq >= JOURNAL_SEQ_MAX) { bch_err(c, "cannot start: journal seq overflow"); return -EINVAL; @@ -1429,13 +1515,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) */ nr += nr / 4; - if (nr + 1 > j->pin.size) { - free_fifo(&j->pin); - init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); - if (!j->pin.data) { - bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); - return -BCH_ERR_ENOMEM_journal_pin_fifo; - } + nr = max(nr, JOURNAL_PIN); + init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); + if (!j->pin.data) { + bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); + return bch_err_throw(c, ENOMEM_journal_pin_fifo); } j->replay_journal_seq = last_seq; @@ -1523,6 +1607,7 @@ void bch2_dev_journal_exit(struct bch_dev *ca) int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) { + struct bch_fs *c = ca->fs; struct journal_device *ja = &ca->journal; struct bch_sb_field_journal *journal_buckets = bch2_sb_field_get(sb, journal); @@ -1542,7 +1627,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->bucket_seq) - return -BCH_ERR_ENOMEM_dev_journal_init; + return bch_err_throw(c, ENOMEM_dev_journal_init); unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); @@ -1550,7 +1635,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, nr_bvecs), GFP_KERNEL); if (!ja->bio[i]) - return -BCH_ERR_ENOMEM_dev_journal_init; + return bch_err_throw(c, ENOMEM_dev_journal_init); ja->bio[i]->ca = ca; ja->bio[i]->buf_idx = i; @@ -1559,7 +1644,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->buckets) - return -BCH_ERR_ENOMEM_dev_journal_init; + return bch_err_throw(c, ENOMEM_dev_journal_init); if (journal_buckets_v2) { unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); @@ -1590,7 +1675,7 @@ void bch2_fs_journal_exit(struct journal *j) free_fifo(&j->pin); } -int bch2_fs_journal_init(struct journal *j) +void bch2_fs_journal_init_early(struct journal *j) { static struct lock_class_key res_key; @@ -1609,24 +1694,24 @@ int bch2_fs_journal_init(struct journal *j) atomic64_set(&j->reservations.counter, ((union journal_res_state) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); +} - if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) - return -BCH_ERR_ENOMEM_journal_pin_fifo; +int bch2_fs_journal_init(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); j->free_buf_size = j->buf_size_want = 
JOURNAL_ENTRY_SIZE_MIN; j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL); if (!j->free_buf) - return -BCH_ERR_ENOMEM_journal_buf; + return bch_err_throw(c, ENOMEM_journal_buf); for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) j->buf[i].idx = i; - j->pin.front = j->pin.back = 1; - j->wq = alloc_workqueue("bcachefs_journal", WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512); if (!j->wq) - return -BCH_ERR_ENOMEM_fs_other_alloc; + return bch_err_throw(c, ENOMEM_fs_other_alloc); return 0; } @@ -1650,7 +1735,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) printbuf_tabstop_push(out, 28); out->atomic++; - rcu_read_lock(); + guard(rcu)(); s = READ_ONCE(j->reservations); prt_printf(out, "flags:\t"); @@ -1741,8 +1826,6 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required); - rcu_read_unlock(); - --out->atomic; } diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 641e20c05a14..83734fe4331f 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -426,8 +426,8 @@ int bch2_journal_flush(struct journal *); bool bch2_journal_noflush_seq(struct journal *, u64, u64); int bch2_journal_meta(struct journal *); -void bch2_journal_halt(struct journal *); void bch2_journal_halt_locked(struct journal *); +void bch2_journal_halt(struct journal *); static inline int bch2_journal_error(struct journal *j) { @@ -444,8 +444,9 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); -int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, - unsigned nr); +int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned); +int bch2_dev_journal_bucket_delete(struct bch_dev *, u64); + int bch2_dev_journal_alloc(struct bch_dev *, bool); int bch2_fs_journal_alloc(struct bch_fs *); @@ -458,6 +459,7 @@ void bch2_journal_set_replay_done(struct journal *); void bch2_dev_journal_exit(struct bch_dev *); int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); void bch2_fs_journal_exit(struct journal *); +void bch2_fs_journal_init_early(struct journal *); int bch2_fs_journal_init(struct journal *); #endif /* _BCACHEFS_JOURNAL_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index ded18a94ed02..0b15d71a8d2d 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -49,25 +49,27 @@ void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) mutex_unlock(&c->sb_lock); } -void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct journal_replay *j) +static void bch2_journal_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct journal_ptr *p) +{ + struct bch_dev *ca = bch2_dev_tryget_noerror(c, p->dev); + prt_printf(out, "%s %u:%u:%u (sector %llu)", + ca ? 
ca->name : "(invalid dev)", + p->dev, p->bucket, p->bucket_offset, p->sector); + bch2_dev_put(ca); +} + +void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j) { darray_for_each(j->ptrs, i) { if (i != j->ptrs.data) prt_printf(out, " "); - prt_printf(out, "%u:%u:%u (sector %llu)", - i->dev, i->bucket, i->bucket_offset, i->sector); + bch2_journal_ptr_to_text(out, c, i); } } -static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, - struct journal_replay *j) +static void bch2_journal_datetime_to_text(struct printbuf *out, struct jset *j) { - prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); - - bch2_journal_ptrs_to_text(out, c, j); - - for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) { + for_each_jset_entry_type(entry, j, BCH_JSET_ENTRY_datetime) { struct jset_entry_datetime *datetime = container_of(entry, struct jset_entry_datetime, entry); bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); @@ -75,6 +77,15 @@ static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, } } +static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) +{ + prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); + bch2_journal_datetime_to_text(out, &j->j); + prt_char(out, ' '); + bch2_journal_ptrs_to_text(out, c, j); +} + static struct nonce journal_nonce(const struct jset *jset) { return (struct nonce) {{ @@ -188,7 +199,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, journal_entry_radix_idx(c, le64_to_cpu(j->seq)), GFP_KERNEL); if (!_i) - return -BCH_ERR_ENOMEM_journal_entry_add; + return bch_err_throw(c, ENOMEM_journal_entry_add); /* * Duplicate journal entries? If so we want the one that didn't have a @@ -231,7 +242,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, replace: i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); if (!i) - return -BCH_ERR_ENOMEM_journal_entry_add; + return bch_err_throw(c, ENOMEM_journal_entry_add); darray_init(&i->ptrs); i->csum_good = entry_ptr.csum_good; @@ -311,7 +322,7 @@ static void journal_entry_err_msg(struct printbuf *out, bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ if (bch2_fs_inconsistent(c, \ "corrupt metadata before write: %s\n", _buf.buf)) {\ - ret = -BCH_ERR_fsck_errors_not_fixed; \ + ret = bch_err_throw(c, fsck_errors_not_fixed); \ goto fsck_err; \ } \ break; \ @@ -418,6 +429,10 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs bool first = true; jset_entry_for_each_key(entry, k) { + /* We may be called on entries that haven't been validated: */ + if (!k->k.u64s) + break; + if (!first) { prt_newline(out); bch2_prt_jset_entry_type(out, entry->type); @@ -1005,19 +1020,19 @@ struct journal_read_buf { size_t size; }; -static int journal_read_buf_realloc(struct journal_read_buf *b, +static int journal_read_buf_realloc(struct bch_fs *c, struct journal_read_buf *b, size_t new_size) { void *n; /* the bios are sized for this many pages, max: */ if (new_size > JOURNAL_ENTRY_SIZE_MAX) - return -BCH_ERR_ENOMEM_journal_read_buf_realloc; + return bch_err_throw(c, ENOMEM_journal_read_buf_realloc); new_size = roundup_pow_of_two(new_size); n = kvmalloc(new_size, GFP_KERNEL); if (!n) - return -BCH_ERR_ENOMEM_journal_read_buf_realloc; + return bch_err_throw(c, ENOMEM_journal_read_buf_realloc); kvfree(b->data); b->data = n; @@ -1037,7 +1052,6 @@ static int journal_read_bucket(struct bch_dev *ca, u64 offset = bucket_to_sector(ca, 
ja->buckets[bucket]), end = offset + ca->mi.bucket_size; bool saw_bad = false, csum_good; - struct printbuf err = PRINTBUF; int ret = 0; pr_debug("reading %u", bucket); @@ -1053,7 +1067,7 @@ reread: bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); if (!bio) - return -BCH_ERR_ENOMEM_journal_read_bucket; + return bch_err_throw(c, ENOMEM_journal_read_bucket); bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); bio->bi_iter.bi_sector = offset; @@ -1064,7 +1078,7 @@ reread: kfree(bio); if (!ret && bch2_meta_read_fault("journal")) - ret = -BCH_ERR_EIO_fault_injected; + ret = bch_err_throw(c, EIO_fault_injected); bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !ret); @@ -1078,7 +1092,7 @@ reread: * found on a different device, and missing or * no journal entries will be handled later */ - goto out; + return 0; } j = buf->data; @@ -1092,15 +1106,15 @@ reread: break; case JOURNAL_ENTRY_REREAD: if (vstruct_bytes(j) > buf->size) { - ret = journal_read_buf_realloc(buf, + ret = journal_read_buf_realloc(c, buf, vstruct_bytes(j)); if (ret) - goto err; + return ret; } goto reread; case JOURNAL_ENTRY_NONE: if (!saw_bad) - goto out; + return 0; /* * On checksum error we don't really trust the size * field of the journal entry we read, so try reading @@ -1109,7 +1123,7 @@ reread: sectors = block_sectors(c); goto next_block; default: - goto err; + return ret; } if (le64_to_cpu(j->seq) > ja->highest_seq_found) { @@ -1126,22 +1140,20 @@ reread: * bucket: */ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) - goto out; + return 0; ja->bucket_seq[bucket] = le64_to_cpu(j->seq); - enum bch_csum_type csum_type = JSET_CSUM_TYPE(j); struct bch_csum csum; csum_good = jset_csum_good(c, j, &csum); bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); if (!csum_good) { - bch_err_dev_ratelimited(ca, "%s", - (printbuf_reset(&err), - prt_str(&err, "journal "), - bch2_csum_err_msg(&err, csum_type, j->csum, csum), - err.buf)); + /* + * Don't print an error here, we'll print the error + * later if we need this journal entry + */ saw_bad = true; } @@ -1153,6 +1165,7 @@ reread: mutex_lock(&jlist->lock); ret = journal_entry_add(c, ca, (struct journal_ptr) { .csum_good = csum_good, + .csum = csum, .dev = ca->dev_idx, .bucket = bucket, .bucket_offset = offset - @@ -1167,7 +1180,7 @@ reread: case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: break; default: - goto err; + return ret; } next_block: pr_debug("next"); @@ -1176,11 +1189,7 @@ next_block: j = ((void *) j) + (sectors << 9); } -out: - ret = 0; -err: - printbuf_exit(&err); - return ret; + return 0; } static CLOSURE_CALLBACK(bch2_journal_read_device) @@ -1197,7 +1206,7 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) if (!ja->nr) goto out; - ret = journal_read_buf_realloc(&buf, PAGE_SIZE); + ret = journal_read_buf_realloc(c, &buf, PAGE_SIZE); if (ret) goto err; @@ -1219,7 +1228,7 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) out: bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); kvfree(buf.data); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_journal_read); closure_return(cl); return; err: @@ -1229,13 +1238,105 @@ err: goto out; } +noinline_for_stack +static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j) +{ + struct printbuf buf = PRINTBUF; + enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j); + bool have_good = false; + + prt_printf(&buf, "invalid journal checksum(s) at seq %llu ", le64_to_cpu(j->j.seq)); + 
bch2_journal_datetime_to_text(&buf, &j->j); + prt_newline(&buf); + + darray_for_each(j->ptrs, ptr) + if (!ptr->csum_good) { + bch2_journal_ptr_to_text(&buf, c, ptr); + prt_char(&buf, ' '); + bch2_csum_to_text(&buf, csum_type, ptr->csum); + prt_newline(&buf); + } else { + have_good = true; + } + + prt_printf(&buf, "should be "); + bch2_csum_to_text(&buf, csum_type, j->j.csum); + + if (have_good) + prt_printf(&buf, "\n(had good copy on another device)"); + + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); +} + +noinline_for_stack +static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq) +{ + struct printbuf buf = PRINTBUF; + int ret = 0; + + struct genradix_iter radix_iter; + struct journal_replay *i, **_i, *prev = NULL; + u64 seq = start_seq; + + genradix_for_each(&c->journal_entries, radix_iter, _i) { + i = *_i; + + if (journal_replay_ignore(i)) + continue; + + BUG_ON(seq > le64_to_cpu(i->j.seq)); + + while (seq < le64_to_cpu(i->j.seq)) { + while (seq < le64_to_cpu(i->j.seq) && + bch2_journal_seq_is_blacklisted(c, seq, false)) + seq++; + + if (seq == le64_to_cpu(i->j.seq)) + break; + + u64 missing_start = seq; + + while (seq < le64_to_cpu(i->j.seq) && + !bch2_journal_seq_is_blacklisted(c, seq, false)) + seq++; + + u64 missing_end = seq - 1; + + printbuf_reset(&buf); + prt_printf(&buf, "journal entries %llu-%llu missing! (replaying %llu-%llu)", + missing_start, missing_end, + start_seq, end_seq); + + prt_printf(&buf, "\nprev at "); + if (prev) { + bch2_journal_ptrs_to_text(&buf, c, prev); + prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); + } else + prt_printf(&buf, "(none)"); + + prt_printf(&buf, "\nnext at "); + bch2_journal_ptrs_to_text(&buf, c, i); + prt_printf(&buf, ", continue?"); + + fsck_err(c, journal_entries_missing, "%s", buf.buf); + } + + prev = i; + seq++; + } +fsck_err: + printbuf_exit(&buf); + return ret; +} + int bch2_journal_read(struct bch_fs *c, u64 *last_seq, u64 *blacklist_seq, u64 *start_seq) { struct journal_list jlist; - struct journal_replay *i, **_i, *prev = NULL; + struct journal_replay *i, **_i; struct genradix_iter radix_iter; struct printbuf buf = PRINTBUF; bool degraded = false, last_write_torn = false; @@ -1254,7 +1355,8 @@ int bch2_journal_read(struct bch_fs *c, if ((ca->mi.state == BCH_MEMBER_STATE_rw || ca->mi.state == BCH_MEMBER_STATE_ro) && - percpu_ref_tryget(&ca->io_ref[READ])) + enumerated_ref_tryget(&ca->io_ref[READ], + BCH_DEV_READ_REF_journal_read)) closure_call(&ca->journal.read, bch2_journal_read_device, system_unbound_wq, @@ -1325,12 +1427,12 @@ int bch2_journal_read(struct bch_fs *c, return 0; } - bch_info(c, "journal read done, replaying entries %llu-%llu", - *last_seq, *blacklist_seq - 1); - + printbuf_reset(&buf); + prt_printf(&buf, "journal read done, replaying entries %llu-%llu", + *last_seq, *blacklist_seq - 1); if (*start_seq != *blacklist_seq) - bch_info(c, "dropped unflushed entries %llu-%llu", - *blacklist_seq, *start_seq - 1); + prt_printf(&buf, " (unflushed %llu-%llu)", *blacklist_seq, *start_seq - 1); + bch_info(c, "%s", buf.buf); /* Drop blacklisted entries and entries older than last_seq: */ genradix_for_each(&c->journal_entries, radix_iter, _i) { @@ -1353,59 +1455,12 @@ int bch2_journal_read(struct bch_fs *c, } } - /* Check for missing entries: */ - seq = *last_seq; - genradix_for_each(&c->journal_entries, radix_iter, _i) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - BUG_ON(seq > le64_to_cpu(i->j.seq)); - - while (seq < le64_to_cpu(i->j.seq)) { 
- u64 missing_start, missing_end; - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; - - while (seq < le64_to_cpu(i->j.seq) && - bch2_journal_seq_is_blacklisted(c, seq, false)) - seq++; - - if (seq == le64_to_cpu(i->j.seq)) - break; - - missing_start = seq; - - while (seq < le64_to_cpu(i->j.seq) && - !bch2_journal_seq_is_blacklisted(c, seq, false)) - seq++; - - if (prev) { - bch2_journal_ptrs_to_text(&buf1, c, prev); - prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); - } else - prt_printf(&buf1, "(none)"); - bch2_journal_ptrs_to_text(&buf2, c, i); - - missing_end = seq - 1; - fsck_err(c, journal_entries_missing, - "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" - "prev at %s\n" - "next at %s, continue?", - missing_start, missing_end, - *last_seq, *blacklist_seq - 1, - buf1.buf, buf2.buf); - - printbuf_exit(&buf1); - printbuf_exit(&buf2); - } - - prev = i; - seq++; - } + ret = bch2_journal_check_for_missing(c, *last_seq, *blacklist_seq - 1); + if (ret) + goto err; genradix_for_each(&c->journal_entries, radix_iter, _i) { - struct bch_replicas_padded replicas = { + union bch_replicas_padded replicas = { .e.data_type = BCH_DATA_journal, .e.nr_devs = 0, .e.nr_required = 1, @@ -1415,15 +1470,15 @@ int bch2_journal_read(struct bch_fs *c, if (journal_replay_ignore(i)) continue; - darray_for_each(i->ptrs, ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - - if (!ptr->csum_good) - bch_err_dev_offset(ca, ptr->sector, - "invalid journal checksum, seq %llu%s", - le64_to_cpu(i->j.seq), - i->csum_good ? " (had good copy on another device)" : ""); - } + /* + * Don't print checksum errors until we know we're going to use + * a given journal entry: + */ + darray_for_each(i->ptrs, ptr) + if (!ptr->csum_good) { + bch2_journal_print_checksum_error(c, i); + break; + } ret = jset_validate(c, bch2_dev_have_ref(c, i->ptrs.data[0].dev), @@ -1466,6 +1521,7 @@ static void journal_advance_devs_to_next_bucket(struct journal *j, { struct bch_fs *c = container_of(j, struct bch_fs, journal); + guard(rcu)(); darray_for_each(*devs, i) { struct bch_dev *ca = rcu_dereference(c->devs[*i]); if (!ca) @@ -1499,7 +1555,8 @@ static void __journal_write_alloc(struct journal *j, struct bch_fs *c = container_of(j, struct bch_fs, journal); darray_for_each(*devs, i) { - struct bch_dev *ca = rcu_dereference(c->devs[*i]); + struct bch_dev *ca = bch2_dev_get_ioref(c, *i, WRITE, + BCH_DEV_WRITE_REF_journal_write); if (!ca) continue; @@ -1513,8 +1570,10 @@ static void __journal_write_alloc(struct journal *j, ca->mi.state != BCH_MEMBER_STATE_rw || !ja->nr || bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || - sectors > ja->sectors_free) + sectors > ja->sectors_free) { + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); continue; + } bch2_dev_stripe_increment(ca, &j->wp.stripe); @@ -1537,15 +1596,8 @@ static void __journal_write_alloc(struct journal *j, } } -/** - * journal_write_alloc - decide where to write next journal entry - * - * @j: journal object - * @w: journal buf (entry to be written) - * - * Returns: 0 on success, or -BCH_ERR_insufficient_devices on failure - */ -static int journal_write_alloc(struct journal *j, struct journal_buf *w) +static int journal_write_alloc(struct journal *j, struct journal_buf *w, + unsigned *replicas) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_devs_mask devs; @@ -1553,29 +1605,18 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) unsigned sectors = 
vstruct_sectors(w->data, c->block_bits); unsigned target = c->opts.metadata_target ?: c->opts.foreground_target; - unsigned replicas = 0, replicas_want = - READ_ONCE(c->opts.metadata_replicas); + unsigned replicas_want = READ_ONCE(c->opts.metadata_replicas); unsigned replicas_need = min_t(unsigned, replicas_want, READ_ONCE(c->opts.metadata_replicas_required)); bool advance_done = false; - rcu_read_lock(); - - /* We might run more than once if we have to stop and do discards: */ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key)); - bkey_for_each_ptr(ptrs, p) { - struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev); - if (ca) - replicas += ca->mi.durability; - } - retry_target: devs = target_rw_devs(c, BCH_DATA_journal, target); - devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); + bch2_dev_alloc_list(c, &j->wp.stripe, &devs, &devs_sorted); retry_alloc: - __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); + __journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want); - if (likely(replicas >= replicas_want)) + if (likely(*replicas >= replicas_want)) goto done; if (!advance_done) { @@ -1584,18 +1625,26 @@ retry_alloc: goto retry_alloc; } - if (replicas < replicas_want && target) { + if (*replicas < replicas_want && target) { /* Retry from all devices: */ target = 0; advance_done = false; goto retry_target; } done: - rcu_read_unlock(); - BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); - return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; +#if 0 + /* + * XXX: we need a way to alert the user when we go degraded for any + * reason + */ + if (*replicas < min(replicas_want, + dev_mask_nr(&c->rw_devs[BCH_DATA_free]))) { + } +#endif + + return *replicas >= replicas_need ? 
0 : -BCH_ERR_insufficient_journal_devices; } static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) @@ -1633,7 +1682,7 @@ static CLOSURE_CALLBACK(journal_write_done) closure_type(w, struct journal_buf, io); struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_replicas_padded replicas; + union bch_replicas_padded replicas; u64 seq = le64_to_cpu(w->data->seq); int err = 0; @@ -1642,17 +1691,27 @@ static CLOSURE_CALLBACK(journal_write_done) : j->noflush_write_time, j->write_start_time); if (!w->devs_written.nr) { - if (!bch2_journal_error(j)) - bch_err(c, "unable to write journal to sufficient devices"); - err = -BCH_ERR_journal_write_err; + err = bch_err_throw(c, journal_write_err); } else { bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, w->devs_written); err = bch2_mark_replicas(c, &replicas.e); } - if (err) - bch2_fatal_error(c); + if (err && !bch2_journal_error(j)) { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + if (err == -BCH_ERR_journal_write_err) + prt_printf(&buf, "unable to write journal to sufficient devices"); + else + prt_printf(&buf, "journal write error marking replicas: %s", bch2_err_str(err)); + + bch2_fs_emergency_read_only2(c, &buf); + + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + } closure_debug_destroy(cl); @@ -1770,7 +1829,7 @@ static void journal_write_endio(struct bio *bio) } closure_put(&w->io); - percpu_ref_put(&ca->io_ref[WRITE]); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); } static CLOSURE_CALLBACK(journal_write_submit) @@ -1781,12 +1840,7 @@ static CLOSURE_CALLBACK(journal_write_submit) unsigned sectors = vstruct_sectors(w->data, c->block_bits); extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); - if (!ca) { - /* XXX: fix this */ - bch_err(c, "missing device %u for journal write", ptr->dev); - continue; - } + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], sectors); @@ -1844,8 +1898,9 @@ static CLOSURE_CALLBACK(journal_write_preflush) } if (w->separate_flush) { - for_each_rw_member(c, ca) { - percpu_ref_get(&ca->io_ref[WRITE]); + for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_write) { + enumerated_ref_get(&ca->io_ref[WRITE], + BCH_DEV_WRITE_REF_journal_write); struct journal_device *ja = &ca->journal; struct bio *bio = &ja->bio[w->idx]->bio; @@ -1872,9 +1927,8 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) struct jset_entry *start, *end; struct jset *jset = w->data; struct journal_keys_to_wb wb = { NULL }; - unsigned sectors, bytes, u64s; + unsigned u64s; unsigned long btree_roots_have = 0; - bool validate_before_checksum = false; u64 seq = le64_to_cpu(jset->seq); int ret; @@ -1957,8 +2011,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) le32_add_cpu(&jset->u64s, u64s); - sectors = vstruct_sectors(jset, c->block_bits); - bytes = vstruct_bytes(jset); + unsigned sectors = vstruct_sectors(jset, c->block_bits); if (sectors > w->sectors) { bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)", @@ -1967,6 +2020,17 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) return -EINVAL; } + return 0; +} + +static int bch2_journal_write_checksum(struct journal *j, struct journal_buf *w) +{ + struct bch_fs *c = 
container_of(j, struct bch_fs, journal); + struct jset *jset = w->data; + u64 seq = le64_to_cpu(jset->seq); + bool validate_before_checksum = false; + int ret = 0; + jset->magic = cpu_to_le64(jset_magic(c)); jset->version = cpu_to_le32(c->sb.version); @@ -1989,7 +2053,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset->encrypted_start, vstruct_end(jset) - (void *) jset->encrypted_start); - if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret))) + if (bch2_fs_fatal_err_on(ret, c, "encrypting journal entry: %s", bch2_err_str(ret))) return ret; jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), @@ -1999,6 +2063,8 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) (ret = jset_validate(c, NULL, jset, 0, WRITE))) return ret; + unsigned sectors = vstruct_sectors(jset, c->block_bits); + unsigned bytes = vstruct_bytes(jset); memset((void *) jset + bytes, 0, (sectors << 9) - bytes); return 0; } @@ -2054,13 +2120,10 @@ CLOSURE_CALLBACK(bch2_journal_write) closure_type(w, struct journal_buf, io); struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_replicas_padded replicas; - unsigned nr_rw_members = 0; + union bch_replicas_padded replicas; + unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]); int ret; - for_each_rw_member(c, ca) - nr_rw_members++; - BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); BUG_ON(!w->write_started); BUG_ON(w->write_allocated); @@ -2074,7 +2137,8 @@ CLOSURE_CALLBACK(bch2_journal_write) ret = bch2_journal_write_pick_flush(j, w); spin_unlock(&j->lock); - if (ret) + + if (unlikely(ret)) goto err; mutex_lock(&j->buf_lock); @@ -2082,43 +2146,34 @@ CLOSURE_CALLBACK(bch2_journal_write) ret = bch2_journal_write_prep(j, w); mutex_unlock(&j->buf_lock); - if (ret) - goto err; - j->entry_bytes_written += vstruct_bytes(w->data); + if (unlikely(ret)) + goto err; + unsigned replicas_allocated = 0; while (1) { - spin_lock(&j->lock); - ret = journal_write_alloc(j, w); + ret = journal_write_alloc(j, w, &replicas_allocated); if (!ret || !j->can_discard) break; - spin_unlock(&j->lock); bch2_journal_do_discards(j); } - if (ret && !bch2_journal_error(j)) { - struct printbuf buf = PRINTBUF; - buf.atomic++; + if (unlikely(ret)) + goto err_allocate_write; - __bch2_journal_debug_to_text(&buf, j); - spin_unlock(&j->lock); - prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), - le64_to_cpu(w->data->seq), - vstruct_sectors(w->data, c->block_bits), - bch2_err_str(ret)); - bch2_print_string_as_lines(KERN_ERR, buf.buf); - printbuf_exit(&buf); - } - if (ret) + ret = bch2_journal_write_checksum(j, w); + if (unlikely(ret)) goto err; + spin_lock(&j->lock); /* * write is allocated, no longer need to account for it in * bch2_journal_space_available(): */ w->sectors = 0; w->write_allocated = true; + j->entry_bytes_written += vstruct_bytes(w->data); /* * journal entry has been compacted and allocated, recalculate space @@ -2130,9 +2185,6 @@ CLOSURE_CALLBACK(bch2_journal_write) w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); - if (c->opts.nochanges) - goto no_io; - /* * Mark journal replicas before we submit the write to guarantee * recovery will find the journal entries after a crash. 
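
Throughout this patch, bare "return -BCH_ERR_*" statements are converted to bch_err_throw(c, ...) (ENOMEM_journal_entry_add, journal_write_err and others above). A minimal sketch of what such a macro can look like, assuming its only job is to let the error be traced or counted against the filesystem at the point it is raised; the real definition lives in errcode.h and may differ:

	/*
	 * Illustrative sketch only, not the bcachefs definition: the
	 * trace_printk() stands in for whatever per-filesystem
	 * bookkeeping the real macro performs.
	 */
	#define bch_err_throw(_c, _err)						\
	({									\
		struct bch_fs *_c2 = (_c);					\
		trace_printk("bcachefs %s: throwing %s\n", _c2->name, #_err);	\
		-BCH_ERR_##_err;						\
	})

Either way the expression presumably still evaluates to the same negative error code as before, which is consistent with callers in this patch continuing to compare against constants such as -BCH_ERR_backpointer_to_overwritten_btree_node.
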
@@ -2143,15 +2195,33 @@ CLOSURE_CALLBACK(bch2_journal_write) if (ret) goto err; + if (c->opts.nochanges) + goto no_io; + if (!JSET_NO_FLUSH(w->data)) continue_at(cl, journal_write_preflush, j->wq); else continue_at(cl, journal_write_submit, j->wq); return; -no_io: - continue_at(cl, journal_write_done, j->wq); - return; +err_allocate_write: + if (!bch2_journal_error(j)) { + struct printbuf buf = PRINTBUF; + + bch2_journal_debug_to_text(&buf, j); + prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), + le64_to_cpu(w->data->seq), + vstruct_sectors(w->data, c->block_bits), + bch2_err_str(ret)); + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + } err: bch2_fatal_error(c); +no_io: + extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); + } + continue_at(cl, journal_write_done, j->wq); } diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 12b39fcb4424..6fa82c4050fe 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -9,6 +9,7 @@ void bch2_journal_pos_from_member_info_resume(struct bch_fs *); struct journal_ptr { bool csum_good; + struct bch_csum csum; u8 dev; u32 bucket; u32 bucket_offset; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index cc00b0fc40d8..cd6201741c59 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -83,18 +83,20 @@ static struct journal_space journal_dev_space_available(struct journal *j, struct bch_dev *ca, enum journal_space_from from) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_device *ja = &ca->journal; unsigned sectors, buckets, unwritten; + unsigned bucket_size_aligned = round_down(ca->mi.bucket_size, block_sectors(c)); u64 seq; if (from == journal_space_total) return (struct journal_space) { - .next_entry = ca->mi.bucket_size, - .total = ca->mi.bucket_size * ja->nr, + .next_entry = bucket_size_aligned, + .total = bucket_size_aligned * ja->nr, }; buckets = bch2_journal_dev_buckets_available(j, ja, from); - sectors = ja->sectors_free; + sectors = round_down(ja->sectors_free, block_sectors(c)); /* * We that we don't allocate the space for a journal entry @@ -109,7 +111,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, continue; /* entry won't fit on this device, skip: */ - if (unwritten > ca->mi.bucket_size) + if (unwritten > bucket_size_aligned) continue; if (unwritten >= sectors) { @@ -119,7 +121,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, } buckets--; - sectors = ca->mi.bucket_size; + sectors = bucket_size_aligned; } sectors -= unwritten; @@ -127,12 +129,12 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, if (sectors < ca->mi.bucket_size && buckets) { buckets--; - sectors = ca->mi.bucket_size; + sectors = bucket_size_aligned; } return (struct journal_space) { .next_entry = sectors, - .total = sectors + buckets * ca->mi.bucket_size, + .total = sectors + buckets * bucket_size_aligned, }; } @@ -146,7 +148,6 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); - rcu_read_lock(); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { if (!ca->journal.nr || !ca->mi.durability) @@ -164,7 +165,6 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne 
array_insert_item(dev_space, nr_devs, pos, space); } - rcu_read_unlock(); if (nr_devs < nr_devs_want) return (struct journal_space) { 0, 0 }; @@ -189,8 +189,8 @@ void bch2_journal_space_available(struct journal *j) int ret = 0; lockdep_assert_held(&j->lock); + guard(rcu)(); - rcu_read_lock(); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; @@ -210,24 +210,23 @@ void bch2_journal_space_available(struct journal *j) max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); nr_online++; } - rcu_read_unlock(); j->can_discard = can_discard; if (nr_online < metadata_replicas_required(c)) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" - "rw journal devs:", nr_online, metadata_replicas_required(c)); - - rcu_read_lock(); - for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) - prt_printf(&buf, " %s", ca->name); - rcu_read_unlock(); - - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); - ret = -BCH_ERR_insufficient_journal_devices; + if (!(c->sb.features & BIT_ULL(BCH_FEATURE_small_image))) { + struct printbuf buf = PRINTBUF; + buf.atomic++; + prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" + "rw journal devs:", nr_online, metadata_replicas_required(c)); + + for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) + prt_printf(&buf, " %s", ca->name); + + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); + } + ret = bch_err_throw(c, insufficient_journal_devices); goto out; } @@ -241,7 +240,7 @@ void bch2_journal_space_available(struct journal *j) total = j->space[journal_space_total].total; if (!j->space[journal_space_discarded].next_entry) - ret = -BCH_ERR_journal_full; + ret = bch_err_throw(c, journal_full); if ((j->space[journal_space_clean_ondisk].next_entry < j->space[journal_space_clean_ondisk].total) && @@ -254,8 +253,7 @@ void bch2_journal_space_available(struct journal *j) bch2_journal_set_watermark(j); out: j->cur_entry_sectors = !ret - ? round_down(j->space[journal_space_discarded].next_entry, - block_sectors(c)) + ? 
j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; @@ -293,12 +291,12 @@ void bch2_journal_do_discards(struct journal *j) mutex_lock(&j->discard_lock); - for_each_rw_member(c, ca) { + for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_do_discards) { struct journal_device *ja = &ca->journal; while (should_discard_bucket(j, ja)) { if (!c->opts.nochanges && - ca->mi.discard && + bch2_discard_opt_enabled(c, ca) && bdev_max_discard_sectors(ca->disk_sb.bdev)) blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, @@ -623,9 +621,10 @@ static u64 journal_seq_to_flush(struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); u64 seq_to_flush = 0; - spin_lock(&j->lock); + guard(spinlock)(&j->lock); + guard(rcu)(); - for_each_rw_member(c, ca) { + for_each_rw_member_rcu(c, ca) { struct journal_device *ja = &ca->journal; unsigned nr_buckets, bucket_to_flush; @@ -635,20 +634,15 @@ static u64 journal_seq_to_flush(struct journal *j) /* Try to keep the journal at most half full: */ nr_buckets = ja->nr / 2; - nr_buckets = min(nr_buckets, ja->nr); - bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; seq_to_flush = max(seq_to_flush, ja->bucket_seq[bucket_to_flush]); } /* Also flush if the pin fifo is more than half full */ - seq_to_flush = max_t(s64, seq_to_flush, - (s64) journal_cur_seq(j) - - (j->pin.size >> 1)); - spin_unlock(&j->lock); - - return seq_to_flush; + return max_t(s64, seq_to_flush, + (s64) journal_cur_seq(j) - + (j->pin.size >> 1)); } /** @@ -699,6 +693,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) if (ret) break; + /* XXX shove journal discards off to another thread */ bch2_journal_do_discards(j); seq_to_flush = journal_seq_to_flush(j); @@ -961,7 +956,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) seq = 0; spin_lock(&j->lock); while (!ret) { - struct bch_replicas_padded replicas; + union bch_replicas_padded replicas; seq = max(seq, journal_last_seq(j)); if (seq >= j->pin.back) diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c index 62b910f2fb27..0cb9b93f13e7 100644 --- a/fs/bcachefs/journal_sb.c +++ b/fs/bcachefs/journal_sb.c @@ -210,7 +210,7 @@ int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca, j = bch2_sb_field_resize(&ca->disk_sb, journal_v2, (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64)); if (!j) - return -BCH_ERR_ENOSPC_sb_journal; + return bch_err_throw(c, ENOSPC_sb_journal); bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index e463d2d95359..af4fe416d9ec 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -78,7 +78,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, sb_blacklist_u64s(nr + 1)); if (!bl) { - ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist; + ret = bch_err_throw(c, ENOSPC_sb_journal_seq_blacklist); goto out; } @@ -130,6 +130,16 @@ bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, return true; } +u64 bch2_journal_last_blacklisted_seq(struct bch_fs *c) +{ + struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; + + if (!t || !t->nr) + return 0; + + return t->entries[eytzinger0_last(t->nr)].end - 1; +} + int bch2_blacklist_table_initialize(struct bch_fs *c) { struct bch_sb_field_journal_seq_blacklist *bl = @@ -142,7 +152,7 @@ int 
bch2_blacklist_table_initialize(struct bch_fs *c) t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL); if (!t) - return -BCH_ERR_ENOMEM_blacklist_table_init; + return bch_err_throw(c, ENOMEM_blacklist_table_init); t->nr = nr; diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h index d47636f96fdc..f06942ccfcdd 100644 --- a/fs/bcachefs/journal_seq_blacklist.h +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -12,6 +12,7 @@ blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) } bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); +u64 bch2_journal_last_blacklisted_seq(struct bch_fs *); int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); int bch2_blacklist_table_initialize(struct bch_fs *); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 8e0eba776b9d..51104bbb99da 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -151,8 +151,6 @@ enum journal_flags { #undef x }; -typedef DARRAY(u64) darray_u64; - struct journal_bio { struct bch_dev *ca; unsigned buf_idx; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 2f63fc6d456f..57b5b3263b08 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -145,13 +145,11 @@ static u64 bkey_lru_type_idx(struct bch_fs *c, case BCH_LRU_fragmentation: { a = bch2_alloc_to_v4(k, &a_convert); - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode); - u64 idx = ca + return ca ? alloc_lru_idx_fragmentation(*a, ca) : 0; - rcu_read_unlock(); - return idx; } case BCH_LRU_stripes: return k.k->type == KEY_TYPE_stripe diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 90dcf80bd64a..f296cce95338 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -4,10 +4,13 @@ */ #include "bcachefs.h" +#include "backpointers.h" #include "bkey_buf.h" #include "btree_update.h" #include "btree_update_interior.h" +#include "btree_write_buffer.h" #include "buckets.h" +#include "ec.h" #include "errcode.h" #include "extents.h" #include "io_write.h" @@ -20,7 +23,7 @@ #include "super-io.h" static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, - unsigned dev_idx, int flags, bool metadata) + unsigned dev_idx, unsigned flags, bool metadata) { unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; unsigned lost = metadata ? 
BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; @@ -32,16 +35,33 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, nr_good = bch2_bkey_durability(c, k.s_c); if ((!nr_good && !(flags & lost)) || (nr_good < replicas && !(flags & degraded))) - return -BCH_ERR_remove_would_lose_data; + return bch_err_throw(c, remove_would_lose_data); return 0; } +static int drop_btree_ptrs(struct btree_trans *trans, struct btree_iter *iter, + struct btree *b, unsigned dev_idx, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_buf k; + + bch2_bkey_buf_init(&k); + bch2_bkey_buf_copy(&k, c, &b->key); + + int ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true) ?: + bch2_btree_node_update_key(trans, iter, b, k.k, 0, false); + + bch_err_fn(c, ret); + bch2_bkey_buf_exit(&k, c); + return ret; +} + static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, unsigned dev_idx, - int flags) + unsigned flags) { struct bch_fs *c = trans->c; struct bkey_i *n; @@ -77,9 +97,27 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, return 0; } +static int bch2_dev_btree_drop_key(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, + unsigned dev_idx, + struct bkey_buf *last_flushed, + unsigned flags) +{ + struct btree_iter iter; + struct btree *b = bch2_backpointer_get_node(trans, bp, &iter, last_flushed); + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + return ret == -BCH_ERR_backpointer_to_overwritten_btree_node ? 0 : ret; + + ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags); + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + static int bch2_dev_usrdata_drop(struct bch_fs *c, struct progress_indicator_state *progress, - unsigned dev_idx, int flags) + unsigned dev_idx, unsigned flags) { struct btree_trans *trans = bch2_trans_get(c); enum btree_id id; @@ -106,7 +144,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, static int bch2_dev_metadata_drop(struct bch_fs *c, struct progress_indicator_state *progress, - unsigned dev_idx, int flags) + unsigned dev_idx, unsigned flags) { struct btree_trans *trans; struct btree_iter iter; @@ -118,7 +156,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, /* don't handle this yet: */ if (flags & BCH_FORCE_IF_METADATA_LOST) - return -BCH_ERR_remove_with_metadata_missing_unimplemented; + return bch_err_throw(c, remove_with_metadata_missing_unimplemented); trans = bch2_trans_get(c); bch2_bkey_buf_init(&k); @@ -137,20 +175,12 @@ retry: if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx)) goto next; - bch2_bkey_buf_copy(&k, c, &b->key); - - ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), - dev_idx, flags, true); - if (ret) - break; - - ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false); + ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ret = 0; continue; } - bch_err_msg(c, ret, "updating btree node key"); if (ret) break; next: @@ -176,7 +206,66 @@ err: return ret; } -int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) +static int data_drop_bp(struct btree_trans *trans, unsigned dev_idx, + struct bkey_s_c_backpointer bp, struct bkey_buf *last_flushed, + unsigned flags) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, + last_flushed); + int ret = bkey_err(k); + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) + return 0; + if (ret) + return ret; + + if (!k.k || !bch2_bkey_has_device_c(k, dev_idx)) 
+ goto out; + + /* + * XXX: pass flags arg to invalidate_stripe_to_dev and handle it + * properly + */ + + if (bkey_is_btree_ptr(k.k)) + ret = bch2_dev_btree_drop_key(trans, bp, dev_idx, last_flushed, flags); + else if (k.k->type == KEY_TYPE_stripe) + ret = bch2_invalidate_stripe_to_dev(trans, &iter, k, dev_idx, flags); + else + ret = bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsigned flags) +{ + struct btree_trans *trans = bch2_trans_get(c); + + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + int ret = bch2_btree_write_buffer_flush_sync(trans) ?: + for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers, + POS(dev_idx, 0), + POS(dev_idx, U64_MAX), 0, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + if (k.k->type != KEY_TYPE_backpointer) + continue; + + data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k), + &last_flushed, flags); + + })); + + bch2_bkey_buf_exit(&last_flushed, trans->c); + bch2_trans_put(trans); + bch_err_fn(c, ret); + return ret; +} + +int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, unsigned flags) { struct progress_indicator_state progress; bch2_progress_init(&progress, c, diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h index 027efaa0d575..30018140711b 100644 --- a/fs/bcachefs/migrate.h +++ b/fs/bcachefs/migrate.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_MIGRATE_H #define _BCACHEFS_MIGRATE_H -int bch2_dev_data_drop(struct bch_fs *, unsigned, int); +int bch2_dev_data_drop_by_backpointers(struct bch_fs *, unsigned, unsigned); +int bch2_dev_data_drop(struct bch_fs *, unsigned, unsigned); #endif /* _BCACHEFS_MIGRATE_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index dfdbb9259985..eec591e947bd 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -38,36 +38,80 @@ const char * const bch2_data_ops_strs[] = { NULL }; -static void trace_io_move2(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) +struct evacuate_bucket_arg { + struct bpos bucket; + int gen; + struct data_update_opts data_opts; +}; + +static bool evacuate_bucket_pred(struct bch_fs *, void *, + enum btree_id, struct bkey_s_c, + struct bch_io_opts *, + struct data_update_opts *); + +static noinline void +trace_io_move2(struct bch_fs *c, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { - if (trace_io_move_enabled()) { - struct printbuf buf = PRINTBUF; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); - trace_io_move(c, buf.buf); - printbuf_exit(&buf); - } + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); + trace_io_move(c, buf.buf); + printbuf_exit(&buf); } -static void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k) +static noinline void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k) { - if (trace_io_move_read_enabled()) { - struct printbuf buf = PRINTBUF; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, k); - trace_io_move_read(c, buf.buf); - printbuf_exit(&buf); + bch2_bkey_val_to_text(&buf, c, k); + trace_io_move_read(c, buf.buf); + printbuf_exit(&buf); +} + +static noinline void +trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k, + struct bch_io_opts 
*io_opts, + struct data_update_opts *data_opts, + move_pred_fn pred, void *_arg, bool p) +{ + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "%ps: %u", pred, p); + + if (pred == evacuate_bucket_pred) { + struct evacuate_bucket_arg *arg = _arg; + prt_printf(&buf, " gen=%u", arg->gen); } + + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); + trace_io_move_pred(c, buf.buf); + printbuf_exit(&buf); +} + +static noinline void +trace_io_move_evacuate_bucket2(struct bch_fs *c, struct bpos bucket, int gen) +{ + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "bucket: "); + bch2_bpos_to_text(&buf, bucket); + prt_printf(&buf, " gen: %i\n", gen); + + trace_io_move_evacuate_bucket(c, buf.buf); + printbuf_exit(&buf); } struct moving_io { struct list_head read_list; struct list_head io_list; - struct move_bucket_in_flight *b; + struct move_bucket *b; struct closure cl; bool read_completed; @@ -109,7 +153,6 @@ static void move_write_done(struct bch_write_op *op) struct printbuf buf = PRINTBUF; bch2_write_op_to_text(&buf, op); - prt_printf(&buf, "ret\t%s\n", bch2_err_str(op->error)); trace_io_move_write_fail(c, buf.buf); printbuf_exit(&buf); } @@ -126,26 +169,40 @@ static void move_write_done(struct bch_write_op *op) static void move_write(struct moving_io *io) { + struct bch_fs *c = io->write.op.c; struct moving_context *ctxt = io->write.ctxt; + struct bch_read_bio *rbio = &io->write.rbio; if (ctxt->stats) { - if (io->write.rbio.bio.bi_status) + if (rbio->bio.bi_status) atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, &ctxt->stats->sectors_error_uncorrected); - else if (io->write.rbio.saw_error) + else if (rbio->saw_error) atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, &ctxt->stats->sectors_error_corrected); } - if (unlikely(io->write.rbio.ret || - io->write.rbio.bio.bi_status || - io->write.data_opts.scrub)) { + /* + * If the extent has been bitrotted, we're going to have to give it a + * new checksum in order to move it - but the poison bit will ensure + * that userspace still gets the appropriate error. 
+ */ + if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err && + (bch2_bkey_extent_flags(bkey_i_to_s_c(io->write.k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) { + struct bch_extent_crc_unpacked crc = rbio->pick.crc; + struct nonce nonce = extent_nonce(rbio->version, crc); + + rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type, + nonce, &rbio->bio); + rbio->ret = 0; + } + + if (unlikely(rbio->ret || io->write.data_opts.scrub)) { move_free(io); return; } if (trace_io_move_write_enabled()) { - struct bch_fs *c = io->write.op.c; struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k)); @@ -275,7 +332,7 @@ void bch2_move_stats_init(struct bch_move_stats *stats, const char *name) } int bch2_move_extent(struct moving_context *ctxt, - struct move_bucket_in_flight *bucket_in_flight, + struct move_bucket *bucket_in_flight, struct btree_iter *iter, struct bkey_s_c k, struct bch_io_opts io_opts, @@ -285,7 +342,8 @@ int bch2_move_extent(struct moving_context *ctxt, struct bch_fs *c = trans->c; int ret = -ENOMEM; - trace_io_move2(c, k, &io_opts, &data_opts); + if (trace_io_move_enabled()) + trace_io_move2(c, k, &io_opts, &data_opts); this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); if (ctxt->stats) @@ -301,16 +359,14 @@ int bch2_move_extent(struct moving_context *ctxt, return 0; } - /* - * Before memory allocations & taking nocow locks in - * bch2_data_update_init(): - */ - bch2_trans_unlock(trans); - - struct moving_io *io = kzalloc(sizeof(struct moving_io), GFP_KERNEL); + struct moving_io *io = allocate_dropping_locks(trans, ret, + kzalloc(sizeof(struct moving_io), _gfp)); if (!io) goto err; + if (ret) + goto err_free; + INIT_LIST_HEAD(&io->io_list); io->write.ctxt = ctxt; io->read_sectors = k.k->size; @@ -330,6 +386,8 @@ int bch2_move_extent(struct moving_context *ctxt, io->write.op.c = c; io->write.data_opts = data_opts; + bch2_trans_unlock(trans); + ret = bch2_data_update_bios_init(&io->write, c, &io_opts); if (ret) goto err_free; @@ -351,7 +409,8 @@ int bch2_move_extent(struct moving_context *ctxt, atomic_inc(&io->b->count); } - trace_io_move_read2(c, k); + if (trace_io_move_read_enabled()) + trace_io_move_read2(c, k); mutex_lock(&ctxt->lock); atomic_add(io->read_sectors, &ctxt->read_sectors); @@ -377,9 +436,6 @@ int bch2_move_extent(struct moving_context *ctxt, err_free: kfree(io); err: - if (bch2_err_matches(ret, BCH_ERR_data_update_done)) - return 0; - if (bch2_err_matches(ret, EROFS) || bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; @@ -395,10 +451,13 @@ err: trace_io_move_start_fail(c, buf.buf); printbuf_exit(&buf); } + + if (bch2_err_matches(ret, BCH_ERR_data_update_done)) + return 0; return ret; } -static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, +struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, struct per_snapshot_io_opts *io_opts, struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */ struct btree_iter *extent_iter, @@ -409,6 +468,9 @@ static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, struct bch_io_opts *opts_ret = &io_opts->fs_io_opts; int ret = 0; + if (extent_iter->min_depth) + return opts_ret; + if (extent_k.k->type == KEY_TYPE_reflink_v) goto out; @@ -480,6 +542,7 @@ int bch2_move_get_io_opts_one(struct btree_trans *trans, bch2_inode_opts_get(io_opts, c, &inode); } bch2_trans_iter_exit(trans, &inode_iter); + /* seem to be spinning here? 
*/ out: return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k); } @@ -559,11 +622,11 @@ static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans * return k; } -static int bch2_move_data_btree(struct moving_context *ctxt, - struct bpos start, - struct bpos end, - move_pred_fn pred, void *arg, - enum btree_id btree_id) +int bch2_move_data_btree(struct moving_context *ctxt, + struct bpos start, + struct bpos end, + move_pred_fn pred, void *arg, + enum btree_id btree_id, unsigned level) { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; @@ -589,11 +652,56 @@ static int bch2_move_data_btree(struct moving_context *ctxt, ctxt->stats->pos = BBPOS(btree_id, start); } +retry_root: bch2_trans_begin(trans); - bch2_trans_iter_init(trans, &iter, btree_id, start, - BTREE_ITER_prefetch| - BTREE_ITER_not_extents| - BTREE_ITER_all_snapshots); + + if (level == bch2_btree_id_root(c, btree_id)->level + 1) { + bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level - 1, + BTREE_ITER_prefetch| + BTREE_ITER_not_extents| + BTREE_ITER_all_snapshots); + struct btree *b = bch2_btree_iter_peek_node(trans, &iter); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto root_err; + + if (b != btree_node_root(c, b)) { + bch2_trans_iter_exit(trans, &iter); + goto retry_root; + } + + k = bkey_i_to_s_c(&b->key); + + io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, + iter.pos, &iter, k); + ret = PTR_ERR_OR_ZERO(io_opts); + if (ret) + goto root_err; + + memset(&data_opts, 0, sizeof(data_opts)); + if (!pred(c, arg, iter.btree_id, k, io_opts, &data_opts)) + goto out; + + + if (!data_opts.scrub) + ret = bch2_btree_node_rewrite_pos(trans, btree_id, level, + k.k->p, data_opts.target, 0); + else + ret = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev); + +root_err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + bch2_trans_iter_exit(trans, &iter); + goto retry_root; + } + + goto out; + } + + bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level, + BTREE_ITER_prefetch| + BTREE_ITER_not_extents| + BTREE_ITER_all_snapshots); if (ctxt->rate) bch2_ratelimit_reset(ctxt->rate); @@ -613,7 +721,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, if (ret) break; - if (bkey_ge(bkey_start_pos(k.k), end)) + if (bkey_gt(bkey_start_pos(k.k), end)) break; if (ctxt->stats) @@ -653,7 +761,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, continue; memset(&data_opts, 0, sizeof(data_opts)); - if (!pred(c, arg, k, io_opts, &data_opts)) + if (!pred(c, arg, extent_iter->btree_id, k, io_opts, &data_opts)) goto next; /* @@ -663,7 +771,14 @@ static int bch2_move_data_btree(struct moving_context *ctxt, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); + if (!level) + ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); + else if (!data_opts.scrub) + ret2 = bch2_btree_node_rewrite_pos(trans, btree_id, level, + k.k->p, data_opts.target, 0); + else + ret2 = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev); + if (ret2) { if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) continue; @@ -681,9 +796,10 @@ next: if (ctxt->stats) atomic64_add(k.k->size, &ctxt->stats->sectors_seen); next_nondata: - bch2_btree_iter_advance(trans, &iter); + if (!bch2_btree_iter_advance(trans, &iter)) + break; } - +out: bch2_trans_iter_exit(trans, &reflink_iter); bch2_trans_iter_exit(trans, &iter); 
bch2_bkey_buf_exit(&sk, c); @@ -713,7 +829,7 @@ int __bch2_move_data(struct moving_context *ctxt, ret = bch2_move_data_btree(ctxt, id == start.btree ? start.pos : POS_MIN, id == end.btree ? end.pos : POS_MAX, - pred, arg, id); + pred, arg, id, 0); if (ret) break; } @@ -740,11 +856,12 @@ int bch2_move_data(struct bch_fs *c, } static int __bch2_move_data_phys(struct moving_context *ctxt, - struct move_bucket_in_flight *bucket_in_flight, + struct move_bucket *bucket_in_flight, unsigned dev, u64 bucket_start, u64 bucket_end, unsigned data_types, + bool copygc, move_pred_fn pred, void *arg) { struct btree_trans *trans = ctxt->trans; @@ -755,6 +872,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, struct bkey_buf sk; struct bkey_s_c k; struct bkey_buf last_flushed; + u64 check_mismatch_done = bucket_start; int ret = 0; struct bch_dev *ca = bch2_dev_tryget(c, dev); @@ -765,8 +883,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, struct bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start)); struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end)); - bch2_dev_put(ca); - ca = NULL; bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); @@ -779,10 +895,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0); - bch_err_msg(c, ret, "looking up alloc key"); - if (ret) - goto err; - ret = bch2_btree_write_buffer_tryflush(trans); if (!bch2_err_matches(ret, EROFS)) bch_err_msg(c, ret, "flushing btree write buffer"); @@ -805,6 +917,14 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, if (!k.k || bkey_gt(k.k->p, bp_end)) break; + if (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) { + while (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) { + bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++, + copygc, &last_flushed); + } + continue; + } + if (k.k->type != KEY_TYPE_backpointer) goto next; @@ -837,7 +957,13 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, } struct data_update_opts data_opts = {}; - if (!pred(c, arg, k, &io_opts, &data_opts)) { + bool p = pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts); + + if (trace_io_move_pred_enabled()) + trace_io_move_pred2(c, k, &io_opts, &data_opts, + pred, arg, p); + + if (!p) { bch2_trans_iter_exit(trans, &iter); goto next; } @@ -845,7 +971,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, if (data_opts.scrub && !bch2_dev_idx_is_online(c, data_opts.read_dev)) { bch2_trans_iter_exit(trans, &iter); - ret = -BCH_ERR_device_offline; + ret = bch_err_throw(c, device_offline); break; } @@ -858,7 +984,8 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, if (!bp.v->level) ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts); else if (!data_opts.scrub) - ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0); + ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, + k.k->p, data_opts.target, 0); else ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev); @@ -879,45 +1006,48 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, next: bch2_btree_iter_advance(trans, &bp_iter); } + + while (check_mismatch_done < bucket_end) + bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++, + copygc, &last_flushed); err: bch2_trans_iter_exit(trans, &bp_iter); bch2_bkey_buf_exit(&sk, c); 
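
bch2_move_data_phys() is made non-static here and gains a declaration in move.h below, so callers outside move.c can walk a device's backpointers and move whatever a predicate selects. A hypothetical caller (extents_only_pred and example_move_dev_extents are illustration only, not part of the patch), sketching how the enum btree_id argument added to move_pred_fn can be used:

	/* Accept only keys found in the extents btree. */
	static bool extents_only_pred(struct bch_fs *c, void *arg,
				      enum btree_id btree, struct bkey_s_c k,
				      struct bch_io_opts *io_opts,
				      struct data_update_opts *data_opts)
	{
		return btree == BTREE_ID_extents;
	}

	static int example_move_dev_extents(struct bch_fs *c, unsigned dev,
					    struct bch_move_stats *stats)
	{
		/* whole device: buckets [0, U64_MAX), all data types */
		return bch2_move_data_phys(c, dev, 0, U64_MAX, ~0U,
					   NULL,	/* no rate limit */
					   stats,
					   writepoint_hashed((unsigned long) current),
					   false,	/* don't wait on copygc */
					   extents_only_pred, NULL);
	}

evacuate_bucket_pred and scrub_pred in this file follow the same shape: the predicate decides per key, and data_update_opts lets it request rewrites, scrubs, or pointer drops.
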
bch2_bkey_buf_exit(&last_flushed, c); + bch2_dev_put(ca); return ret; } -static int bch2_move_data_phys(struct bch_fs *c, - unsigned dev, - u64 start, - u64 end, - unsigned data_types, - struct bch_ratelimit *rate, - struct bch_move_stats *stats, - struct write_point_specifier wp, - bool wait_on_copygc, - move_pred_fn pred, void *arg) +int bch2_move_data_phys(struct bch_fs *c, + unsigned dev, + u64 start, + u64 end, + unsigned data_types, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc, + move_pred_fn pred, void *arg) { struct moving_context ctxt; bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans)); bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - ctxt.stats->phys = true; - ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys; + if (ctxt.stats) { + ctxt.stats->phys = true; + ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys; + } - int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, data_types, pred, arg); + int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, + data_types, false, pred, arg); bch2_moving_ctxt_exit(&ctxt); return ret; } -struct evacuate_bucket_arg { - struct bpos bucket; - int gen; - struct data_update_opts data_opts; -}; - -static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, struct bkey_s_c k, +static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { @@ -938,17 +1068,23 @@ static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, struct bkey_s_c k } int bch2_evacuate_bucket(struct moving_context *ctxt, - struct move_bucket_in_flight *bucket_in_flight, - struct bpos bucket, int gen, - struct data_update_opts data_opts) + struct move_bucket *bucket_in_flight, + struct bpos bucket, int gen, + struct data_update_opts data_opts) { + struct bch_fs *c = ctxt->trans->c; struct evacuate_bucket_arg arg = { bucket, gen, data_opts, }; + count_event(c, io_move_evacuate_bucket); + if (trace_io_move_evacuate_bucket_enabled()) + trace_io_move_evacuate_bucket2(c, bucket, gen); + return __bch2_move_data_phys(ctxt, bucket_in_flight, bucket.inode, bucket.offset, bucket.offset + 1, ~0, + true, evacuate_bucket_pred, &arg); } @@ -1006,7 +1142,7 @@ retry: if (!pred(c, arg, b, &io_opts, &data_opts)) goto next; - ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret; + ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0) ?: ret; if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) @@ -1031,7 +1167,7 @@ next: } static bool rereplicate_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { @@ -1040,7 +1176,7 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, ? 
c->opts.metadata_replicas : io_opts->data_replicas; - rcu_read_lock(); + guard(rcu)(); struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); unsigned i = 0; bkey_for_each_ptr(ptrs, ptr) { @@ -1050,7 +1186,6 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, data_opts->kill_ptrs |= BIT(i); i++; } - rcu_read_unlock(); if (!data_opts->kill_ptrs && (!nr_good || nr_good >= replicas)) @@ -1063,7 +1198,7 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, } static bool migrate_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { @@ -1090,7 +1225,7 @@ static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { - return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); + return rereplicate_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), io_opts, data_opts); } /* @@ -1146,7 +1281,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) } static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { @@ -1158,7 +1293,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, struct extent_ptr_decoded p; unsigned i = 0; - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { unsigned d = bch2_extent_ptr_durability(c, &p); @@ -1169,7 +1304,6 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, i++; } - rcu_read_unlock(); return data_opts->kill_ptrs != 0; } @@ -1179,11 +1313,12 @@ static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { - return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); + return drop_extra_replicas_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), + io_opts, data_opts); } static bool scrub_pred(struct bch_fs *c, void *_arg, - struct bkey_s_c k, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 51e0505a8156..86b80499ac55 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -72,7 +72,7 @@ do { \ break; \ } while (1) -typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, +typedef bool (*move_pred_fn)(struct bch_fs *, void *, enum btree_id, struct bkey_s_c, struct bch_io_opts *, struct data_update_opts *); extern const char * const bch2_data_ops_strs[]; @@ -116,12 +116,18 @@ int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); int bch2_move_extent(struct moving_context *, - struct move_bucket_in_flight *, + struct move_bucket *, struct btree_iter *, struct bkey_s_c, struct bch_io_opts, struct data_update_opts); +struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *, + struct per_snapshot_io_opts *, struct bpos, + struct btree_iter *, struct bkey_s_c); + +int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos, + move_pred_fn, void *, enum btree_id, unsigned); int __bch2_move_data(struct moving_context *, struct bbpos, struct bbpos, @@ -135,8 +141,13 @@ int bch2_move_data(struct bch_fs *, bool, move_pred_fn, void *); +int bch2_move_data_phys(struct bch_fs *, unsigned, u64, 
u64, unsigned, + struct bch_ratelimit *, struct bch_move_stats *, + struct write_point_specifier, bool, + move_pred_fn, void *); + int bch2_evacuate_bucket(struct moving_context *, - struct move_bucket_in_flight *, + struct move_bucket *, struct bpos, int, struct data_update_opts); int bch2_data_job(struct bch_fs *, diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h index 807f779f6f76..c5c62cd600de 100644 --- a/fs/bcachefs/move_types.h +++ b/fs/bcachefs/move_types.h @@ -36,14 +36,10 @@ struct move_bucket_key { }; struct move_bucket { + struct move_bucket *next; + struct rhash_head hash; struct move_bucket_key k; unsigned sectors; -}; - -struct move_bucket_in_flight { - struct move_bucket_in_flight *next; - struct rhash_head hash; - struct move_bucket bucket; atomic_t count; }; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 96873372b516..6d7b1d5f7697 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -8,6 +8,7 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "backpointers.h" #include "btree_iter.h" #include "btree_update.h" #include "btree_write_buffer.h" @@ -27,47 +28,32 @@ #include <linux/wait.h> struct buckets_in_flight { - struct rhashtable table; - struct move_bucket_in_flight *first; - struct move_bucket_in_flight *last; - size_t nr; - size_t sectors; + struct rhashtable table; + struct move_bucket *first; + struct move_bucket *last; + size_t nr; + size_t sectors; + + DARRAY(struct move_bucket *) to_evacuate; }; static const struct rhashtable_params bch_move_bucket_params = { - .head_offset = offsetof(struct move_bucket_in_flight, hash), - .key_offset = offsetof(struct move_bucket_in_flight, bucket.k), + .head_offset = offsetof(struct move_bucket, hash), + .key_offset = offsetof(struct move_bucket, k), .key_len = sizeof(struct move_bucket_key), .automatic_shrinking = true, }; -static struct move_bucket_in_flight * -move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b) +static void move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket *b) { - struct move_bucket_in_flight *new = kzalloc(sizeof(*new), GFP_KERNEL); - int ret; - - if (!new) - return ERR_PTR(-ENOMEM); - - new->bucket = b; - - ret = rhashtable_lookup_insert_fast(&list->table, &new->hash, - bch_move_bucket_params); - if (ret) { - kfree(new); - return ERR_PTR(ret); - } - if (!list->first) - list->first = new; + list->first = b; else - list->last->next = new; + list->last->next = b; - list->last = new; + list->last = b; list->nr++; - list->sectors += b.sectors; - return new; + list->sectors += b->sectors; } static int bch2_bucket_is_movable(struct btree_trans *trans, @@ -89,9 +75,12 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, if (!ca) goto out; + if (bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b->k.bucket.offset)) + goto out; + if (ca->mi.state != BCH_MEMBER_STATE_rw || !bch2_dev_is_online(ca)) - goto out_put; + goto out; struct bch_alloc_v4 _a; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); @@ -100,19 +89,26 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); ret = lru_idx && lru_idx <= time; -out_put: - bch2_dev_put(ca); out: + bch2_dev_put(ca); bch2_trans_iter_exit(trans, &iter); return ret; } +static void move_bucket_free(struct buckets_in_flight *list, + struct move_bucket *b) +{ + int ret = rhashtable_remove_fast(&list->table, &b->hash, + bch_move_bucket_params); + 
BUG_ON(ret); + kfree(b); +} + static void move_buckets_wait(struct moving_context *ctxt, struct buckets_in_flight *list, bool flush) { - struct move_bucket_in_flight *i; - int ret; + struct move_bucket *i; while ((i = list->first)) { if (flush) @@ -126,12 +122,9 @@ static void move_buckets_wait(struct moving_context *ctxt, list->last = NULL; list->nr--; - list->sectors -= i->bucket.sectors; + list->sectors -= i->sectors; - ret = rhashtable_remove_fast(&list->table, &i->hash, - bch_move_bucket_params); - BUG_ON(ret); - kfree(i); + move_bucket_free(list, i); } bch2_trans_unlock_long(ctxt->trans); @@ -143,11 +136,8 @@ static bool bucket_in_flight(struct buckets_in_flight *list, return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params); } -typedef DARRAY(struct move_bucket) move_buckets; - static int bch2_copygc_get_buckets(struct moving_context *ctxt, - struct buckets_in_flight *buckets_in_flight, - move_buckets *buckets) + struct buckets_in_flight *buckets_in_flight) { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; @@ -164,8 +154,6 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret))) return ret; - bch2_trans_begin(trans); - ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0), lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX), @@ -184,20 +172,34 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, else if (bucket_in_flight(buckets_in_flight, b.k)) in_flight++; else { - ret2 = darray_push(buckets, b); + struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL); + ret2 = b_i ? 0 : -ENOMEM; if (ret2) goto err; + + *b_i = b; + + ret2 = darray_push(&buckets_in_flight->to_evacuate, b_i); + if (ret2) { + kfree(b_i); + goto err; + } + + ret2 = rhashtable_lookup_insert_fast(&buckets_in_flight->table, &b_i->hash, + bch_move_bucket_params); + BUG_ON(ret2); + sectors += b.sectors; } - ret2 = buckets->nr >= nr_to_get; + ret2 = buckets_in_flight->to_evacuate.nr >= nr_to_get; err: ret2; })); pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i", buckets_in_flight->nr, buckets_in_flight->sectors, - saw, in_flight, not_movable, buckets->nr, sectors, nr_to_get, ret); + saw, in_flight, not_movable, buckets_in_flight->to_evacuate.nr, sectors, nr_to_get, ret); return ret < 0 ? 
ret : 0; } @@ -212,40 +214,30 @@ static int bch2_copygc(struct moving_context *ctxt, struct data_update_opts data_opts = { .btree_insert_flags = BCH_WATERMARK_copygc, }; - move_buckets buckets = { 0 }; - struct move_bucket_in_flight *f; u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen); u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved); int ret = 0; - ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets); + ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight); if (ret) goto err; - darray_for_each(buckets, i) { + darray_for_each(buckets_in_flight->to_evacuate, i) { if (kthread_should_stop() || freezing(current)) break; - f = move_bucket_in_flight_add(buckets_in_flight, *i); - ret = PTR_ERR_OR_ZERO(f); - if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */ - ret = 0; - continue; - } - if (ret == -ENOMEM) { /* flush IO, continue later */ - ret = 0; - break; - } + struct move_bucket *b = *i; + *i = NULL; + + move_bucket_in_flight_add(buckets_in_flight, b); - ret = bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket, - f->bucket.k.gen, data_opts); + ret = bch2_evacuate_bucket(ctxt, b, b->k.bucket, b->k.gen, data_opts); if (ret) goto err; *did_work = true; } err: - /* no entries in LRU btree found, or got to end: */ if (bch2_err_matches(ret, ENOENT)) ret = 0; @@ -255,12 +247,34 @@ err: sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen; sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved; - trace_and_count(c, copygc, c, buckets.nr, sectors_seen, sectors_moved); + trace_and_count(c, copygc, c, buckets_in_flight->to_evacuate.nr, sectors_seen, sectors_moved); - darray_exit(&buckets); + darray_for_each(buckets_in_flight->to_evacuate, i) + if (*i) + move_bucket_free(buckets_in_flight, *i); + darray_exit(&buckets_in_flight->to_evacuate); return ret; } +static u64 bch2_copygc_dev_wait_amount(struct bch_dev *ca) +{ + struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca); + struct bch_dev_usage usage; + + for (unsigned i = 0; i < BCH_DATA_NR; i++) + usage.buckets[i] = usage_full.d[i].buckets; + + s64 fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * + ca->mi.bucket_size) >> 1); + s64 fragmented = 0; + + for (unsigned i = 0; i < BCH_DATA_NR; i++) + if (data_type_movable(i)) + fragmented += usage_full.d[i].fragmented; + + return max(0LL, fragmented_allowed - fragmented); +} + /* * Copygc runs when the amount of fragmented data is above some arbitrary * threshold: @@ -275,28 +289,13 @@ err: * often and continually reduce the amount of fragmented space as the device * fills up. So, we increase the threshold by half the current free space. 
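The threshold described in this comment is what the new bch2_copygc_dev_wait_amount() computes per device. The following self-contained C sketch mirrors that arithmetic with invented types and sample numbers (none of it is taken from the patch): fragmented data is allowed up to half of the currently free space, and copygc waits on the smallest per-device remainder.

#include <stdio.h>
#include <stdint.h>

struct dev_sketch {
	uint64_t free_buckets;	/* buckets available at the stripe watermark */
	uint64_t bucket_size;	/* bucket size, in sectors */
	uint64_t fragmented;	/* fragmented sectors in movable data types */
};

static int64_t dev_wait_amount(const struct dev_sketch *d)
{
	/* allow fragmented data up to half of the currently free space */
	int64_t fragmented_allowed = (int64_t)(d->free_buckets * d->bucket_size) >> 1;
	int64_t wait = fragmented_allowed - (int64_t)d->fragmented;

	return wait > 0 ? wait : 0;
}

int main(void)
{
	struct dev_sketch devs[] = {
		{ .free_buckets = 1000, .bucket_size = 1024, .fragmented = 200000 },
		{ .free_buckets =  500, .bucket_size = 1024, .fragmented = 400000 },
	};
	int64_t wait = INT64_MAX;

	/* the filesystem-wide wait is the minimum over all rw devices */
	for (unsigned i = 0; i < sizeof(devs) / sizeof(devs[0]); i++) {
		int64_t w = dev_wait_amount(&devs[i]);
		if (w < wait)
			wait = w;
	}

	printf("copygc wait: %lld sectors\n", (long long)wait);
	return 0;
}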
*/ -unsigned long bch2_copygc_wait_amount(struct bch_fs *c) +u64 bch2_copygc_wait_amount(struct bch_fs *c) { - s64 wait = S64_MAX, fragmented_allowed, fragmented; - - for_each_rw_member(c, ca) { - struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca); - struct bch_dev_usage usage; - - for (unsigned i = 0; i < BCH_DATA_NR; i++) - usage.buckets[i] = usage_full.d[i].buckets; - - fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * - ca->mi.bucket_size) >> 1); - fragmented = 0; - - for (unsigned i = 0; i < BCH_DATA_NR; i++) - if (data_type_movable(i)) - fragmented += usage_full.d[i].fragmented; - - wait = min(wait, max(0LL, fragmented_allowed - fragmented)); - } + u64 wait = U64_MAX; + guard(rcu)(); + for_each_rw_member_rcu(c, ca) + wait = min(wait, bch2_copygc_dev_wait_amount(ca)); return wait; } @@ -318,15 +317,23 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) c->copygc_wait_at) << 9); prt_newline(out); - prt_printf(out, "Currently calculated wait:\t"); - prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); - prt_newline(out); + bch2_printbuf_make_room(out, 4096); - rcu_read_lock(); - struct task_struct *t = rcu_dereference(c->copygc_thread); - if (t) - get_task_struct(t); - rcu_read_unlock(); + struct task_struct *t; + out->atomic++; + scoped_guard(rcu) { + prt_printf(out, "Currently calculated wait:\n"); + for_each_rw_member_rcu(c, ca) { + prt_printf(out, " %s:\t", ca->name); + prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca)); + prt_newline(out); + } + + t = rcu_dereference(c->copygc_thread); + if (t) + get_task_struct(t); + } + --out->atomic; if (t) { bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); @@ -340,19 +347,13 @@ static int bch2_copygc_thread(void *arg) struct moving_context ctxt; struct bch_move_stats move_stats; struct io_clock *clock = &c->io_clock[WRITE]; - struct buckets_in_flight *buckets; + struct buckets_in_flight buckets = {}; u64 last, wait; - int ret = 0; - buckets = kzalloc(sizeof(struct buckets_in_flight), GFP_KERNEL); - if (!buckets) - return -ENOMEM; - ret = rhashtable_init(&buckets->table, &bch_move_bucket_params); + int ret = rhashtable_init(&buckets.table, &bch_move_bucket_params); bch_err_msg(c, ret, "allocating copygc buckets in flight"); - if (ret) { - kfree(buckets); + if (ret) return ret; - } set_freezable(); @@ -360,7 +361,7 @@ static int bch2_copygc_thread(void *arg) * Data move operations can't run until after check_snapshots has * completed, and bch2_snapshot_is_ancestor() is available. 
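Several hunks in this series (rereplicate_pred, bch2_copygc_wait_amount, bch2_copygc_wait_to_text, bch2_copygc_wakeup, and others) replace open-coded rcu_read_lock()/rcu_read_unlock() pairs with the scope-based guard(rcu)() and scoped_guard(rcu) helpers, so the read-side critical section ends automatically when the block is left. A rough userspace analogue of that pattern, built on the compiler cleanup attribute with a mutex standing in for RCU (all names below are invented for illustration, not kernel APIs):

#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;
static int counter = 42;

/* cleanup handler: runs when the guard variable goes out of scope */
static void demo_unlock(pthread_mutex_t **lockp)
{
	pthread_mutex_unlock(*lockp);
}

/* take the lock now, drop it automatically at the end of the enclosing scope */
#define DEMO_GUARD(lock)						\
	pthread_mutex_t *demo_guard					\
		__attribute__((cleanup(demo_unlock))) =			\
		(pthread_mutex_lock(lock), (lock))

static int read_counter(void)
{
	DEMO_GUARD(&demo_lock);
	/* any return path from here on releases demo_lock automatically */
	return counter;
}

int main(void)
{
	printf("%d\n", read_counter());
	return 0;
}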
*/ - kthread_wait_freezable(c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots || + kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots || kthread_should_stop()); bch2_move_stats_init(&move_stats, "copygc"); @@ -375,13 +376,13 @@ static int bch2_copygc_thread(void *arg) cond_resched(); if (!c->opts.copygc_enabled) { - move_buckets_wait(&ctxt, buckets, true); + move_buckets_wait(&ctxt, &buckets, true); kthread_wait_freezable(c->opts.copygc_enabled || kthread_should_stop()); } if (unlikely(freezing(current))) { - move_buckets_wait(&ctxt, buckets, true); + move_buckets_wait(&ctxt, &buckets, true); __refrigerator(false); continue; } @@ -392,7 +393,7 @@ static int bch2_copygc_thread(void *arg) if (wait > clock->max_slop) { c->copygc_wait_at = last; c->copygc_wait = last + wait; - move_buckets_wait(&ctxt, buckets, true); + move_buckets_wait(&ctxt, &buckets, true); trace_and_count(c, copygc_wait, c, wait, last + wait); bch2_kthread_io_clock_wait(clock, last + wait, MAX_SCHEDULE_TIMEOUT); @@ -402,7 +403,7 @@ static int bch2_copygc_thread(void *arg) c->copygc_wait = 0; c->copygc_running = true; - ret = bch2_copygc(&ctxt, buckets, &did_work); + ret = bch2_copygc(&ctxt, &buckets, &did_work); c->copygc_running = false; wake_up(&c->copygc_running_wq); @@ -413,16 +414,14 @@ static int bch2_copygc_thread(void *arg) if (min_member_capacity == U64_MAX) min_member_capacity = 128 * 2048; - move_buckets_wait(&ctxt, buckets, true); + move_buckets_wait(&ctxt, &buckets, true); bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6), MAX_SCHEDULE_TIMEOUT); } } - move_buckets_wait(&ctxt, buckets, true); - - rhashtable_destroy(&buckets->table); - kfree(buckets); + move_buckets_wait(&ctxt, &buckets, true); + rhashtable_destroy(&buckets.table); bch2_moving_ctxt_exit(&ctxt); bch2_move_stats_exit(&move_stats, c); diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h index d1885cf67a45..f615910d6f98 100644 --- a/fs/bcachefs/movinggc.h +++ b/fs/bcachefs/movinggc.h @@ -2,16 +2,15 @@ #ifndef _BCACHEFS_MOVINGGC_H #define _BCACHEFS_MOVINGGC_H -unsigned long bch2_copygc_wait_amount(struct bch_fs *); +u64 bch2_copygc_wait_amount(struct bch_fs *); void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *); static inline void bch2_copygc_wakeup(struct bch_fs *c) { - rcu_read_lock(); + guard(rcu)(); struct task_struct *p = rcu_dereference(c->copygc_thread); if (p) wake_up_process(p); - rcu_read_unlock(); } void bch2_copygc_stop(struct bch_fs *); diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c index 9136a9097789..24120037c031 100644 --- a/fs/bcachefs/namei.c +++ b/fs/bcachefs/namei.c @@ -11,6 +11,14 @@ #include <linux/posix_acl.h> +static inline subvol_inum parent_inum(subvol_inum inum, struct bch_inode_unpacked *inode) +{ + return (subvol_inum) { + .subvol = inode->bi_parent_subvol ?: inum.subvol, + .inum = inode->bi_dir, + }; +} + static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode) { return S_ISDIR(inode->bi_mode) && !inode->bi_subvol; @@ -49,7 +57,7 @@ int bch2_create_trans(struct btree_trans *trans, if (!(flags & BCH_CREATE_SNAPSHOT)) { /* Normal create path - allocate a new inode: */ - bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); + bch2_inode_init_late(c, new_inode, now, uid, gid, mode, rdev, dir_u); if (flags & BCH_CREATE_TMPFILE) new_inode->bi_flags |= BCH_INODE_unlinked; @@ -279,7 +287,7 @@ int bch2_unlink_trans(struct btree_trans *trans, } if (deleting_subvol && !inode_u->bi_subvol) { - ret = 
-BCH_ERR_ENOENT_not_subvol; + ret = bch_err_throw(c, ENOENT_not_subvol); goto err; } @@ -404,8 +412,7 @@ int bch2_rename_trans(struct btree_trans *trans, src_hash = bch2_hash_info_init(c, src_dir_u); - if (dst_dir.inum != src_dir.inum || - dst_dir.subvol != src_dir.subvol) { + if (!subvol_inum_eq(dst_dir, src_dir)) { ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, BTREE_ITER_intent); if (ret) @@ -418,8 +425,8 @@ int bch2_rename_trans(struct btree_trans *trans, } ret = bch2_dirent_rename(trans, - src_dir, &src_hash, &src_dir_u->bi_size, - dst_dir, &dst_hash, &dst_dir_u->bi_size, + src_dir, &src_hash, + dst_dir, &dst_hash, src_name, &src_inum, &src_offset, dst_name, &dst_inum, &dst_offset, mode); @@ -497,32 +504,41 @@ int bch2_rename_trans(struct btree_trans *trans, } } - if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && - S_ISDIR(src_inode_u->bi_mode)) { - ret = -EXDEV; - goto err; - } + if (!subvol_inum_eq(dst_dir, src_dir)) { + if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && + S_ISDIR(src_inode_u->bi_mode)) { + ret = -EXDEV; + goto err; + } - if (mode == BCH_RENAME_EXCHANGE && - bch2_reinherit_attrs(dst_inode_u, src_dir_u) && - S_ISDIR(dst_inode_u->bi_mode)) { - ret = -EXDEV; - goto err; - } + if (mode == BCH_RENAME_EXCHANGE && + bch2_reinherit_attrs(dst_inode_u, src_dir_u) && + S_ISDIR(dst_inode_u->bi_mode)) { + ret = -EXDEV; + goto err; + } - if (is_subdir_for_nlink(src_inode_u)) { - src_dir_u->bi_nlink--; - dst_dir_u->bi_nlink++; - } + ret = bch2_maybe_propagate_has_case_insensitive(trans, src_inum, src_inode_u) ?: + (mode == BCH_RENAME_EXCHANGE + ? bch2_maybe_propagate_has_case_insensitive(trans, dst_inum, dst_inode_u) + : 0); + if (ret) + goto err; - if (S_ISDIR(src_inode_u->bi_mode) && - !src_inode_u->bi_subvol) - src_inode_u->bi_depth = dst_dir_u->bi_depth + 1; + if (is_subdir_for_nlink(src_inode_u)) { + src_dir_u->bi_nlink--; + dst_dir_u->bi_nlink++; + } - if (mode == BCH_RENAME_EXCHANGE && - S_ISDIR(dst_inode_u->bi_mode) && - !dst_inode_u->bi_subvol) - dst_inode_u->bi_depth = src_dir_u->bi_depth + 1; + if (S_ISDIR(src_inode_u->bi_mode) && + !src_inode_u->bi_subvol) + src_inode_u->bi_depth = dst_dir_u->bi_depth + 1; + + if (mode == BCH_RENAME_EXCHANGE && + S_ISDIR(dst_inode_u->bi_mode) && + !dst_inode_u->bi_subvol) + dst_inode_u->bi_depth = src_dir_u->bi_depth + 1; + } if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { dst_dir_u->bi_nlink--; @@ -593,31 +609,39 @@ static inline void reverse_bytes(void *b, size_t n) } } -/* XXX: we don't yet attempt to print paths when we don't know the subvol */ -int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printbuf *path) +static int __bch2_inum_to_path(struct btree_trans *trans, + u32 subvol, u64 inum, u32 snapshot, + struct printbuf *path) { unsigned orig_pos = path->pos; int ret = 0; - while (!(inum.subvol == BCACHEFS_ROOT_SUBVOL && - inum.inum == BCACHEFS_ROOT_INO)) { + while (true) { + if (!snapshot) { + ret = bch2_subvolume_get_snapshot(trans, subvol, &snapshot); + if (ret) + goto disconnected; + } + struct bch_inode_unpacked inode; - ret = bch2_inode_find_by_inum_trans(trans, inum, &inode); + ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, &inode, 0); if (ret) goto disconnected; + if (inode.bi_subvol == BCACHEFS_ROOT_SUBVOL && + inode.bi_inum == BCACHEFS_ROOT_INO) + break; + if (!inode.bi_dir && !inode.bi_dir_offset) { - ret = -BCH_ERR_ENOENT_inode_no_backpointer; + ret = bch_err_throw(trans->c, ENOENT_inode_no_backpointer); goto disconnected; } - inum.subvol = 
inode.bi_parent_subvol ?: inum.subvol; - inum.inum = inode.bi_dir; - - u32 snapshot; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto disconnected; + inum = inode.bi_dir; + if (inode.bi_parent_subvol) { + subvol = inode.bi_parent_subvol; + snapshot = 0; + } struct btree_iter d_iter; struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter, @@ -654,6 +678,20 @@ disconnected: goto out; } +int bch2_inum_to_path(struct btree_trans *trans, + subvol_inum inum, + struct printbuf *path) +{ + return __bch2_inum_to_path(trans, inum.subvol, inum.inum, 0, path); +} + +int bch2_inum_snapshot_to_path(struct btree_trans *trans, u64 inum, u32 snapshot, + snapshot_id_list *snapshot_overwrites, + struct printbuf *path) +{ + return __bch2_inum_to_path(trans, 0, inum, snapshot, path); +} + /* fsck */ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, @@ -695,15 +733,6 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, return __bch2_fsck_write_inode(trans, target); } - if (bch2_inode_should_have_single_bp(target) && - !fsck_err(trans, inode_wrong_backpointer, - "dirent points to inode that does not point back:\n%s", - (bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_newline(&buf), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf))) - goto err; - struct bkey_s_c_dirent bp_dirent = bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents, SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot), @@ -730,6 +759,7 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, ret = __bch2_fsck_write_inode(trans, target); } } else { + printbuf_reset(&buf); bch2_bkey_val_to_text(&buf, c, d.s_c); prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); @@ -819,7 +849,8 @@ int __bch2_check_dirent_target(struct btree_trans *trans, n->v.d_inum = cpu_to_le64(target->bi_inum); } - ret = bch2_trans_update(trans, dirent_iter, &n->k_i, 0); + ret = bch2_trans_update(trans, dirent_iter, &n->k_i, + BTREE_UPDATE_internal_snapshot_node); if (ret) goto err; } @@ -829,3 +860,149 @@ fsck_err: bch_err_fn(c, ret); return ret; } + +/* + * BCH_INODE_has_case_insensitive: + * We have to track whether directories have any descendent directory that is + * casefolded - for overlayfs: + */ + +static int bch2_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum) +{ + struct btree_iter iter = {}; + int ret = 0; + + while (true) { + struct bch_inode_unpacked inode; + ret = bch2_inode_peek(trans, &iter, &inode, inum, + BTREE_ITER_intent|BTREE_ITER_with_updates); + if (ret) + break; + + if (inode.bi_flags & BCH_INODE_has_case_insensitive) + break; + + inode.bi_flags |= BCH_INODE_has_case_insensitive; + ret = bch2_inode_write(trans, &iter, &inode); + if (ret) + break; + + bch2_trans_iter_exit(trans, &iter); + if (subvol_inum_eq(inum, BCACHEFS_ROOT_SUBVOL_INUM)) + break; + + inum = parent_inum(inum, &inode); + } + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + if (!bch2_inode_casefold(trans->c, inode)) + return 0; + + inode->bi_flags |= BCH_INODE_has_case_insensitive; + + return bch2_propagate_has_case_insensitive(trans, parent_inum(inum, inode)); +} + +int bch2_check_inode_has_case_insensitive(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + snapshot_id_list *snapshot_overwrites, + bool *do_update) +{ + struct printbuf buf = PRINTBUF; + bool 
repairing_parents = false; + int ret = 0; + + if (!S_ISDIR(inode->bi_mode)) { + /* + * Old versions set bi_casefold for non dirs, but that's + * unnecessary and wasteful + */ + if (inode->bi_casefold) { + inode->bi_casefold = 0; + *do_update = true; + } + return 0; + } + + if (trans->c->sb.version < bcachefs_metadata_version_inode_has_case_insensitive) + return 0; + + if (bch2_inode_casefold(trans->c, inode) && + !(inode->bi_flags & BCH_INODE_has_case_insensitive)) { + prt_printf(&buf, "casefolded dir with has_case_insensitive not set\ninum %llu:%u ", + inode->bi_inum, inode->bi_snapshot); + + ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, inode->bi_snapshot, + snapshot_overwrites, &buf); + if (ret) + goto err; + + if (fsck_err(trans, inode_has_case_insensitive_not_set, "%s", buf.buf)) { + inode->bi_flags |= BCH_INODE_has_case_insensitive; + *do_update = true; + } + } + + if (!(inode->bi_flags & BCH_INODE_has_case_insensitive)) + goto out; + + struct bch_inode_unpacked dir = *inode; + u32 snapshot = dir.bi_snapshot; + + while (!(dir.bi_inum == BCACHEFS_ROOT_INO && + dir.bi_subvol == BCACHEFS_ROOT_SUBVOL)) { + if (dir.bi_parent_subvol) { + ret = bch2_subvolume_get_snapshot(trans, dir.bi_parent_subvol, &snapshot); + if (ret) + goto err; + + snapshot_overwrites = NULL; + } + + ret = bch2_inode_find_by_inum_snapshot(trans, dir.bi_dir, snapshot, &dir, 0); + if (ret) + goto err; + + if (!(dir.bi_flags & BCH_INODE_has_case_insensitive)) { + prt_printf(&buf, "parent of casefolded dir with has_case_insensitive not set\n"); + + ret = bch2_inum_snapshot_to_path(trans, dir.bi_inum, dir.bi_snapshot, + snapshot_overwrites, &buf); + if (ret) + goto err; + + if (fsck_err(trans, inode_parent_has_case_insensitive_not_set, "%s", buf.buf)) { + dir.bi_flags |= BCH_INODE_has_case_insensitive; + ret = __bch2_fsck_write_inode(trans, &dir); + if (ret) + goto err; + } + } + + /* + * We only need to check the first parent, unless we find an + * inconsistency + */ + if (!repairing_parents) + break; + } +out: +err: +fsck_err: + printbuf_exit(&buf); + if (ret) + return ret; + + if (repairing_parents) { + return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; + } + + return 0; +} diff --git a/fs/bcachefs/namei.h b/fs/bcachefs/namei.h index 2e6f6364767f..ae6ebc2d0785 100644 --- a/fs/bcachefs/namei.h +++ b/fs/bcachefs/namei.h @@ -43,6 +43,8 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *, struct bch_inode_unpacked *); int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); +int bch2_inum_snapshot_to_path(struct btree_trans *, u64, u32, + snapshot_id_list *, struct printbuf *); int __bch2_check_dirent_target(struct btree_trans *, struct btree_iter *, @@ -69,4 +71,9 @@ static inline int bch2_check_dirent_target(struct btree_trans *trans, return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck); } +int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *); +int bch2_check_inode_has_case_insensitive(struct btree_trans *, struct bch_inode_unpacked *, + snapshot_id_list *, bool *); + #endif /* _BCACHEFS_NAMEI_H */ diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c index 3c21981a4a1c..962218fa68ec 100644 --- a/fs/bcachefs/nocow_locking.c +++ b/fs/bcachefs/nocow_locking.c @@ -133,12 +133,10 @@ void bch2_fs_nocow_locking_exit(struct bch_fs *c) BUG_ON(atomic_read(&l->l[j])); } -int bch2_fs_nocow_locking_init(struct bch_fs *c) +void 
bch2_fs_nocow_locking_init_early(struct bch_fs *c) { struct bucket_nocow_lock_table *t = &c->nocow_locks; for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) spin_lock_init(&l->lock); - - return 0; } diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h index f9d6a426a960..48b8a003c0d2 100644 --- a/fs/bcachefs/nocow_locking.h +++ b/fs/bcachefs/nocow_locking.h @@ -45,6 +45,6 @@ static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t, void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *); void bch2_fs_nocow_locking_exit(struct bch_fs *); -int bch2_fs_nocow_locking_init(struct bch_fs *); +void bch2_fs_nocow_locking_init_early(struct bch_fs *); #endif /* _BCACHEFS_NOCOW_LOCKING_H */ diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index af3258814822..b1cf88905b81 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -7,7 +7,9 @@ #include "compress.h" #include "disk_groups.h" #include "error.h" +#include "movinggc.h" #include "opts.h" +#include "rebalance.h" #include "recovery_passes.h" #include "super-io.h" #include "util.h" @@ -19,6 +21,11 @@ const char * const bch2_error_actions[] = { NULL }; +const char * const bch2_degraded_actions[] = { + BCH_DEGRADED_ACTIONS() + NULL +}; + const char * const bch2_fsck_fix_opts[] = { BCH_FIX_ERRORS_OPTS() NULL @@ -273,20 +280,20 @@ int bch2_opt_lookup(const char *name) return -1; } -struct synonym { +struct opt_synonym { const char *s1, *s2; }; -static const struct synonym bch_opt_synonyms[] = { +static const struct opt_synonym bch2_opt_synonyms[] = { { "quota", "usrquota" }, }; static int bch2_mount_opt_lookup(const char *name) { - const struct synonym *i; + const struct opt_synonym *i; - for (i = bch_opt_synonyms; - i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms); + for (i = bch2_opt_synonyms; + i < bch2_opt_synonyms + ARRAY_SIZE(bch2_opt_synonyms); i++) if (!strcmp(name, i->s1)) name = i->s2; @@ -294,6 +301,30 @@ static int bch2_mount_opt_lookup(const char *name) return bch2_opt_lookup(name); } +struct opt_val_synonym { + const char *opt, *v1, *v2; +}; + +static const struct opt_val_synonym bch2_opt_val_synonyms[] = { + { "degraded", "true", "yes" }, + { "degraded", "false", "no" }, + { "degraded", "1", "yes" }, + { "degraded", "0", "no" }, +}; + +static const char *bch2_opt_val_synonym_lookup(const char *opt, const char *val) +{ + const struct opt_val_synonym *i; + + for (i = bch2_opt_val_synonyms; + i < bch2_opt_val_synonyms + ARRAY_SIZE(bch2_opt_val_synonyms); + i++) + if (!strcmp(opt, i->opt) && !strcmp(val, i->v1)) + return i->v2; + + return val; +} + int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) { if (v < opt->min) { @@ -337,21 +368,22 @@ int bch2_opt_parse(struct bch_fs *c, { ssize_t ret; + if (err) + printbuf_indent_add_nextline(err, 2); + switch (opt->type) { case BCH_OPT_BOOL: - if (val) { - ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); - if (ret != -BCH_ERR_option_not_bool) { - *res = ret; - } else { - if (err) - prt_printf(err, "%s: must be bool", opt->attr.name); - return ret; - } + if (!val) + val = "1"; + + ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); + if (ret != -BCH_ERR_option_not_bool) { + *res = ret; } else { - *res = 1; + if (err) + prt_printf(err, "%s: must be bool", opt->attr.name); + return ret; } - break; case BCH_OPT_UINT: if (!val) { @@ -360,9 +392,15 @@ int bch2_opt_parse(struct bch_fs *c, return -EINVAL; } - ret = opt->flags & OPT_HUMAN_READABLE 
- ? bch2_strtou64_h(val, res) - : kstrtou64(val, 10, res); + if (*val != '-') { + ret = opt->flags & OPT_HUMAN_READABLE + ? bch2_strtou64_h(val, res) + : kstrtou64(val, 10, res); + } else { + prt_printf(err, "%s: must be a non-negative number", opt->attr.name); + return -BCH_ERR_option_negative; + } + if (ret < 0) { if (err) prt_printf(err, "%s: must be a number", @@ -480,7 +518,7 @@ void bch2_opts_to_text(struct printbuf *out, } } -int bch2_opt_check_may_set(struct bch_fs *c, struct bch_dev *ca, int id, u64 v) +int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id id, u64 v) { int ret = 0; @@ -498,15 +536,17 @@ int bch2_opt_check_may_set(struct bch_fs *c, struct bch_dev *ca, int id, u64 v) if (v) bch2_check_set_feature(c, BCH_FEATURE_ec); break; + default: + break; } return ret; } -int bch2_opts_check_may_set(struct bch_fs *c) +int bch2_opts_hooks_pre_set(struct bch_fs *c) { for (unsigned i = 0; i < bch2_opts_nr; i++) { - int ret = bch2_opt_check_may_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i)); + int ret = bch2_opt_hook_pre_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i)); if (ret) return ret; } @@ -514,6 +554,61 @@ int bch2_opts_check_may_set(struct bch_fs *c) return 0; } +void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, + struct bch_opts *new_opts, enum bch_opt_id id) +{ + switch (id) { + case Opt_foreground_target: + if (new_opts->foreground_target && + !new_opts->background_target) + bch2_set_rebalance_needs_scan(c, inum); + break; + case Opt_compression: + if (new_opts->compression && + !new_opts->background_compression) + bch2_set_rebalance_needs_scan(c, inum); + break; + case Opt_background_target: + if (new_opts->background_target) + bch2_set_rebalance_needs_scan(c, inum); + break; + case Opt_background_compression: + if (new_opts->background_compression) + bch2_set_rebalance_needs_scan(c, inum); + break; + case Opt_rebalance_enabled: + bch2_rebalance_wakeup(c); + break; + case Opt_copygc_enabled: + bch2_copygc_wakeup(c); + break; + case Opt_discard: + if (!ca) { + mutex_lock(&c->sb_lock); + for_each_member_device(c, ca) { + struct bch_member *m = + bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx); + SET_BCH_MEMBER_DISCARD(m, c->opts.discard); + } + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + break; + case Opt_version_upgrade: + /* + * XXX: in the future we'll likely want to do compatible + * upgrades at runtime as well, but right now there's nothing + * that does that: + */ + if (new_opts->version_upgrade == BCH_VERSION_UPGRADE_incompatible) + bch2_sb_upgrade_incompat(c); + break; + default: + break; + } +} + int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, struct printbuf *parse_later, const char *name, const char *val) @@ -536,6 +631,12 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, if (id < 0) return 0; + /* must have a value for synonym lookup - but OPT_FN is weird */ + if (!val && bch2_opt_table[id].type != BCH_OPT_FN) + val = "1"; + + val = bch2_opt_val_synonym_lookup(name, val); + if (!(bch2_opt_table[id].flags & OPT_MOUNT)) goto bad_opt; @@ -667,9 +768,11 @@ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) return 0; } -void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, +bool __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, const struct bch_option *opt, u64 v) { + bool changed = false; + if (opt->flags & OPT_SB_FIELD_SECTORS) v >>= 9; @@ -679,26 +782,35 @@ void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, if (opt->flags & 
OPT_SB_FIELD_ONE_BIAS) v++; - if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0) + if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0) { + changed = v != opt->get_sb(sb); + opt->set_sb(sb, v); + } if ((opt->flags & OPT_DEVICE) && opt->set_member && dev_idx >= 0) { if (WARN(!bch2_member_exists(sb, dev_idx), "tried to set device option %s on nonexistent device %i", opt->attr.name, dev_idx)) - return; + return false; - opt->set_member(bch2_members_v2_get_mut(sb, dev_idx), v); + struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx); + changed = v != opt->get_member(m); + opt->set_member(m, v); } + + return changed; } -void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, +bool bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, const struct bch_option *opt, u64 v) { mutex_lock(&c->sb_lock); - __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v); - bch2_write_super(c); + bool changed = __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v); + if (changed) + bch2_write_super(c); mutex_unlock(&c->sb_lock); + return changed; } /* io opts: */ diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index dfb14810124c..2a02606254b3 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -11,6 +11,7 @@ struct bch_fs; extern const char * const bch2_error_actions[]; +extern const char * const bch2_degraded_actions[]; extern const char * const bch2_fsck_fix_opts[]; extern const char * const bch2_version_upgrade_opts[]; extern const char * const bch2_sb_features[]; @@ -307,14 +308,9 @@ enum fsck_err_opts { NULL, "Enable project quotas") \ x(degraded, u8, \ OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + OPT_STR(bch2_degraded_actions), \ + BCH_SB_DEGRADED_ACTION, BCH_DEGRADED_ask, \ NULL, "Allow mounting in degraded mode") \ - x(very_degraded, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Allow mounting in when data will be missing") \ x(no_splitbrain_check, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ @@ -454,7 +450,7 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, false, \ NULL, "Reconstruct alloc btree") \ x(version_upgrade, u8, \ - OPT_FS|OPT_MOUNT, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_version_upgrade_opts), \ BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \ NULL, "Set superblock to latest version,\n" \ @@ -494,6 +490,17 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, true, \ NULL, "Enable rebalance: disable for debugging, or to\n"\ "quiet the system when doing performance testing\n")\ + x(rebalance_on_ac_only, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_REBALANCE_AC_ONLY, false, \ + NULL, "Enable rebalance while on mains power only\n") \ + x(auto_snapshot_deletion, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Enable automatic snapshot deletion: disable for debugging, or to\n"\ + "quiet the system when doing performance testing\n")\ x(no_data_io, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ @@ -522,7 +529,7 @@ enum fsck_err_opts { BCH_MEMBER_DATA_ALLOWED, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\ "types", "Allowed data types for this device: journal, btree, and/or user")\ x(discard, u8, \ - OPT_MOUNT|OPT_DEVICE|OPT_RUNTIME, \ + OPT_MOUNT|OPT_FS|OPT_DEVICE|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_MEMBER_DISCARD, true, \ NULL, "Enable discard/TRIM support") \ @@ -530,7 +537,7 @@ enum fsck_err_opts { OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ - NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\ + NULL, 
"BTREE_ITER_prefetch causes btree nodes to be\n"\ " prefetched sequentially") struct bch_opts { @@ -616,10 +623,10 @@ void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id, int); int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); -void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64); +bool __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64); struct bch_dev; -void bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64); +bool bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64); int bch2_opt_lookup(const char *); int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); @@ -636,8 +643,11 @@ void bch2_opts_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, unsigned, unsigned, unsigned); -int bch2_opt_check_may_set(struct bch_fs *, struct bch_dev *, int, u64); -int bch2_opts_check_may_set(struct bch_fs *); +int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, enum bch_opt_id, u64); +int bch2_opts_hooks_pre_set(struct bch_fs *); +void bch2_opt_hook_post_set(struct bch_fs *, struct bch_dev *, u64, + struct bch_opts *, enum bch_opt_id); + int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *, struct printbuf *, const char *, const char *); int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *, diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h index 1ca476adbf6f..8f4e28d440ac 100644 --- a/fs/bcachefs/printbuf.h +++ b/fs/bcachefs/printbuf.h @@ -140,6 +140,14 @@ void bch2_prt_bitflags_vector(struct printbuf *, const char * const[], .size = _size, \ }) +static inline struct printbuf bch2_printbuf_init(void) +{ + return PRINTBUF; +} + +DEFINE_CLASS(printbuf, struct printbuf, + bch2_printbuf_exit(&_T), bch2_printbuf_init(), void) + /* * Returns size remaining of output buffer: */ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 3d4755d73af7..f241efb1fb50 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -527,7 +527,7 @@ int bch2_fs_quota_read(struct bch_fs *c) struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); if (!sb_quota) { mutex_unlock(&c->sb_lock); - return -BCH_ERR_ENOSPC_sb_quota; + return bch_err_throw(c, ENOSPC_sb_quota); } bch2_sb_quota_read(c); @@ -572,7 +572,7 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags) mutex_lock(&c->sb_lock); sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); if (!sb_quota) { - ret = -BCH_ERR_ENOSPC_sb_quota; + ret = bch_err_throw(c, ENOSPC_sb_quota); goto unlock; } @@ -726,7 +726,7 @@ static int bch2_quota_set_info(struct super_block *sb, int type, mutex_lock(&c->sb_lock); sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); if (!sb_quota) { - ret = -BCH_ERR_ENOSPC_sb_quota; + ret = bch_err_throw(c, ENOSPC_sb_quota); goto unlock; } diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 623273556aa9..1c345b86b1c0 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -80,6 +80,7 @@ static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c, unsigned ptr_bit = 1; unsigned rewrite_ptrs = 0; + guard(rcu)(); bkey_for_each_ptr(ptrs, ptr) { if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) rewrite_ptrs |= ptr_bit; @@ -95,6 +96,9 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + if 
(bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return 0; + return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) | bch2_bkey_ptrs_need_move(c, opts, ptrs); } @@ -107,6 +111,9 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) if (!opts) return 0; + if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return 0; + const union bch_extent_entry *entry; struct extent_ptr_decoded p; u64 sectors = 0; @@ -126,10 +133,13 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) } } incompressible: - if (opts->background_target) + if (opts->background_target) { + guard(rcu)(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) + if (!p.ptr.cached && + !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) sectors += p.crc.compressed_size; + } return sectors; } @@ -433,7 +443,7 @@ static int do_rebalance_extent(struct moving_context *ctxt, if (bch2_err_matches(ret, ENOMEM)) { /* memory allocation failure, wait for some IO to finish */ bch2_move_ctxt_wait_for_io(ctxt); - ret = -BCH_ERR_transaction_restart_nested; + ret = bch_err_throw(c, transaction_restart_nested); } if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -447,22 +457,11 @@ out: return ret; } -static bool rebalance_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); - data_opts->target = io_opts->background_target; - data_opts->write_flags |= BCH_WRITE_only_specified_devs; - return data_opts->rewrite_ptrs != 0; -} - static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) { struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; struct bch_fs_rebalance *r = &trans->c->rebalance; - int ret; bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); ctxt->stats = &r->scan_stats; @@ -477,11 +476,34 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) r->state = BCH_REBALANCE_scanning; - ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?: - commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_clear_rebalance_needs_scan(trans, inum, cookie)); + struct per_snapshot_io_opts snapshot_io_opts; + per_snapshot_io_opts_init(&snapshot_io_opts, c); + + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, + r->scan_start.pos, r->scan_end.pos, + BTREE_ITER_all_snapshots| + BTREE_ITER_not_extents| + BTREE_ITER_prefetch, k, ({ + ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); + struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans, + &snapshot_io_opts, iter.pos, &iter, k); + PTR_ERR_OR_ZERO(io_opts); + })) ?: + commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_clear_rebalance_needs_scan(trans, inum, cookie)); + + per_snapshot_io_opts_exit(&snapshot_io_opts); bch2_move_stats_exit(&r->scan_stats, trans->c); + + /* + * Ensure that the rebalance_work entries we created are seen by the + * next iteration of do_rebalance(), so we don't end up stuck in + * rebalance_wait(): + */ + atomic64_inc(&r->scan_stats.sectors_seen); + bch2_btree_write_buffer_flush_sync(trans); + return ret; } @@ -503,7 +525,14 @@ static void rebalance_wait(struct bch_fs *c) r->state = BCH_REBALANCE_waiting; } - bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); + 
bch2_kthread_io_clock_wait_once(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); +} + +static bool bch2_rebalance_enabled(struct bch_fs *c) +{ + return c->opts.rebalance_enabled && + !(c->opts.rebalance_on_ac_only && + c->rebalance.on_battery); } static int do_rebalance(struct moving_context *ctxt) @@ -513,6 +542,7 @@ static int do_rebalance(struct moving_context *ctxt) struct bch_fs_rebalance *r = &c->rebalance; struct btree_iter rebalance_work_iter, extent_iter = {}; struct bkey_s_c k; + u32 kick = r->kick; int ret = 0; bch2_trans_begin(trans); @@ -525,9 +555,9 @@ static int do_rebalance(struct moving_context *ctxt) BTREE_ITER_all_snapshots); while (!bch2_move_ratelimit(ctxt)) { - if (!c->opts.rebalance_enabled) { + if (!bch2_rebalance_enabled(c)) { bch2_moving_ctxt_flush_all(ctxt); - kthread_wait_freezable(c->opts.rebalance_enabled || + kthread_wait_freezable(bch2_rebalance_enabled(c) || kthread_should_stop()); } @@ -562,7 +592,8 @@ static int do_rebalance(struct moving_context *ctxt) if (!ret && !kthread_should_stop() && !atomic64_read(&r->work_stats.sectors_seen) && - !atomic64_read(&r->scan_stats.sectors_seen)) { + !atomic64_read(&r->scan_stats.sectors_seen) && + kick == r->kick) { bch2_moving_ctxt_flush_all(ctxt); bch2_trans_unlock_long(trans); rebalance_wait(c); @@ -585,7 +616,7 @@ static int bch2_rebalance_thread(void *arg) * Data move operations can't run until after check_snapshots has * completed, and bch2_snapshot_is_ancestor() is available. */ - kthread_wait_freezable(c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots || + kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots || kthread_should_stop()); bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats, @@ -646,11 +677,12 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) } prt_newline(out); - rcu_read_lock(); - struct task_struct *t = rcu_dereference(c->rebalance.thread); - if (t) - get_task_struct(t); - rcu_read_unlock(); + struct task_struct *t; + scoped_guard(rcu) { + t = rcu_dereference(c->rebalance.thread); + if (t) + get_task_struct(t); + } if (t) { bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); @@ -702,7 +734,156 @@ int bch2_rebalance_start(struct bch_fs *c) return 0; } -void bch2_fs_rebalance_init(struct bch_fs *c) +#ifdef CONFIG_POWER_SUPPLY +#include <linux/power_supply.h> + +static int bch2_rebalance_power_notifier(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct bch_fs *c = container_of(nb, struct bch_fs, rebalance.power_notifier); + + c->rebalance.on_battery = !power_supply_is_system_supplied(); + bch2_rebalance_wakeup(c); + return NOTIFY_OK; +} +#endif + +void bch2_fs_rebalance_exit(struct bch_fs *c) +{ +#ifdef CONFIG_POWER_SUPPLY + power_supply_unreg_notifier(&c->rebalance.power_notifier); +#endif +} + +int bch2_fs_rebalance_init(struct bch_fs *c) +{ + struct bch_fs_rebalance *r = &c->rebalance; + + bch2_pd_controller_init(&r->pd); + +#ifdef CONFIG_POWER_SUPPLY + r->power_notifier.notifier_call = bch2_rebalance_power_notifier; + int ret = power_supply_reg_notifier(&r->power_notifier); + if (ret) + return ret; + + r->on_battery = !power_supply_is_system_supplied(); +#endif + return 0; +} + +static int check_rebalance_work_one(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct btree_iter *rebalance_iter, + struct bkey_buf *last_flushed) { - bch2_pd_controller_init(&c->rebalance.pd); + struct bch_fs *c = trans->c; + struct bkey_s_c extent_k, rebalance_k; + struct printbuf buf = PRINTBUF; + + int ret = 
bkey_err(extent_k = bch2_btree_iter_peek(trans, extent_iter)) ?: + bkey_err(rebalance_k = bch2_btree_iter_peek(trans, rebalance_iter)); + if (ret) + return ret; + + if (!extent_k.k && + extent_iter->btree_id == BTREE_ID_reflink && + (!rebalance_k.k || + rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) { + bch2_trans_iter_exit(trans, extent_iter); + bch2_trans_iter_init(trans, extent_iter, + BTREE_ID_extents, POS_MIN, + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots); + return bch_err_throw(c, transaction_restart_nested); + } + + if (!extent_k.k && !rebalance_k.k) + return 1; + + int cmp = bpos_cmp(extent_k.k ? extent_k.k->p : SPOS_MAX, + rebalance_k.k ? rebalance_k.k->p : SPOS_MAX); + + struct bkey deleted; + bkey_init(&deleted); + + if (cmp < 0) { + deleted.p = extent_k.k->p; + rebalance_k.k = &deleted; + } else if (cmp > 0) { + deleted.p = rebalance_k.k->p; + extent_k.k = &deleted; + } + + bool should_have_rebalance = + bch2_bkey_sectors_need_rebalance(c, extent_k) != 0; + bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set; + + if (should_have_rebalance != have_rebalance) { + ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed); + if (ret) + return ret; + + bch2_bkey_val_to_text(&buf, c, extent_k); + } + + if (fsck_err_on(!should_have_rebalance && have_rebalance, + trans, rebalance_work_incorrectly_set, + "rebalance work incorrectly set\n%s", buf.buf)) { + ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, + extent_k.k->p, false); + if (ret) + goto err; + } + + if (fsck_err_on(should_have_rebalance && !have_rebalance, + trans, rebalance_work_incorrectly_unset, + "rebalance work incorrectly unset\n%s", buf.buf)) { + ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, + extent_k.k->p, true); + if (ret) + goto err; + } + + if (cmp <= 0) + bch2_btree_iter_advance(trans, extent_iter); + if (cmp >= 0) + bch2_btree_iter_advance(trans, rebalance_iter); +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +int bch2_check_rebalance_work(struct bch_fs *c) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter rebalance_iter, extent_iter; + int ret = 0; + + bch2_trans_iter_init(trans, &extent_iter, + BTREE_ID_reflink, POS_MIN, + BTREE_ITER_prefetch); + bch2_trans_iter_init(trans, &rebalance_iter, + BTREE_ID_rebalance_work, POS_MIN, + BTREE_ITER_prefetch); + + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + while (!ret) { + bch2_trans_begin(trans); + + ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + } + + bch2_bkey_buf_exit(&last_flushed, c); + bch2_trans_iter_exit(trans, &extent_iter); + bch2_trans_iter_exit(trans, &rebalance_iter); + bch2_trans_put(trans); + return ret < 0 ? 
ret : 0; } diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index e5e8eb4a2dd1..7a565ea7dbfc 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -39,19 +39,21 @@ int bch2_set_fs_needs_rebalance(struct bch_fs *); static inline void bch2_rebalance_wakeup(struct bch_fs *c) { - struct task_struct *p; - - rcu_read_lock(); - p = rcu_dereference(c->rebalance.thread); + c->rebalance.kick++; + guard(rcu)(); + struct task_struct *p = rcu_dereference(c->rebalance.thread); if (p) wake_up_process(p); - rcu_read_unlock(); } void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *); void bch2_rebalance_stop(struct bch_fs *); int bch2_rebalance_start(struct bch_fs *); -void bch2_fs_rebalance_init(struct bch_fs *); + +void bch2_fs_rebalance_exit(struct bch_fs *); +int bch2_fs_rebalance_init(struct bch_fs *); + +int bch2_check_rebalance_work(struct bch_fs *); #endif /* _BCACHEFS_REBALANCE_H */ diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h index fe5098c17dfc..c659da149fa3 100644 --- a/fs/bcachefs/rebalance_types.h +++ b/fs/bcachefs/rebalance_types.h @@ -18,6 +18,7 @@ enum bch_rebalance_states { struct bch_fs_rebalance { struct task_struct __rcu *thread; + u32 kick; struct bch_pd_controller pd; enum bch_rebalance_states state; @@ -30,6 +31,11 @@ struct bch_fs_rebalance { struct bbpos scan_start; struct bbpos scan_end; struct bch_move_stats scan_stats; + + bool on_battery; +#ifdef CONFIG_POWER_SUPPLY + struct notifier_block power_notifier; +#endif }; #endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index d6c4ef819d40..1e68e61f08e8 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -33,8 +33,9 @@ #include <linux/sort.h> #include <linux/stat.h> - -int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) +int bch2_btree_lost_data(struct bch_fs *c, + struct printbuf *msg, + enum btree_id btree) { u64 b = BIT_ULL(btree); int ret = 0; @@ -43,32 +44,32 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); if (!(c->sb.btrees_lost_data & b)) { - struct printbuf buf = PRINTBUF; - bch2_btree_id_to_text(&buf, btree); - bch_err(c, "flagging btree %s lost data", buf.buf); - printbuf_exit(&buf); + prt_printf(msg, "flagging btree "); + bch2_btree_id_to_text(msg, btree); + prt_printf(msg, " lost data\n"); + ext->btrees_lost_data |= cpu_to_le64(b); } /* Once we have runtime self healing for topology errors we won't need this: */ - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret; /* Btree node accounting will be off: */ __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret; #ifdef CONFIG_BCACHEFS_DEBUG /* * These are much more minor, and don't need to be corrected right away, * but in debug mode we want the next fsck run to be clean: */ - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_lrus) ?: ret; - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0) ?: ret; + 
ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0) ?: ret; #endif switch (btree) { case BTREE_ID_alloc: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); @@ -78,26 +79,30 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); goto out; case BTREE_ID_backpointers: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret; - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers, 0) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers, 0) ?: ret; goto out; case BTREE_ID_need_discard: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; goto out; case BTREE_ID_freespace: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; goto out; case BTREE_ID_bucket_gens: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; goto out; case BTREE_ID_lru: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; goto out; case BTREE_ID_accounting: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret; + goto out; + case BTREE_ID_snapshots: + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret; goto out; default: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret; goto out; } out: @@ -114,11 +119,8 @@ static void kill_btree(struct bch_fs *c, enum btree_id btree) } /* for -o reconstruct_alloc: */ -static void bch2_reconstruct_alloc(struct bch_fs *c) +void bch2_reconstruct_alloc(struct bch_fs *c) { - bch2_journal_log_msg(c, "dropping alloc info"); - bch_info(c, "dropping and reconstructing all alloc info"); - mutex_lock(&c->sb_lock); struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); @@ -160,6 +162,8 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info)); + bch2_write_super(c); 
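The switch in bch2_btree_lost_data() above encodes which recovery passes a lost btree forces; a compact way to read it is as a mapping from btree to a bitmask of passes, accumulated on top of the checks every lost btree gets (topology and allocation). A self-contained C sketch of that mapping follows; the enum values are stand-ins for the real BCH_RECOVERY_PASS_* / BTREE_ID_* constants and only a few representative cases are shown:

#include <stdio.h>
#include <stdint.h>

/* hypothetical pass and btree ids, standing in for the real constants */
enum sketch_pass {
	PASS_check_topology,
	PASS_check_allocations,
	PASS_check_alloc_info,
	PASS_check_btree_backpointers,
	PASS_check_extents_to_backpointers,
	PASS_reconstruct_snapshots,
	PASS_scan_for_btree_nodes,
};

enum sketch_btree {
	BTREE_alloc,
	BTREE_backpointers,
	BTREE_lru,
	BTREE_snapshots,
	BTREE_other,
};

/* which passes a lost btree forces, mirroring the switch above */
static uint64_t passes_for_lost_btree(enum sketch_btree btree)
{
	/* every lost btree forces topology and allocation checks */
	uint64_t passes = (1ULL << PASS_check_topology) |
			  (1ULL << PASS_check_allocations);

	switch (btree) {
	case BTREE_alloc:
	case BTREE_lru:
		passes |= 1ULL << PASS_check_alloc_info;
		break;
	case BTREE_backpointers:
		passes |= (1ULL << PASS_check_btree_backpointers) |
			  (1ULL << PASS_check_extents_to_backpointers);
		break;
	case BTREE_snapshots:
		passes |= (1ULL << PASS_reconstruct_snapshots) |
			  (1ULL << PASS_scan_for_btree_nodes);
		break;
	default:
		passes |= 1ULL << PASS_scan_for_btree_nodes;
		break;
	}
	return passes;
}

int main(void)
{
	printf("passes for lost backpointers btree: %#llx\n",
	       (unsigned long long)passes_for_lost_btree(BTREE_backpointers));
	return 0;
}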
mutex_unlock(&c->sb_lock); @@ -282,7 +286,12 @@ static int bch2_journal_replay_key(struct btree_trans *trans, goto out; if (k->k->k.type == KEY_TYPE_accounting) { - ret = bch2_trans_update_buffered(trans, BTREE_ID_accounting, k->k); + struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, k->k->k.u64s); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto out; + + bkey_copy(n, k->k); goto out; } @@ -430,7 +439,7 @@ int bch2_journal_replay(struct bch_fs *c) trans = NULL; if (!c->opts.retain_recovery_info && - c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) + c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) bch2_journal_keys_put_initial(c); replay_now_at(j, j->replay_journal_seq_end); @@ -585,9 +594,6 @@ static int read_btree_roots(struct bch_fs *c) buf.buf, bch2_err_str(ret))) { if (btree_id_is_alloc(i)) r->error = 0; - - ret = bch2_btree_lost_data(c, i); - BUG_ON(ret); } } @@ -667,7 +673,7 @@ static bool check_version_upgrade(struct bch_fs *c) bch2_recovery_passes_from_stable(le64_to_cpu(passes))); } - bch_info(c, "%s", buf.buf); + bch_notice(c, "%s", buf.buf); printbuf_exit(&buf); ret = true; @@ -683,7 +689,7 @@ static bool check_version_upgrade(struct bch_fs *c) bch2_version_to_text(&buf, c->sb.version_incompat_allowed); prt_newline(&buf); - bch_info(c, "%s", buf.buf); + bch_notice(c, "%s", buf.buf); printbuf_exit(&buf); ret = true; @@ -790,11 +796,11 @@ int bch2_fs_recovery(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); - if (c->opts.fsck) - set_bit(BCH_FS_fsck_running, &c->flags); if (c->sb.clean) set_bit(BCH_FS_clean_recovery, &c->flags); - set_bit(BCH_FS_recovery_running, &c->flags); + if (c->opts.fsck) + set_bit(BCH_FS_in_fsck, &c->flags); + set_bit(BCH_FS_in_recovery, &c->flags); ret = bch2_blacklist_table_initialize(c); if (ret) { @@ -873,7 +879,7 @@ int bch2_fs_recovery(struct bch_fs *c) use_clean: if (!clean) { bch_err(c, "no superblock clean section found"); - ret = -BCH_ERR_fsck_repair_impossible; + ret = bch_err_throw(c, fsck_repair_impossible); goto err; } @@ -889,8 +895,37 @@ use_clean: if (ret) goto err; - if (c->opts.reconstruct_alloc) + ret = bch2_fs_resize_on_mount(c); + if (ret) { + up_write(&c->state_lock); + goto err; + } + + if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { + bch_info(c, "filesystem is an unresized image file, mounting ro"); + c->opts.read_only = true; + } + + if (!c->opts.read_only && + (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) { + bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); + + bch2_reconstruct_alloc(c); + } else if (c->opts.reconstruct_alloc) { + bch2_journal_log_msg(c, "dropping alloc info"); + bch_info(c, "dropping and reconstructing all alloc info"); + bch2_reconstruct_alloc(c); + } + + if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) { + /* We can't go RW to fix errors without alloc info */ + if (c->opts.fix_errors == FSCK_FIX_yes || + c->opts.fix_errors == FSCK_FIX_ask) + c->opts.fix_errors = FSCK_FIX_no; + if (c->opts.errors == BCH_ON_ERROR_fix_safe) + c->opts.errors = BCH_ON_ERROR_continue; + } /* * After an unclean shutdown, skip then next few journal sequence @@ -933,8 +968,10 @@ use_clean: set_bit(BCH_FS_btree_running, &c->flags); ret = bch2_sb_set_upgrade_extra(c); + if (ret) + goto err; - ret = bch2_run_recovery_passes(c); + ret = bch2_run_recovery_passes(c, 0); if (ret) goto err; @@ -945,8 +982,7 @@ use_clean: * multithreaded use: */ set_bit(BCH_FS_may_go_rw, &c->flags); - clear_bit(BCH_FS_fsck_running, &c->flags); - 
clear_bit(BCH_FS_recovery_running, &c->flags); + clear_bit(BCH_FS_in_fsck, &c->flags); /* in case we don't run journal replay, i.e. norecovery mode */ set_bit(BCH_FS_accounting_replay_done, &c->flags); @@ -969,9 +1005,8 @@ use_clean: bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); clear_bit(BCH_FS_errors_fixed, &c->flags); - c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; - - ret = bch2_run_recovery_passes(c); + ret = bch2_run_recovery_passes(c, + BCH_RECOVERY_PASS_check_alloc_info); if (ret) goto err; @@ -1015,7 +1050,7 @@ use_clean: if (c->opts.fsck && !test_bit(BCH_FS_error, &c->flags) && - c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 && + c->recovery.pass_done == BCH_RECOVERY_PASS_NR - 1 && ext->btrees_lost_data) { ext->btrees_lost_data = 0; write_sb = true; @@ -1058,10 +1093,6 @@ use_clean: out: bch2_flush_fsck_errs(c); - if (!c->opts.retain_recovery_info) { - bch2_journal_keys_put_initial(c); - bch2_find_btree_nodes_exit(&c->found_btree_nodes); - } if (!IS_ERR(clean)) kfree(clean); @@ -1076,8 +1107,17 @@ out: return ret; err: fsck_err: - bch2_fs_emergency_read_only(c); - goto out; + { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "error in recovery: %s", bch2_err_str(ret)); + bch2_fs_emergency_read_only2(c, &buf); + + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + } + return ret; } int bch2_fs_initialize(struct bch_fs *c) @@ -1193,7 +1233,7 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1; + c->recovery.pass_done = BCH_RECOVERY_PASS_NR - 1; bch2_copygc_wakeup(c); bch2_rebalance_wakeup(c); @@ -1216,7 +1256,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); - c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; + c->recovery.curr_pass = BCH_RECOVERY_PASS_NR; return 0; err: bch_err_fn(c, ret); diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index b0d55754b21b..c023f52fc2d6 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -2,7 +2,8 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H -int bch2_btree_lost_data(struct bch_fs *, enum btree_id); +int bch2_btree_lost_data(struct bch_fs *, struct printbuf *, enum btree_id); +void bch2_reconstruct_alloc(struct bch_fs *); int bch2_journal_replay(struct bch_fs *); diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 22f72bb5b853..605588e33fb3 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -28,6 +28,176 @@ const char * const bch2_recovery_passes[] = { NULL }; +static const u8 passes_to_stable_map[] = { +#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, + BCH_RECOVERY_PASSES() +#undef x +}; + +static const u8 passes_from_stable_map[] = { +#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, + BCH_RECOVERY_PASSES() +#undef x +}; + +static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass) +{ + return passes_to_stable_map[pass]; +} + +u64 bch2_recovery_passes_to_stable(u64 v) +{ + u64 ret = 0; + for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++) + if (v & BIT_ULL(i)) + ret |= BIT_ULL(passes_to_stable_map[i]); + return ret; +} + +static enum bch_recovery_pass bch2_recovery_pass_from_stable(enum bch_recovery_pass_stable pass) +{ + return pass < ARRAY_SIZE(passes_from_stable_map) + ? 
passes_from_stable_map[pass] + : 0; +} + +u64 bch2_recovery_passes_from_stable(u64 v) +{ + u64 ret = 0; + for (unsigned i = 0; i < ARRAY_SIZE(passes_from_stable_map); i++) + if (v & BIT_ULL(i)) + ret |= BIT_ULL(passes_from_stable_map[i]); + return ret; +} + +static int bch2_sb_recovery_passes_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) +{ + return 0; +} + +static void bch2_sb_recovery_passes_to_text(struct printbuf *out, + struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_recovery_passes *r = + field_to_type(f, recovery_passes); + unsigned nr = recovery_passes_nr_entries(r); + + if (out->nr_tabstops < 1) + printbuf_tabstop_push(out, 32); + if (out->nr_tabstops < 2) + printbuf_tabstop_push(out, 16); + + prt_printf(out, "Pass\tLast run\tLast runtime\n"); + + for (struct recovery_pass_entry *i = r->start; i < r->start + nr; i++) { + if (!i->last_run) + continue; + + unsigned idx = i - r->start; + + prt_printf(out, "%s\t", bch2_recovery_passes[bch2_recovery_pass_from_stable(idx)]); + + bch2_prt_datetime(out, le64_to_cpu(i->last_run)); + prt_tab(out); + + bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC); + + if (BCH_RECOVERY_PASS_NO_RATELIMIT(i)) + prt_str(out, " (no ratelimit)"); + + prt_newline(out); + } +} + +static struct recovery_pass_entry *bch2_sb_recovery_pass_entry(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); + + lockdep_assert_held(&c->sb_lock); + + struct bch_sb_field_recovery_passes *r = + bch2_sb_field_get(c->disk_sb.sb, recovery_passes); + + if (stable >= recovery_passes_nr_entries(r)) { + unsigned u64s = struct_size(r, start, stable + 1) / sizeof(u64); + + r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s); + if (!r) { + bch_err(c, "error creating recovery_passes sb section"); + return NULL; + } + } + + return r->start + stable; +} + +static void bch2_sb_recovery_pass_complete(struct bch_fs *c, + enum bch_recovery_pass pass, + s64 start_time) +{ + guard(mutex)(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + __clear_bit_le64(bch2_recovery_pass_to_stable(pass), + ext->recovery_passes_required); + + struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass); + if (e) { + s64 end_time = ktime_get_real_seconds(); + e->last_run = cpu_to_le64(end_time); + e->last_runtime = cpu_to_le32(max(0, end_time - start_time)); + SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false); + } + + bch2_write_super(c); +} + +void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + guard(mutex)(&c->sb_lock); + + struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass); + if (e && !BCH_RECOVERY_PASS_NO_RATELIMIT(e)) { + SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false); + bch2_write_super(c); + } +} + +static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass) +{ + enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); + bool ret = false; + + lockdep_assert_held(&c->sb_lock); + + struct bch_sb_field_recovery_passes *r = + bch2_sb_field_get(c->disk_sb.sb, recovery_passes); + + if (stable < recovery_passes_nr_entries(r)) { + struct recovery_pass_entry *i = r->start + stable; + + /* + * Ratelimit if the last runtime was more than 1% of the time + * since we last ran + */ + ret = (u64) le32_to_cpu(i->last_runtime) * 100 > + ktime_get_real_seconds() - le64_to_cpu(i->last_run); + + 
if (BCH_RECOVERY_PASS_NO_RATELIMIT(i)) + ret = false; + } + + return ret; +} + +const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes = { + .validate = bch2_sb_recovery_passes_validate, + .to_text = bch2_sb_recovery_passes_to_text +}; + /* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */ static int bch2_recovery_pass_empty(struct bch_fs *c) { @@ -47,11 +217,36 @@ static int bch2_set_may_go_rw(struct bch_fs *c) set_bit(BCH_FS_may_go_rw, &c->flags); - if (keys->nr || !c->opts.read_only || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes) + if (keys->nr || + !c->opts.read_only || + !c->sb.clean || + c->opts.recovery_passes || + (c->opts.fsck && !(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))) { + if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) { + bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); + bch2_reconstruct_alloc(c); + } + return bch2_fs_read_write_early(c); + } return 0; } +/* + * Make sure root inode is readable while we're still in recovery and can rewind + * for repair: + */ +static int bch2_lookup_root_inode(struct bch_fs *c) +{ + subvol_inum inum = BCACHEFS_ROOT_SUBVOL_INUM; + struct bch_inode_unpacked inode_u; + struct bch_subvolume subvol; + + return bch2_trans_do(c, + bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: + bch2_inode_find_by_inum_trans(trans, inum, &inode_u)); +} + struct recovery_pass_fn { int (*fn)(struct bch_fs *); unsigned when; @@ -63,252 +258,382 @@ static struct recovery_pass_fn recovery_pass_fns[] = { #undef x }; -static const u8 passes_to_stable_map[] = { -#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, - BCH_RECOVERY_PASSES() -#undef x -}; +static u64 bch2_recovery_passes_match(unsigned flags) +{ + u64 ret = 0; -static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass) + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) + if (recovery_pass_fns[i].when & flags) + ret |= BIT_ULL(i); + return ret; +} + +u64 bch2_fsck_recovery_passes(void) { - return passes_to_stable_map[pass]; + return bch2_recovery_passes_match(PASS_FSCK); } -u64 bch2_recovery_passes_to_stable(u64 v) +static void bch2_run_async_recovery_passes(struct bch_fs *c) { - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(passes_to_stable_map[i]); - return ret; + if (!down_trylock(&c->recovery.run_lock)) + return; + + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_async_recovery_passes)) + goto unlock; + + if (queue_work(system_long_wq, &c->recovery.work)) + return; + + enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes); +unlock: + up(&c->recovery.run_lock); } -u64 bch2_recovery_passes_from_stable(u64 v) +static bool recovery_pass_needs_set(struct bch_fs *c, + enum bch_recovery_pass pass, + enum bch_run_recovery_pass_flags *flags) { - static const u8 map[] = { -#define x(n, id, ...) 
[BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, - BCH_RECOVERY_PASSES() -#undef x - }; + struct bch_fs_recovery *r = &c->recovery; + bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); + bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent); - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(map[i]); - return ret; + if ((*flags & RUN_RECOVERY_PASS_ratelimit) && + !bch2_recovery_pass_want_ratelimit(c, pass)) + *flags &= ~RUN_RECOVERY_PASS_ratelimit; + + /* + * If RUN_RECOVERY_PASS_nopersistent is set, we don't want to do + * anything if the pass has already run: these mean we need a prior pass + * to run before we continue to repair, we don't expect that pass to fix + * the damage we encountered. + * + * Otherwise, we run run_explicit_recovery_pass when we find damage, so + * it should run again even if it's already run: + */ + + if (persistent + ? !(c->sb.recovery_passes_required & BIT_ULL(pass)) + : !((r->passes_to_run|r->passes_complete) & BIT_ULL(pass))) + return true; + + if (!(*flags & RUN_RECOVERY_PASS_ratelimit) && + (r->passes_ratelimiting & BIT_ULL(pass))) + return true; + + return false; } /* * For when we need to rewind recovery passes and run a pass we skipped: */ -static int __bch2_run_explicit_recovery_pass(struct bch_fs *c, - enum bch_recovery_pass pass) +int __bch2_run_explicit_recovery_pass(struct bch_fs *c, + struct printbuf *out, + enum bch_recovery_pass pass, + enum bch_run_recovery_pass_flags flags) { - if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns)) - return -BCH_ERR_not_in_recovery; + struct bch_fs_recovery *r = &c->recovery; + int ret = 0; - if (c->recovery_passes_complete & BIT_ULL(pass)) - return 0; + lockdep_assert_held(&c->sb_lock); - bool print = !(c->opts.recovery_passes & BIT_ULL(pass)); + bch2_printbuf_make_room(out, 1024); + out->atomic++; + + unsigned long lockflags; + spin_lock_irqsave(&r->lock, lockflags); + + if (!recovery_pass_needs_set(c, pass, &flags)) + goto out; + + bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); + bool rewind = in_recovery && + r->curr_pass > pass && + !(r->passes_complete & BIT_ULL(pass)); + bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit; + + if (!(in_recovery && (flags & RUN_RECOVERY_PASS_nopersistent))) { + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); + } if (pass < BCH_RECOVERY_PASS_set_may_go_rw && - c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) { - if (print) - bch_info(c, "need recovery pass %s (%u), but already rw", - bch2_recovery_passes[pass], pass); - return -BCH_ERR_cannot_rewind_recovery; + (!in_recovery || r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw)) { + prt_printf(out, "need recovery pass %s (%u), but already rw\n", + bch2_recovery_passes[pass], pass); + ret = bch_err_throw(c, cannot_rewind_recovery); + goto out; } - if (print) - bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", - bch2_recovery_passes[pass], pass, - bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); + if (ratelimit) + r->passes_ratelimiting |= BIT_ULL(pass); + else + r->passes_ratelimiting &= ~BIT_ULL(pass); + + if (in_recovery && !ratelimit) { + prt_printf(out, "running recovery pass %s (%u), currently at %s (%u)%s\n", + bch2_recovery_passes[pass], pass, + bch2_recovery_passes[r->curr_pass], r->curr_pass, + rewind ? 
" - rewinding" : ""); - c->opts.recovery_passes |= BIT_ULL(pass); + r->passes_to_run |= BIT_ULL(pass); - if (c->curr_recovery_pass > pass) { - c->next_recovery_pass = pass; - c->recovery_passes_complete &= (1ULL << pass) >> 1; - return -BCH_ERR_restart_recovery; + if (rewind) { + r->next_pass = pass; + r->passes_complete &= (1ULL << pass) >> 1; + ret = bch_err_throw(c, restart_recovery); + } } else { - return 0; + prt_printf(out, "scheduling recovery pass %s (%u)%s\n", + bch2_recovery_passes[pass], pass, + ratelimit ? " - ratelimiting" : ""); + + struct recovery_pass_fn *p = recovery_pass_fns + pass; + if (p->when & PASS_ONLINE) + bch2_run_async_recovery_passes(c); } +out: + spin_unlock_irqrestore(&r->lock, lockflags); + --out->atomic; + return ret; } int bch2_run_explicit_recovery_pass(struct bch_fs *c, - enum bch_recovery_pass pass) + struct printbuf *out, + enum bch_recovery_pass pass, + enum bch_run_recovery_pass_flags flags) { - unsigned long flags; - spin_lock_irqsave(&c->recovery_pass_lock, flags); - int ret = __bch2_run_explicit_recovery_pass(c, pass); - spin_unlock_irqrestore(&c->recovery_pass_lock, flags); - return ret; -} + int ret = 0; -int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *c, - enum bch_recovery_pass pass) -{ - lockdep_assert_held(&c->sb_lock); + scoped_guard(mutex, &c->sb_lock) { + if (!recovery_pass_needs_set(c, pass, &flags)) + return 0; - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); + ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); + bch2_write_super(c); + } - return bch2_run_explicit_recovery_pass(c, pass); + return ret; } -int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, - enum bch_recovery_pass pass) +/* + * Returns 0 if @pass has run recently, otherwise one of + * -BCH_ERR_restart_recovery + * -BCH_ERR_recovery_pass_will_run + */ +int bch2_require_recovery_pass(struct bch_fs *c, + struct printbuf *out, + enum bch_recovery_pass pass) { - enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); + if (test_bit(BCH_FS_in_recovery, &c->flags) && + c->recovery.passes_complete & BIT_ULL(pass)) + return 0; - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + guard(mutex)(&c->sb_lock); - if (!test_bit_le64(s, ext->recovery_passes_required)) { - __set_bit_le64(s, ext->recovery_passes_required); + if (bch2_recovery_pass_want_ratelimit(c, pass)) + return 0; + + enum bch_run_recovery_pass_flags flags = 0; + int ret = 0; + + if (recovery_pass_needs_set(c, pass, &flags)) { + ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); bch2_write_super(c); } - mutex_unlock(&c->sb_lock); - return bch2_run_explicit_recovery_pass(c, pass); + return ret ?: bch_err_throw(c, recovery_pass_will_run); } -static void bch2_clear_recovery_pass_required(struct bch_fs *c, - enum bch_recovery_pass pass) +int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { - enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); + enum bch_run_recovery_pass_flags flags = RUN_RECOVERY_PASS_nopersistent; - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + if (!recovery_pass_needs_set(c, pass, &flags)) + return 0; - if (test_bit_le64(s, ext->recovery_passes_required)) { - __clear_bit_le64(s, ext->recovery_passes_required); - bch2_write_super(c); - } - mutex_unlock(&c->sb_lock); -} + struct 
printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); -u64 bch2_fsck_recovery_passes(void) -{ - u64 ret = 0; + mutex_lock(&c->sb_lock); + int ret = __bch2_run_explicit_recovery_pass(c, &buf, pass, + RUN_RECOVERY_PASS_nopersistent); + mutex_unlock(&c->sb_lock); - for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) - if (recovery_pass_fns[i].when & PASS_FSCK) - ret |= BIT_ULL(i); + bch2_print_str(c, KERN_NOTICE, buf.buf); + printbuf_exit(&buf); return ret; } -static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) -{ - struct recovery_pass_fn *p = recovery_pass_fns + pass; - - if (c->opts.recovery_passes_exclude & BIT_ULL(pass)) - return false; - if (c->opts.recovery_passes & BIT_ULL(pass)) - return true; - if ((p->when & PASS_FSCK) && c->opts.fsck) - return true; - if ((p->when & PASS_UNCLEAN) && !c->sb.clean) - return true; - if (p->when & PASS_ALWAYS) - return true; - return false; -} - static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { + struct bch_fs_recovery *r = &c->recovery; struct recovery_pass_fn *p = recovery_pass_fns + pass; - int ret; if (!(p->when & PASS_SILENT)) bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), bch2_recovery_passes[pass]); - ret = p->fn(c); - if (ret) + + s64 start_time = ktime_get_real_seconds(); + int ret = p->fn(c); + + r->passes_to_run &= ~BIT_ULL(pass); + + if (ret) { + r->passes_failing |= BIT_ULL(pass); return ret; + } + + r->passes_failing = 0; + + if (!test_bit(BCH_FS_error, &c->flags)) + bch2_sb_recovery_pass_complete(c, pass, start_time); + if (!(p->when & PASS_SILENT)) bch2_print(c, KERN_CONT " done\n"); return 0; } -int bch2_run_online_recovery_passes(struct bch_fs *c) +static int __bch2_run_recovery_passes(struct bch_fs *c, u64 orig_passes_to_run, + bool online) { - for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { - struct recovery_pass_fn *p = recovery_pass_fns + i; - - if (!(p->when & PASS_ONLINE)) - continue; + struct bch_fs_recovery *r = &c->recovery; + int ret = 0; - int ret = bch2_run_recovery_pass(c, i); - if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { - i = c->curr_recovery_pass; - continue; - } - if (ret) - return ret; - } + spin_lock_irq(&r->lock); - return 0; -} + if (online) + orig_passes_to_run &= bch2_recovery_passes_match(PASS_ONLINE); -int bch2_run_recovery_passes(struct bch_fs *c) -{ - int ret = 0; + if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) + orig_passes_to_run &= ~bch2_recovery_passes_match(PASS_ALLOC); /* - * We can't allow set_may_go_rw to be excluded; that would cause us to - * use the journal replay keys for updates where it's not expected. + * A failed recovery pass will be retried after another pass succeeds - + * but not this iteration. + * + * This is because some passes depend on repair done by other passes: we + * may want to retry, but we don't want to loop on failing passes. 
*/ - c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; - spin_lock_irq(&c->recovery_pass_lock); + orig_passes_to_run &= ~r->passes_failing; - while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { - unsigned prev_done = c->recovery_pass_done; - unsigned pass = c->curr_recovery_pass; + r->passes_to_run = orig_passes_to_run; - c->next_recovery_pass = pass + 1; + while (r->passes_to_run) { + unsigned prev_done = r->pass_done; + unsigned pass = __ffs64(r->passes_to_run); + r->curr_pass = pass; + r->next_pass = r->curr_pass + 1; + r->passes_to_run &= ~BIT_ULL(pass); - if (c->opts.recovery_pass_last && - c->curr_recovery_pass > c->opts.recovery_pass_last) - break; + spin_unlock_irq(&r->lock); + + int ret2 = bch2_run_recovery_pass(c, pass) ?: + bch2_journal_flush(&c->journal); - if (should_run_recovery_pass(c, pass)) { - spin_unlock_irq(&c->recovery_pass_lock); - ret = bch2_run_recovery_pass(c, pass) ?: - bch2_journal_flush(&c->journal); - - if (!ret && !test_bit(BCH_FS_error, &c->flags)) - bch2_clear_recovery_pass_required(c, pass); - spin_lock_irq(&c->recovery_pass_lock); - - if (c->next_recovery_pass < c->curr_recovery_pass) { - /* - * bch2_run_explicit_recovery_pass() was called: we - * can't always catch -BCH_ERR_restart_recovery because - * it may have been called from another thread (btree - * node read completion) - */ - ret = 0; - c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass); - } else { - c->recovery_passes_complete |= BIT_ULL(pass); - c->recovery_pass_done = max(c->recovery_pass_done, pass); - } + spin_lock_irq(&r->lock); + + if (r->next_pass < r->curr_pass) { + /* Rewind: */ + r->passes_to_run |= orig_passes_to_run & (~0ULL << r->next_pass); + } else if (!ret2) { + r->pass_done = max(r->pass_done, pass); + r->passes_complete |= BIT_ULL(pass); + } else { + ret = ret2; } - c->curr_recovery_pass = c->next_recovery_pass; + if (ret && !online) + break; if (prev_done <= BCH_RECOVERY_PASS_check_snapshots && - c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots) { + r->pass_done > BCH_RECOVERY_PASS_check_snapshots) { bch2_copygc_wakeup(c); bch2_rebalance_wakeup(c); } } - spin_unlock_irq(&c->recovery_pass_lock); + clear_bit(BCH_FS_in_recovery, &c->flags); + spin_unlock_irq(&r->lock); return ret; } + +static void bch2_async_recovery_passes_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, recovery.work); + struct bch_fs_recovery *r = &c->recovery; + + __bch2_run_recovery_passes(c, + c->sb.recovery_passes_required & ~r->passes_ratelimiting, + true); + + up(&r->run_lock); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes); +} + +int bch2_run_online_recovery_passes(struct bch_fs *c, u64 passes) +{ + return __bch2_run_recovery_passes(c, c->sb.recovery_passes_required|passes, true); +} + +int bch2_run_recovery_passes(struct bch_fs *c, enum bch_recovery_pass from) +{ + u64 passes = + bch2_recovery_passes_match(PASS_ALWAYS) | + (!c->sb.clean ? bch2_recovery_passes_match(PASS_UNCLEAN) : 0) | + (c->opts.fsck ? bch2_recovery_passes_match(PASS_FSCK) : 0) | + c->opts.recovery_passes | + c->sb.recovery_passes_required; + + if (c->opts.recovery_pass_last) + passes &= BIT_ULL(c->opts.recovery_pass_last + 1) - 1; + + /* + * We can't allow set_may_go_rw to be excluded; that would cause us to + * use the journal replay keys for updates where it's not expected. 
+ */ + c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; + passes &= ~c->opts.recovery_passes_exclude; + + passes &= ~(BIT_ULL(from) - 1); + + down(&c->recovery.run_lock); + int ret = __bch2_run_recovery_passes(c, passes, false); + up(&c->recovery.run_lock); + + return ret; +} + +static void prt_passes(struct printbuf *out, const char *msg, u64 passes) +{ + prt_printf(out, "%s:\t", msg); + prt_bitflags(out, bch2_recovery_passes, passes); + prt_newline(out); +} + +void bch2_recovery_pass_status_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct bch_fs_recovery *r = &c->recovery; + + printbuf_tabstop_push(out, 32); + prt_passes(out, "Scheduled passes", c->sb.recovery_passes_required); + prt_passes(out, "Scheduled online passes", c->sb.recovery_passes_required & + bch2_recovery_passes_match(PASS_ONLINE)); + prt_passes(out, "Complete passes", r->passes_complete); + prt_passes(out, "Failing passes", r->passes_failing); + + if (r->curr_pass) { + prt_printf(out, "Current pass:\t%s\n", bch2_recovery_passes[r->curr_pass]); + prt_passes(out, "Current passes", r->passes_to_run); + } +} + +void bch2_fs_recovery_passes_init(struct bch_fs *c) +{ + spin_lock_init(&c->recovery.lock); + sema_init(&c->recovery.run_lock, 1); + + INIT_WORK(&c->recovery.work, bch2_async_recovery_passes_work); +} diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h index 7d7339c8fa29..260571c7105e 100644 --- a/fs/bcachefs/recovery_passes.h +++ b/fs/bcachefs/recovery_passes.h @@ -3,16 +3,37 @@ extern const char * const bch2_recovery_passes[]; +extern const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes; + u64 bch2_recovery_passes_to_stable(u64 v); u64 bch2_recovery_passes_from_stable(u64 v); u64 bch2_fsck_recovery_passes(void); -int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); -int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *, enum bch_recovery_pass); -int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass); +void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *, enum bch_recovery_pass); + +enum bch_run_recovery_pass_flags { + RUN_RECOVERY_PASS_nopersistent = BIT(0), + RUN_RECOVERY_PASS_ratelimit = BIT(1), +}; + +int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); + +int __bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *, + enum bch_recovery_pass, + enum bch_run_recovery_pass_flags); +int bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *, + enum bch_recovery_pass, + enum bch_run_recovery_pass_flags); + +int bch2_require_recovery_pass(struct bch_fs *, struct printbuf *, + enum bch_recovery_pass); + +int bch2_run_online_recovery_passes(struct bch_fs *, u64); +int bch2_run_recovery_passes(struct bch_fs *, enum bch_recovery_pass); + +void bch2_recovery_pass_status_to_text(struct printbuf *, struct bch_fs *); -int bch2_run_online_recovery_passes(struct bch_fs *); -int bch2_run_recovery_passes(struct bch_fs *); +void bch2_fs_recovery_passes_init(struct bch_fs *); #endif /* _BCACHEFS_RECOVERY_PASSES_H */ diff --git a/fs/bcachefs/recovery_passes_format.h b/fs/bcachefs/recovery_passes_format.h new file mode 100644 index 000000000000..b63c20558d3d --- /dev/null +++ b/fs/bcachefs/recovery_passes_format.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_RECOVERY_PASSES_FORMAT_H +#define _BCACHEFS_RECOVERY_PASSES_FORMAT_H + +#define PASS_SILENT BIT(0) +#define PASS_FSCK BIT(1) +#define PASS_UNCLEAN BIT(2) 
+#define PASS_ALWAYS BIT(3) +#define PASS_ONLINE BIT(4) +#define PASS_ALLOC BIT(5) +#define PASS_FSCK_ALLOC (PASS_FSCK|PASS_ALLOC) + +#ifdef CONFIG_BCACHEFS_DEBUG +#define PASS_FSCK_DEBUG BIT(1) +#else +#define PASS_FSCK_DEBUG 0 +#endif + +/* + * Passes may be reordered, but the second field is a persistent identifier and + * must never change: + */ +#define BCH_RECOVERY_PASSES() \ + x(recovery_pass_empty, 41, PASS_SILENT) \ + x(scan_for_btree_nodes, 37, 0) \ + x(check_topology, 4, 0) \ + x(accounting_read, 39, PASS_ALWAYS) \ + x(alloc_read, 0, PASS_ALWAYS) \ + x(stripes_read, 1, 0) \ + x(initialize_subvolumes, 2, 0) \ + x(snapshots_read, 3, PASS_ALWAYS) \ + x(check_allocations, 5, PASS_FSCK_ALLOC) \ + x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \ + x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \ + x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ + x(journal_replay, 9, PASS_ALWAYS) \ + x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK_ALLOC) \ + x(check_lrus, 11, PASS_ONLINE|PASS_FSCK_ALLOC) \ + x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK_ALLOC) \ + x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \ + x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK_ALLOC) \ + x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK_ALLOC) \ + x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ + x(bucket_gens_init, 17, 0) \ + x(reconstruct_snapshots, 38, 0) \ + x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ + x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ + x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ + x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ + x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ + x(fs_upgrade_for_subvolumes, 22, 0) \ + x(check_inodes, 24, PASS_FSCK) \ + x(check_extents, 25, PASS_FSCK) \ + x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \ + x(check_dirents, 27, PASS_FSCK) \ + x(check_xattrs, 28, PASS_FSCK) \ + x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ + x(check_unreachable_inodes, 40, PASS_FSCK) \ + x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ + x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ + x(check_nlinks, 31, PASS_FSCK) \ + x(check_rebalance_work, 43, PASS_ONLINE|PASS_FSCK) \ + x(resume_logged_ops, 23, PASS_ALWAYS) \ + x(delete_dead_inodes, 32, PASS_ALWAYS) \ + x(fix_reflink_p, 33, 0) \ + x(set_fs_needs_rebalance, 34, 0) \ + x(lookup_root_inode, 42, PASS_ALWAYS|PASS_SILENT) + +/* We normally enumerate recovery passes in the order we run them: */ +enum bch_recovery_pass { +#define x(n, id, when) BCH_RECOVERY_PASS_##n, + BCH_RECOVERY_PASSES() +#undef x + BCH_RECOVERY_PASS_NR +}; + +/* But we also need stable identifiers that can be used in the superblock */ +enum bch_recovery_pass_stable { +#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id, + BCH_RECOVERY_PASSES() +#undef x +}; + +struct recovery_pass_entry { + __le64 last_run; + __le32 last_runtime; + __le32 flags; +}; + +LE32_BITMASK(BCH_RECOVERY_PASS_NO_RATELIMIT, struct recovery_pass_entry, flags, 0, 1) + +struct bch_sb_field_recovery_passes { + struct bch_sb_field field; + struct recovery_pass_entry start[]; +}; + +static inline unsigned +recovery_passes_nr_entries(struct bch_sb_field_recovery_passes *r) +{ + return r + ? 
((vstruct_end(&r->field) - (void *) &r->start[0]) / + sizeof(struct recovery_pass_entry)) + : 0; +} + +#endif /* _BCACHEFS_RECOVERY_PASSES_FORMAT_H */ diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index e89b9c783285..aa9526938cc3 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -2,79 +2,26 @@ #ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H #define _BCACHEFS_RECOVERY_PASSES_TYPES_H -#define PASS_SILENT BIT(0) -#define PASS_FSCK BIT(1) -#define PASS_UNCLEAN BIT(2) -#define PASS_ALWAYS BIT(3) -#define PASS_ONLINE BIT(4) - -#ifdef CONFIG_BCACHEFS_DEBUG -#define PASS_FSCK_DEBUG BIT(1) -#else -#define PASS_FSCK_DEBUG 0 -#endif - -/* - * Passes may be reordered, but the second field is a persistent identifier and - * must never change: - */ -#define BCH_RECOVERY_PASSES() \ - x(recovery_pass_empty, 41, PASS_SILENT) \ - x(scan_for_btree_nodes, 37, 0) \ - x(check_topology, 4, 0) \ - x(accounting_read, 39, PASS_ALWAYS) \ - x(alloc_read, 0, PASS_ALWAYS) \ - x(stripes_read, 1, 0) \ - x(initialize_subvolumes, 2, 0) \ - x(snapshots_read, 3, PASS_ALWAYS) \ - x(check_allocations, 5, PASS_FSCK) \ - x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \ - x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ - x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ - x(journal_replay, 9, PASS_ALWAYS) \ - x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \ - x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \ - x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \ - x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \ - x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \ - x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ - x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ - x(bucket_gens_init, 17, 0) \ - x(reconstruct_snapshots, 38, 0) \ - x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ - x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ - x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ - x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ - x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ - x(fs_upgrade_for_subvolumes, 22, 0) \ - x(check_inodes, 24, PASS_FSCK) \ - x(check_extents, 25, PASS_FSCK) \ - x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \ - x(check_dirents, 27, PASS_FSCK) \ - x(check_xattrs, 28, PASS_FSCK) \ - x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ - x(check_unreachable_inodes, 40, PASS_FSCK) \ - x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ - x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ - x(check_nlinks, 31, PASS_FSCK) \ - x(resume_logged_ops, 23, PASS_ALWAYS) \ - x(delete_dead_inodes, 32, PASS_ALWAYS) \ - x(fix_reflink_p, 33, 0) \ - x(set_fs_needs_rebalance, 34, 0) - -/* We normally enumerate recovery passes in the order we run them: */ -enum bch_recovery_pass { -#define x(n, id, when) BCH_RECOVERY_PASS_##n, - BCH_RECOVERY_PASSES() -#undef x - BCH_RECOVERY_PASS_NR -}; - -/* But we also need stable identifiers that can be used in the superblock */ -enum bch_recovery_pass_stable { -#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id, - BCH_RECOVERY_PASSES() -#undef x +struct bch_fs_recovery { + /* + * Two different uses: + * "Has this fsck pass?" - i.e. should this type of error be an + * emergency read-only + * And, in certain situations fsck will rewind to an earlier pass: used + * for signaling to the toplevel code which pass we want to run now. 
+ */ + enum bch_recovery_pass curr_pass; + enum bch_recovery_pass next_pass; + /* never rewinds version of curr_pass */ + enum bch_recovery_pass pass_done; + u64 passes_to_run; + /* bitmask of recovery passes that we actually ran */ + u64 passes_complete; + u64 passes_failing; + u64 passes_ratelimiting; + spinlock_t lock; + struct semaphore run_lock; + struct work_struct work; }; #endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 710178e3da4c..a535abd44df3 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -3,6 +3,7 @@ #include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" +#include "enumerated_ref.h" #include "error.h" #include "extents.h" #include "inode.h" @@ -311,7 +312,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, if (!bkey_refcount_c(k)) { if (!(flags & BTREE_TRIGGER_overwrite)) - ret = -BCH_ERR_missing_indirect_extent; + ret = bch_err_throw(c, missing_indirect_extent); goto next; } @@ -610,8 +611,8 @@ s64 bch2_remap_range(struct bch_fs *c, !bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); int ret = 0, ret2 = 0; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink)) - return -BCH_ERR_erofs_no_writes; + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_reflink)) + return bch_err_throw(c, erofs_no_writes); bch2_check_set_feature(c, BCH_FEATURE_reflink); @@ -710,7 +711,8 @@ s64 bch2_remap_range(struct bch_fs *c, SET_REFLINK_P_IDX(&dst_p->v, offset); if (reflink_p_may_update_opts_field && - may_change_src_io_path_opts) + may_change_src_io_path_opts && + REFLINK_P_MAY_UPDATE_OPTIONS(src_p.v)) SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true); } else { BUG(); @@ -761,7 +763,7 @@ err: bch2_bkey_buf_exit(&new_src, c); bch2_bkey_buf_exit(&new_dst, c); - bch2_write_ref_put(c, BCH_WRITE_REF_reflink); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_reflink); return dst_done ?: ret ?: ret2; } @@ -846,7 +848,7 @@ int bch2_gc_reflink_start(struct bch_fs *c) struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, GFP_KERNEL); if (!r) { - ret = -BCH_ERR_ENOMEM_gc_reflink_start; + ret = bch_err_throw(c, ENOMEM_gc_reflink_start); break; } diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 477ef0997949..8383bd7fdb3f 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -119,7 +119,7 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, return 0; bad: bch2_replicas_entry_to_text(err, r); - return -BCH_ERR_invalid_replicas_entry; + return bch_err_throw(c, invalid_replicas_entry); } void bch2_cpu_replicas_to_text(struct printbuf *out, @@ -311,7 +311,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, !__replicas_has_entry(&c->replicas_gc, new_entry)) { new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry); if (!new_gc.entries) { - ret = -BCH_ERR_ENOMEM_cpu_replicas; + ret = bch_err_throw(c, ENOMEM_cpu_replicas); goto err; } } @@ -319,7 +319,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, if (!__replicas_has_entry(&c->replicas, new_entry)) { new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry); if (!new_r.entries) { - ret = -BCH_ERR_ENOMEM_cpu_replicas; + ret = bch_err_throw(c, ENOMEM_cpu_replicas); goto err; } @@ -422,7 +422,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) if (!c->replicas_gc.entries) { mutex_unlock(&c->sb_lock); bch_err(c, "error allocating c->replicas_gc"); - return -BCH_ERR_ENOMEM_replicas_gc; + 
return bch_err_throw(c, ENOMEM_replicas_gc); } for_each_cpu_replicas_entry(&c->replicas, e) @@ -458,7 +458,7 @@ retry: new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); if (!new.entries) { bch_err(c, "error allocating c->replicas_gc"); - return -BCH_ERR_ENOMEM_replicas_gc; + return bch_err_throw(c, ENOMEM_replicas_gc); } mutex_lock(&c->sb_lock); @@ -622,7 +622,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0, DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) - return -BCH_ERR_ENOSPC_sb_replicas; + return bch_err_throw(c, ENOSPC_sb_replicas); bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0); @@ -667,7 +667,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, sb_r = bch2_sb_field_resize(&c->disk_sb, replicas, DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) - return -BCH_ERR_ENOSPC_sb_replicas; + return bch_err_throw(c, ENOSPC_sb_replicas); bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas); @@ -819,19 +819,18 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, if (e->data_type == BCH_DATA_cached) continue; - rcu_read_lock(); - for (unsigned i = 0; i < e->nr_devs; i++) { - if (e->devs[i] == BCH_SB_MEMBER_INVALID) { - nr_failed++; - continue; - } + scoped_guard(rcu) + for (unsigned i = 0; i < e->nr_devs; i++) { + if (e->devs[i] == BCH_SB_MEMBER_INVALID) { + nr_failed++; + continue; + } - nr_online += test_bit(e->devs[i], devs.d); + nr_online += test_bit(e->devs[i], devs.d); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]); - nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed; - } - rcu_read_unlock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]); + nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed; + } if (nr_online + nr_failed == e->nr_devs) continue; diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h index fa27ec59a647..b868702a431a 100644 --- a/fs/bcachefs/sb-counters_format.h +++ b/fs/bcachefs/sb-counters_format.h @@ -16,6 +16,7 @@ enum counters_flags { x(io_read_split, 33, TYPE_COUNTER) \ x(io_read_reuse_race, 34, TYPE_COUNTER) \ x(io_read_retry, 32, TYPE_COUNTER) \ + x(io_read_fail_and_poison, 82, TYPE_COUNTER) \ x(io_write, 1, TYPE_SECTORS) \ x(io_move, 2, TYPE_SECTORS) \ x(io_move_read, 35, TYPE_SECTORS) \ @@ -24,6 +25,8 @@ enum counters_flags { x(io_move_fail, 38, TYPE_COUNTER) \ x(io_move_write_fail, 82, TYPE_COUNTER) \ x(io_move_start_fail, 39, TYPE_COUNTER) \ + x(io_move_created_rebalance, 83, TYPE_COUNTER) \ + x(io_move_evacuate_bucket, 84, TYPE_COUNTER) \ x(bucket_invalidate, 3, TYPE_COUNTER) \ x(bucket_discard, 4, TYPE_COUNTER) \ x(bucket_discard_fast, 79, TYPE_COUNTER) \ diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index badd0e17ada5..b61f88450a6d 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -100,7 +100,11 @@ BCH_FSCK_ERR_ptr_to_missing_backpointer) \ x(stripe_backpointers, \ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ - BCH_FSCK_ERR_ptr_to_missing_backpointer) + BCH_FSCK_ERR_ptr_to_missing_backpointer) \ + x(inode_has_case_insensitive, \ + BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ + BCH_FSCK_ERR_inode_has_case_insensitive_not_set, \ + BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ @@ -374,6 +378,9 @@ int bch2_sb_downgrade_update(struct bch_fs *c) 
if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version))) continue; + if (src->version < c->sb.version_incompat) + continue; + struct bch_sb_field_downgrade_entry *dst; unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * src->nr_errors; @@ -410,7 +417,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c) d = bch2_sb_field_resize(&c->disk_sb, downgrade, sb_u64s); if (!d) { - ret = -BCH_ERR_ENOSPC_sb_downgrade; + ret = bch_err_throw(c, ENOSPC_sb_downgrade); goto out; } diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c index 013a96883b4e..48853efdc105 100644 --- a/fs/bcachefs/sb-errors.c +++ b/fs/bcachefs/sb-errors.c @@ -78,6 +78,28 @@ const struct bch_sb_field_ops bch_sb_field_ops_errors = { .to_text = bch2_sb_errors_to_text, }; +void bch2_fs_errors_to_text(struct printbuf *out, struct bch_fs *c) +{ + if (out->nr_tabstops < 1) + printbuf_tabstop_push(out, 48); + if (out->nr_tabstops < 2) + printbuf_tabstop_push(out, 8); + if (out->nr_tabstops < 3) + printbuf_tabstop_push(out, 16); + + guard(mutex)(&c->fsck_error_counts_lock); + + bch_sb_errors_cpu *e = &c->fsck_error_counts; + darray_for_each(*e, i) { + bch2_sb_error_id_to_text(out, i->id); + prt_tab(out); + prt_u64(out, i->nr); + prt_tab(out); + bch2_prt_datetime(out, i->last_error_time); + prt_newline(out); + } +} + void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err) { bch_sb_errors_cpu *e = &c->fsck_error_counts; diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h index b2357b8e6107..e86267264692 100644 --- a/fs/bcachefs/sb-errors.h +++ b/fs/bcachefs/sb-errors.h @@ -7,6 +7,7 @@ extern const char * const bch2_sb_error_strs[]; void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id); +void bch2_fs_errors_to_text(struct printbuf *, struct bch_fs *); extern const struct bch_sb_field_ops bch_sb_field_ops_errors; diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 4036a20c6adc..6fdbf265e4c0 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -209,7 +209,7 @@ enum bch_fsck_flags { x(subvol_to_missing_root, 188, 0) \ x(subvol_root_wrong_bi_subvol, 189, FSCK_AUTOFIX) \ x(bkey_in_missing_snapshot, 190, 0) \ - x(bkey_in_deleted_snapshot, 315, 0) \ + x(bkey_in_deleted_snapshot, 315, FSCK_AUTOFIX) \ x(inode_pos_inode_nonzero, 191, 0) \ x(inode_pos_blockdev_range, 192, 0) \ x(inode_alloc_cursor_inode_bad, 301, 0) \ @@ -232,6 +232,7 @@ enum bch_fsck_flags { x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \ x(inode_dir_missing_backpointer, 284, FSCK_AUTOFIX) \ x(inode_dir_unlinked_but_not_empty, 286, FSCK_AUTOFIX) \ + x(inode_dir_has_nonzero_i_size, 319, FSCK_AUTOFIX) \ x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \ x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \ x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ @@ -243,6 +244,7 @@ enum bch_fsck_flags { x(inode_parent_has_case_insensitive_not_set, 317, FSCK_AUTOFIX) \ x(vfs_inode_i_blocks_underflow, 311, FSCK_AUTOFIX) \ x(vfs_inode_i_blocks_not_zero_at_truncate, 313, FSCK_AUTOFIX) \ + x(vfs_bad_inode_rm, 320, 0) \ x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ @@ -328,7 +330,7 @@ enum bch_fsck_flags { x(dirent_stray_data_after_cf_name, 305, 0) \ x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \ x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \ - x(MAX, 319, 0) + x(MAX, 321, 0) enum bch_sb_error_id { #define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 72779912939b..363eb0c6eb7c 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -5,11 +5,31 @@ #include "disk_groups.h" #include "error.h" #include "opts.h" +#include "recovery_passes.h" #include "replicas.h" #include "sb-members.h" #include "super-io.h" -void bch2_dev_missing(struct bch_fs *c, unsigned dev) +int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev) +{ + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "pointer to nonexistent device %u in key\n", dev); + bch2_bkey_val_to_text(&buf, c, k); + + bool print = bch2_count_fsck_err(c, ptr_to_invalid_device, &buf); + + int ret = bch2_run_explicit_recovery_pass(c, &buf, + BCH_RECOVERY_PASS_check_allocations, 0); + + if (print) + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + return ret; +} + +void bch2_dev_missing_atomic(struct bch_fs *c, unsigned dev) { if (dev != BCH_SB_MEMBER_INVALID) bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev); @@ -81,7 +101,7 @@ static int sb_members_v2_resize_entries(struct bch_fs *c) mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); if (!mi) - return -BCH_ERR_ENOSPC_sb_members_v2; + return bch_err_throw(c, ENOSPC_sb_members_v2); for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) { void *dst = (void *) mi->_members + (i * sizeof(struct bch_member)); @@ -119,6 +139,11 @@ int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb) struct bch_sb_field_members_v1 *mi1; struct bch_sb_field_members_v2 *mi2; + if (BCH_SB_VERSION_INCOMPAT(disk_sb->sb) > bcachefs_metadata_version_extent_flags) { + bch2_sb_field_resize(disk_sb, members_v1, 0); + return 0; + } + mi1 = bch2_sb_field_resize(disk_sb, members_v1, DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES * disk_sb->sb->nr_devices, sizeof(u64))); @@ -170,6 +195,12 @@ static int validate_member(struct printbuf *err, return -BCH_ERR_invalid_sb_members; } + if (BCH_MEMBER_FREESPACE_INITIALIZED(&m) && + sb->features[0] & cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info))) { + prt_printf(err, "device %u: freespace initialized but fs has no alloc info", i); + return -BCH_ERR_invalid_sb_members; + } + return 0; } @@ -191,17 +222,11 @@ static void member_to_text(struct printbuf *out, printbuf_indent_add(out, 2); prt_printf(out, "Label:\t"); - if (BCH_MEMBER_GROUP(&m)) { - unsigned idx = BCH_MEMBER_GROUP(&m) - 1; - - if (idx < disk_groups_nr(gi)) - prt_printf(out, "%s (%u)", - gi->entries[idx].label, idx); - else - prt_printf(out, "(bad disk labels section)"); - } else { + if (BCH_MEMBER_GROUP(&m)) + bch2_disk_path_to_text_sb(out, sb, + BCH_MEMBER_GROUP(&m) - 1); + else prt_printf(out, "(none)"); - } prt_newline(out); prt_printf(out, "UUID:\t"); @@ -268,6 +293,7 @@ static void member_to_text(struct printbuf *out, prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m)); prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m)); + prt_printf(out, "Resize on mount:\t%llu\n", BCH_MEMBER_RESIZE_ON_MOUNT(&m)); printbuf_indent_sub(out, 2); } @@ -352,14 +378,13 @@ void bch2_sb_members_from_cpu(struct bch_fs *c) { struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - rcu_read_lock(); + guard(rcu)(); for_each_member_device_rcu(c, ca, NULL) { struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx); for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++) m->errors[e] = 
cpu_to_le64(atomic64_read(&ca->errors[e])); } - rcu_read_unlock(); } void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) @@ -417,20 +442,14 @@ void bch2_dev_errors_reset(struct bch_dev *ca) bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k) { - bool ret = true; - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (!ca) - continue; - - if (!bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) { - ret = false; - break; - } + if (ca && + !bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) + return false; } - rcu_read_unlock(); - return ret; + return true; } static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev, @@ -493,6 +512,7 @@ int bch2_sb_member_alloc(struct bch_fs *c) unsigned u64s; int best = -1; u64 best_last_mount = 0; + unsigned nr_deleted = 0; if (dev_idx < BCH_SB_MEMBERS_MAX) goto have_slot; @@ -503,7 +523,10 @@ int bch2_sb_member_alloc(struct bch_fs *c) continue; struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx); - if (bch2_member_alive(&m)) + + nr_deleted += uuid_equal(&m.uuid, &BCH_SB_MEMBER_DELETED_UUID); + + if (!bch2_is_zero(&m.uuid, sizeof(m.uuid))) continue; u64 last_mount = le64_to_cpu(m.last_mount); @@ -517,6 +540,10 @@ int bch2_sb_member_alloc(struct bch_fs *c) goto have_slot; } + if (nr_deleted) + bch_err(c, "unable to allocate new member, but have %u deleted: run fsck", + nr_deleted); + return -BCH_ERR_ENOSPC_sb_members; have_slot: nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); @@ -532,3 +559,22 @@ have_slot: c->disk_sb.sb->nr_devices = nr_devices; return dev_idx; } + +void bch2_sb_members_clean_deleted(struct bch_fs *c) +{ + mutex_lock(&c->sb_lock); + bool write_sb = false; + + for (unsigned i = 0; i < c->sb.nr_devices; i++) { + struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, i); + + if (uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID)) { + memset(&m->uuid, 0, sizeof(m->uuid)); + write_sb = true; + } + } + + if (write_sb) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 42786657522c..8d8a8a857648 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -4,6 +4,7 @@ #include "darray.h" #include "bkey_types.h" +#include "enumerated_ref.h" extern char * const bch2_member_error_strs[]; @@ -20,19 +21,16 @@ struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i); static inline bool bch2_dev_is_online(struct bch_dev *ca) { - return !percpu_ref_is_zero(&ca->io_ref[READ]); + return !enumerated_ref_is_zero(&ca->io_ref[READ]); } static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned); static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev) { - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu(c, dev); - bool ret = ca && bch2_dev_is_online(ca); - rcu_read_unlock(); - - return ret; + return ca && bch2_dev_is_online(ca); } static inline bool bch2_dev_is_healthy(struct bch_dev *ca) @@ -104,6 +102,12 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev * for (struct bch_dev *_ca = NULL; \ (_ca = __bch2_next_dev((_c), _ca, (_mask)));) +#define for_each_online_member_rcu(_c, _ca) \ + for_each_member_device_rcu(_c, _ca, &(_c)->online_devs) + +#define for_each_rw_member_rcu(_c, _ca) \ + for_each_member_device_rcu(_c, _ca, &(_c)->rw_devs[BCH_DATA_free]) + static inline void 
bch2_dev_get(struct bch_dev *ca) { #ifdef CONFIG_BCACHEFS_DEBUG @@ -135,12 +139,10 @@ static inline void bch2_dev_put(struct bch_dev *ca) static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca) { - rcu_read_lock(); + guard(rcu)(); bch2_dev_put(ca); if ((ca = __bch2_next_dev(c, ca, NULL))) bch2_dev_get(ca); - rcu_read_unlock(); - return ca; } @@ -157,33 +159,32 @@ static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, struct bch_dev *ca, unsigned state_mask, - int rw) + int rw, unsigned ref_idx) { - rcu_read_lock(); + guard(rcu)(); if (ca) - percpu_ref_put(&ca->io_ref[rw]); + enumerated_ref_put(&ca->io_ref[rw], ref_idx); while ((ca = __bch2_next_dev(c, ca, NULL)) && (!((1 << ca->mi.state) & state_mask) || - !percpu_ref_tryget(&ca->io_ref[rw]))) + !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx))) ; - rcu_read_unlock(); return ca; } -#define __for_each_online_member(_c, _ca, state_mask, rw) \ +#define __for_each_online_member(_c, _ca, state_mask, rw, ref_idx) \ for (struct bch_dev *_ca = NULL; \ - (_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw));) + (_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw, ref_idx));) -#define for_each_online_member(c, ca) \ - __for_each_online_member(c, ca, ~0, READ) +#define for_each_online_member(c, ca, ref_idx) \ + __for_each_online_member(c, ca, ~0, READ, ref_idx) -#define for_each_rw_member(c, ca) \ - __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE) +#define for_each_rw_member(c, ca, ref_idx) \ + __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE, ref_idx) -#define for_each_readable_member(c, ca) \ - __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ) +#define for_each_readable_member(c, ca, ref_idx) \ + __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ, ref_idx) static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev) { @@ -218,23 +219,24 @@ static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned de : NULL; } -void bch2_dev_missing(struct bch_fs *, unsigned); +int bch2_dev_missing_bkey(struct bch_fs *, struct bkey_s_c, unsigned); + +void bch2_dev_missing_atomic(struct bch_fs *, unsigned); static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev) { struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev); if (unlikely(!ca)) - bch2_dev_missing(c, dev); + bch2_dev_missing_atomic(c, dev); return ca; } static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev) { - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev); if (ca) bch2_dev_get(ca); - rcu_read_unlock(); return ca; } @@ -242,7 +244,7 @@ static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev) { struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); if (unlikely(!ca)) - bch2_dev_missing(c, dev); + bch2_dev_missing_atomic(c, dev); return ca; } @@ -285,43 +287,31 @@ static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev return bch2_dev_tryget(c, dev_idx); } -static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw) +static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, + int rw, unsigned ref_idx) { might_sleep(); - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu(c, dev); - if (ca && !percpu_ref_tryget(&ca->io_ref[rw])) - ca = NULL; - 
rcu_read_unlock(); + if (!ca || !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx)) + return NULL; - if (ca && - (ca->mi.state == BCH_MEMBER_STATE_rw || - (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))) + if (ca->mi.state == BCH_MEMBER_STATE_rw || + (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) return ca; - if (ca) - percpu_ref_put(&ca->io_ref[rw]); + enumerated_ref_put(&ca->io_ref[rw], ref_idx); return NULL; } -/* XXX kill, move to struct bch_fs */ -static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) -{ - struct bch_devs_mask devs; - - memset(&devs, 0, sizeof(devs)); - for_each_online_member(c, ca) - __set_bit(ca->dev_idx, devs.d); - return devs; -} - extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1; extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2; static inline bool bch2_member_alive(struct bch_member *m) { - return !bch2_is_zero(&m->uuid, sizeof(m->uuid)); + return !bch2_is_zero(&m->uuid, sizeof(m->uuid)) && + !uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID); } static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev) @@ -351,6 +341,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) ? BCH_MEMBER_DURABILITY(mi) - 1 : 1, .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), + .resize_on_mount = BCH_MEMBER_RESIZE_ON_MOUNT(mi), .valid = bch2_member_alive(mi), .btree_bitmap_shift = mi->btree_bitmap_shift, .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap), @@ -381,5 +372,6 @@ bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c); void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c); int bch2_sb_member_alloc(struct bch_fs *); +void bch2_sb_members_clean_deleted(struct bch_fs *); #endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h index 3affec823b3f..fb72ad730518 100644 --- a/fs/bcachefs/sb-members_format.h +++ b/fs/bcachefs/sb-members_format.h @@ -13,6 +13,10 @@ */ #define BCH_SB_MEMBER_INVALID 255 +#define BCH_SB_MEMBER_DELETED_UUID \ + UUID_INIT(0xffffffff, 0xffff, 0xffff, \ + 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) + #define BCH_MIN_NR_NBUCKETS (1 << 6) #define BCH_IOPS_MEASUREMENTS() \ @@ -88,6 +92,8 @@ LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28) LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30) LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, struct bch_member, flags, 30, 31) +LE64_BITMASK(BCH_MEMBER_RESIZE_ON_MOUNT, + struct bch_member, flags, 31, 32) #if 0 LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h index c0eda888fe39..d6443e186872 100644 --- a/fs/bcachefs/sb-members_types.h +++ b/fs/bcachefs/sb-members_types.h @@ -13,6 +13,7 @@ struct bch_member_cpu { u8 data_allowed; u8 durability; u8 freespace_initialized; + u8 resize_on_mount; u8 valid; u8 btree_bitmap_shift; u64 btree_allocated_bitmap; diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 7c403427fbdb..538c324f4765 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -339,12 +339,9 @@ static inline bool six_owner_running(struct six_lock *lock) * acquiring the lock and setting the owner field. If we're an RT task * that will live-lock because we won't let the owner complete. */ - rcu_read_lock(); + guard(rcu)(); struct task_struct *owner = READ_ONCE(lock->owner); - bool ret = owner ? 
owner_on_cpu(owner) : !rt_or_dl_task(current); - rcu_read_unlock(); - - return ret; + return owner ? owner_on_cpu(owner) : !rt_or_dl_task(current); } static inline bool six_optimistic_spin(struct six_lock *lock, diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index fec569c7deb1..23a332d76b32 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1,11 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bbpos.h" #include "bkey_buf.h" #include "btree_cache.h" #include "btree_key_cache.h" #include "btree_update.h" #include "buckets.h" +#include "enumerated_ref.h" #include "errcode.h" #include "error.h" #include "fs.h" @@ -52,7 +54,7 @@ int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, BTREE_ITER_with_updates, snapshot_tree, s); if (bch2_err_matches(ret, ENOENT)) - ret = -BCH_ERR_ENOENT_snapshot_tree; + ret = bch_err_throw(trans->c, ENOENT_snapshot_tree); return ret; } @@ -65,7 +67,7 @@ __bch2_snapshot_tree_create(struct btree_trans *trans) struct bkey_i_snapshot_tree *s_t; if (ret == -BCH_ERR_ENOSPC_btree_slot) - ret = -BCH_ERR_ENOSPC_snapshot_tree; + ret = bch_err_throw(trans->c, ENOSPC_snapshot_tree); if (ret) return ERR_PTR(ret); @@ -103,11 +105,8 @@ static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) { - rcu_read_lock(); - bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor); - rcu_read_unlock(); - - return ret; + guard(rcu)(); + return __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor); } static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) @@ -138,13 +137,11 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) { bool ret; - rcu_read_lock(); + guard(rcu)(); struct snapshot_table *t = rcu_dereference(c->snapshots); - if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) { - ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor); - goto out; - } + if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots)) + return __bch2_snapshot_is_ancestor_early(t, id, ancestor); if (likely(ancestor >= IS_ANCESTOR_BITMAP)) while (id && id < ancestor - IS_ANCESTOR_BITMAP) @@ -155,9 +152,6 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) : id == ancestor; EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor)); -out: - rcu_read_unlock(); - return ret; } @@ -209,9 +203,14 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); - prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u", - BCH_SNAPSHOT_SUBVOL(s.v), - BCH_SNAPSHOT_DELETED(s.v), + if (BCH_SNAPSHOT_SUBVOL(s.v)) + prt_str(out, "subvol "); + if (BCH_SNAPSHOT_WILL_DELETE(s.v)) + prt_str(out, "will_delete "); + if (BCH_SNAPSHOT_DELETED(s.v)) + prt_str(out, "deleted "); + + prt_printf(out, "parent %10u children %10u %10u subvol %u tree %u", le32_to_cpu(s.v->parent), le32_to_cpu(s.v->children[0]), le32_to_cpu(s.v->children[1]), @@ -281,6 +280,16 @@ fsck_err: return ret; } +static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id) +{ + mutex_lock(&c->snapshot_table_lock); + int ret = snapshot_t_mut(c, id) + ? 
0 + : bch_err_throw(c, ENOMEM_mark_snapshot); + mutex_unlock(&c->snapshot_table_lock); + return ret; +} + static int __bch2_mark_snapshot(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s_c new, @@ -295,14 +304,16 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, t = snapshot_t_mut(c, id); if (!t) { - ret = -BCH_ERR_ENOMEM_mark_snapshot; + ret = bch_err_throw(c, ENOMEM_mark_snapshot); goto err; } if (new.k->type == KEY_TYPE_snapshot) { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); - t->live = true; + t->state = !BCH_SNAPSHOT_DELETED(s.v) + ? SNAPSHOT_ID_live + : SNAPSHOT_ID_deleted; t->parent = le32_to_cpu(s.v->parent); t->children[0] = le32_to_cpu(s.v->children[0]); t->children[1] = le32_to_cpu(s.v->children[1]); @@ -327,9 +338,9 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, parent - id - 1 < IS_ANCESTOR_BITMAP) __set_bit(parent - id - 1, t->is_ancestor); - if (BCH_SNAPSHOT_DELETED(s.v)) { + if (BCH_SNAPSHOT_WILL_DELETE(s.v)) { set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots) + if (c->recovery.pass_done > BCH_RECOVERY_PASS_delete_dead_snapshots) bch2_delete_dead_snapshots_async(c); } } else { @@ -390,21 +401,29 @@ static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) return 0; } -u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) +u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root, + snapshot_id_list *skip) { - u32 id = snapshot_root; - u32 subvol = 0, s; - - rcu_read_lock(); + guard(rcu)(); + u32 id, subvol = 0, s; +retry: + id = snapshot_root; while (id && bch2_snapshot_exists(c, id)) { - s = snapshot_t(c, id)->subvol; - - if (s && (!subvol || s < subvol)) - subvol = s; + if (!(skip && snapshot_list_has_id(skip, id))) { + s = snapshot_t(c, id)->subvol; + if (s && (!subvol || s < subvol)) + subvol = s; + } id = bch2_snapshot_tree_next(c, id); + if (id == snapshot_root) + break; + } + + if (!subvol && skip) { + skip = NULL; + goto retry; } - rcu_read_unlock(); return subvol; } @@ -437,7 +456,7 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, if (!ret && !found) { struct bkey_i_subvolume *u; - *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root); + *subvol_id = bch2_snapshot_oldest_subvol(c, snapshot_root, NULL); u = bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_subvolumes, POS(0, *subvol_id), @@ -589,18 +608,14 @@ static int snapshot_tree_ptr_good(struct btree_trans *trans, u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id) { - const struct snapshot_t *s; - if (!id) return 0; - rcu_read_lock(); - s = snapshot_t(c, id); - if (s->parent) - id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)); - rcu_read_unlock(); - - return id; + guard(rcu)(); + const struct snapshot_t *s = snapshot_t(c, id); + return s->parent + ? 
bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)) + : id; } static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s) @@ -654,7 +669,7 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans, u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot); ret = PTR_ERR_OR_ZERO(u) ?: bch2_snapshot_tree_create(trans, root_id, - bch2_snapshot_tree_oldest_subvol(c, root_id), + bch2_snapshot_oldest_subvol(c, root_id, NULL), &tree_id); if (ret) goto err; @@ -699,6 +714,9 @@ static int check_snapshot(struct btree_trans *trans, memset(&s, 0, sizeof(s)); memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k))); + if (BCH_SNAPSHOT_DELETED(&s)) + return 0; + id = le32_to_cpu(s.parent); if (id) { ret = bch2_snapshot_lookup(trans, id, &v); @@ -736,7 +754,7 @@ static int check_snapshot(struct btree_trans *trans, } bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && - !BCH_SNAPSHOT_DELETED(&s); + !BCH_SNAPSHOT_WILL_DELETE(&s); if (should_have_subvol) { id = le32_to_cpu(s.subvol); @@ -887,9 +905,8 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) } bch2_trans_iter_exit(trans, &iter); - return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: - bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, - bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0); + return bch2_snapshot_table_make_room(c, id) ?: + bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0); } /* Figure out which snapshot nodes belong in the same tree: */ @@ -917,10 +934,7 @@ static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpo static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r) { - darray_for_each(*l, i) - if (snapshot_list_has_id(r, *i)) - return true; - return false; + return darray_find_p(*l, i, snapshot_list_has_id(r, *i)) != NULL; } static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s) @@ -987,12 +1001,12 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) snapshot_id_list_to_text(&buf, t); darray_for_each(*t, id) { - if (fsck_err_on(!bch2_snapshot_exists(c, *id), + if (fsck_err_on(bch2_snapshot_id_state(c, *id) == SNAPSHOT_ID_empty, trans, snapshot_node_missing, "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { if (t->nr > 1) { bch_err(c, "cannot reconstruct snapshot trees with multiple nodes"); - ret = -BCH_ERR_fsck_repair_unimplemented; + ret = bch_err_throw(c, fsck_repair_unimplemented); goto err; } @@ -1012,27 +1026,92 @@ err: return ret; } -int bch2_check_key_has_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) +int __bch2_check_key_has_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; int ret = 0; + enum snapshot_id_state state = bch2_snapshot_id_state(c, k.k->p.snapshot); - if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot), - trans, bkey_in_missing_snapshot, - "key in missing snapshot %s, delete?", + /* Snapshot was definitively deleted, this error is marked autofix */ + if (fsck_err_on(state == SNAPSHOT_ID_deleted, + trans, bkey_in_deleted_snapshot, + "key in deleted snapshot %s, delete?", (bch2_btree_id_to_text(&buf, iter->btree_id), prt_char(&buf, ' '), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node) ?: 1; + BTREE_UPDATE_internal_snapshot_node) ?: 1; + + if (state == SNAPSHOT_ID_empty) { + /* 
+ * Snapshot missing: we should have caught this with btree_lost_data and + * kicked off reconstruct_snapshots, so if we end up here we have no + * idea what happened. + * + * Do not delete unless we know that subvolumes and snapshots + * are consistent: + * + * XXX: + * + * We could be smarter here, and instead of using the generic + * recovery pass ratelimiting, track if there have been any + * changes to the snapshots or inodes btrees since those passes + * last ran. + */ + ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_snapshots) ?: ret; + ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_subvols) ?: ret; + + if (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)) + ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; + + unsigned repair_flags = FSCK_CAN_IGNORE | (!ret ? FSCK_CAN_FIX : 0); + + if (__fsck_err(trans, repair_flags, bkey_in_missing_snapshot, + "key in missing snapshot %s, delete?", + (bch2_btree_id_to_text(&buf, iter->btree_id), + prt_char(&buf, ' '), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node) ?: 1; + } + } fsck_err: printbuf_exit(&buf); return ret; } +int __bch2_get_snapshot_overwrites(struct btree_trans *trans, + enum btree_id btree, struct bpos pos, + snapshot_id_list *s) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_reverse_norestart(trans, iter, btree, bpos_predecessor(pos), + BTREE_ITER_all_snapshots, k, ret) { + if (!bkey_eq(k.k->p, pos)) + break; + + if (!bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) || + snapshot_list_has_ancestor(c, s, k.k->p.snapshot)) + continue; + + ret = snapshot_list_add(c, s, k.k->p.snapshot); + if (ret) + break; + } + bch2_trans_iter_exit(trans, &iter); + if (ret) + darray_exit(s); + + return ret; +} + /* * Mark a snapshot as deleted, for future cleanup: */ @@ -1051,10 +1130,10 @@ int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) } /* already deleted? 
*/ - if (BCH_SNAPSHOT_DELETED(&s->v)) + if (BCH_SNAPSHOT_WILL_DELETE(&s->v)) goto err; - SET_BCH_SNAPSHOT_DELETED(&s->v, true); + SET_BCH_SNAPSHOT_WILL_DELETE(&s->v, true); SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); s->v.subvol = 0; err: @@ -1074,24 +1153,25 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) struct btree_iter iter, p_iter = {}; struct btree_iter c_iter = {}; struct btree_iter tree_iter = {}; - struct bkey_s_c_snapshot s; u32 parent_id, child_id; unsigned i; int ret = 0; - s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_intent, snapshot); - ret = bkey_err(s); + struct bkey_i_snapshot *s = + bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_intent, snapshot); + ret = PTR_ERR_OR_ZERO(s); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "missing snapshot %u", id); if (ret) goto err; - BUG_ON(s.v->children[1]); + BUG_ON(BCH_SNAPSHOT_DELETED(&s->v)); + BUG_ON(s->v.children[1]); - parent_id = le32_to_cpu(s.v->parent); - child_id = le32_to_cpu(s.v->children[0]); + parent_id = le32_to_cpu(s->v.parent); + child_id = le32_to_cpu(s->v.children[0]); if (parent_id) { struct bkey_i_snapshot *parent; @@ -1149,24 +1229,38 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) */ struct bkey_i_snapshot_tree *s_t; - BUG_ON(s.v->children[1]); + BUG_ON(s->v.children[1]); s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, - BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)), + BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s->v.tree)), 0, snapshot_tree); ret = PTR_ERR_OR_ZERO(s_t); if (ret) goto err; - if (s.v->children[0]) { - s_t->v.root_snapshot = s.v->children[0]; + if (s->v.children[0]) { + s_t->v.root_snapshot = s->v.children[0]; } else { s_t->k.type = KEY_TYPE_deleted; set_bkey_val_u64s(&s_t->k, 0); } } - ret = bch2_btree_delete_at(trans, &iter, 0); + if (!bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)) { + SET_BCH_SNAPSHOT_DELETED(&s->v, true); + s->v.parent = 0; + s->v.children[0] = 0; + s->v.children[1] = 0; + s->v.subvol = 0; + s->v.tree = 0; + s->v.depth = 0; + s->v.skip[0] = 0; + s->v.skip[1] = 0; + s->v.skip[2] = 0; + } else { + s->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&s->k, 0); + } err: bch2_trans_iter_exit(trans, &tree_iter); bch2_trans_iter_exit(trans, &p_iter); @@ -1202,7 +1296,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, goto err; if (!k.k || !k.k->p.offset) { - ret = -BCH_ERR_ENOSPC_snapshot_create; + ret = bch_err_throw(c, ENOSPC_snapshot_create); goto err; } @@ -1336,18 +1430,10 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, * that key to snapshot leaf nodes, where we can mutate it */ -struct snapshot_interior_delete { - u32 id; - u32 live_child; -}; -typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; - static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id) { - darray_for_each(*l, i) - if (i->id == id) - return i->live_child; - return 0; + struct snapshot_interior_delete *i = darray_find_p(*l, i, i->id == id); + return i ? 
i->live_child : 0; } static unsigned __live_child(struct snapshot_table *t, u32 id, @@ -1375,28 +1461,32 @@ static unsigned __live_child(struct snapshot_table *t, u32 id, return 0; } -static unsigned live_child(struct bch_fs *c, u32 id, - snapshot_id_list *delete_leaves, - interior_delete_list *delete_interior) +static unsigned live_child(struct bch_fs *c, u32 id) { - rcu_read_lock(); - u32 ret = __live_child(rcu_dereference(c->snapshots), id, - delete_leaves, delete_interior); - rcu_read_unlock(); - return ret; + struct snapshot_delete *d = &c->snapshot_delete; + + guard(rcu)(); + return __live_child(rcu_dereference(c->snapshots), id, + &d->delete_leaves, &d->delete_interior); +} + +static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id) +{ + return snapshot_list_has_id(&d->delete_leaves, id) || + interior_delete_has_id(&d->delete_interior, id) != 0; } static int delete_dead_snapshots_process_key(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, - snapshot_id_list *delete_leaves, - interior_delete_list *delete_interior) + struct bkey_s_c k) { - if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot)) + struct snapshot_delete *d = &trans->c->snapshot_delete; + + if (snapshot_list_has_id(&d->delete_leaves, k.k->p.snapshot)) return bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); - u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot); + u32 live_child = interior_delete_has_id(&d->delete_interior, k.k->p.snapshot); if (live_child) { struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); int ret = PTR_ERR_OR_ZERO(new); @@ -1427,55 +1517,214 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans, return 0; } +static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter, u64 *prev_inum) +{ + struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; + + u64 inum = iter->btree_id != BTREE_ID_inodes + ? 
iter->pos.inode + : iter->pos.offset; + + if (*prev_inum == inum) + return false; + + *prev_inum = inum; + + bool ret = !snapshot_list_has_id(&d->deleting_from_trees, + bch2_snapshot_tree(c, iter->pos.snapshot)); + if (unlikely(ret)) { + struct bpos pos = iter->pos; + pos.snapshot = 0; + if (iter->btree_id != BTREE_ID_inodes) + pos.offset = U64_MAX; + bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(pos)); + } + + return ret; +} + +static int delete_dead_snapshot_keys_v1(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; + + for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) { + struct disk_reservation res = { 0 }; + u64 prev_inum = 0; + + d->pos.pos = POS_MIN; + + if (!btree_type_has_snapshots(d->pos.btree)) + continue; + + int ret = for_each_btree_key_commit(trans, iter, + d->pos.btree, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + d->pos.pos = iter.pos; + + if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) + continue; + + delete_dead_snapshots_process_key(trans, &iter, k); + })); + + bch2_disk_reservation_put(c, &res); + + if (ret) + return ret; + } + + return 0; +} + +static int delete_dead_snapshot_keys_range(struct btree_trans *trans, enum btree_id btree, + struct bpos start, struct bpos end) +{ + struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; + struct disk_reservation res = { 0 }; + + d->pos.btree = btree; + d->pos.pos = POS_MIN; + + int ret = for_each_btree_key_max_commit(trans, iter, + btree, start, end, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + d->pos.pos = iter.pos; + delete_dead_snapshots_process_key(trans, &iter, k); + })); + + bch2_disk_reservation_put(c, &res); + return ret; +} + +static int delete_dead_snapshot_keys_v2(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; + struct disk_reservation res = { 0 }; + u64 prev_inum = 0; + int ret = 0; + + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); + + while (1) { + struct bkey_s_c k; + ret = lockrestart_do(trans, + bkey_err(k = bch2_btree_iter_peek(trans, &iter))); + if (ret) + break; + + if (!k.k) + break; + + d->pos.btree = iter.btree_id; + d->pos.pos = iter.pos; + + if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) + continue; + + if (snapshot_id_dying(d, k.k->p.snapshot)) { + struct bpos start = POS(k.k->p.offset, 0); + struct bpos end = POS(k.k->p.offset, U64_MAX); + + ret = delete_dead_snapshot_keys_range(trans, BTREE_ID_extents, start, end) ?: + delete_dead_snapshot_keys_range(trans, BTREE_ID_dirents, start, end) ?: + delete_dead_snapshot_keys_range(trans, BTREE_ID_xattrs, start, end); + if (ret) + break; + + bch2_btree_iter_set_pos(trans, &iter, POS(0, k.k->p.offset + 1)); + } else { + bch2_btree_iter_advance(trans, &iter); + } + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) + goto err; + + prev_inum = 0; + ret = for_each_btree_key_commit(trans, iter, + BTREE_ID_inodes, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + d->pos.btree = iter.btree_id; + d->pos.pos = iter.pos; + + if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) + continue; + + delete_dead_snapshots_process_key(trans, &iter, k); + })); +err: + bch2_disk_reservation_put(c, &res); + 
return ret; +} + /* * For a given snapshot, if it doesn't have a subvolume that points to it, and * it doesn't have child snapshot nodes - it's now redundant and we can mark it * as deleted. */ -static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k, - snapshot_id_list *delete_leaves, - interior_delete_list *delete_interior) +static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k) { if (k.k->type != KEY_TYPE_snapshot) return 0; struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); unsigned live_children = 0; + int ret = 0; if (BCH_SNAPSHOT_SUBVOL(s.v)) return 0; + if (BCH_SNAPSHOT_DELETED(s.v)) + return 0; + + mutex_lock(&d->progress_lock); for (unsigned i = 0; i < 2; i++) { u32 child = le32_to_cpu(s.v->children[i]); live_children += child && - !snapshot_list_has_id(delete_leaves, child); + !snapshot_list_has_id(&d->delete_leaves, child); } + u32 tree = bch2_snapshot_tree(c, s.k->p.offset); + if (live_children == 0) { - return snapshot_list_add(c, delete_leaves, s.k->p.offset); + ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?: + snapshot_list_add(c, &d->delete_leaves, s.k->p.offset); } else if (live_children == 1) { - struct snapshot_interior_delete d = { + struct snapshot_interior_delete n = { .id = s.k->p.offset, - .live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior), + .live_child = live_child(c, s.k->p.offset), }; - if (!d.live_child) { - bch_err(c, "error finding live child of snapshot %u", d.id); - return -EINVAL; + if (!n.live_child) { + bch_err(c, "error finding live child of snapshot %u", n.id); + ret = -EINVAL; + } else { + ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?: + darray_push(&d->delete_interior, n); } - - return darray_push(delete_interior, d); - } else { - return 0; } + mutex_unlock(&d->progress_lock); + + return ret; } static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, interior_delete_list *skip) { - rcu_read_lock(); + guard(rcu)(); while (interior_delete_has_id(skip, id)) id = __bch2_snapshot_parent(c, id); @@ -1484,7 +1733,6 @@ static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, id = __bch2_snapshot_parent(c, id); } while (interior_delete_has_id(skip, id)); } - rcu_read_unlock(); return id; } @@ -1498,6 +1746,9 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, struct bkey_i_snapshot *s; int ret; + if (!bch2_snapshot_exists(c, k.k->p.offset)) + return 0; + if (k.k->type != KEY_TYPE_snapshot) return 0; @@ -1545,39 +1796,56 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, return bch2_trans_update(trans, iter, &s->k_i, 0); } -int bch2_delete_dead_snapshots(struct bch_fs *c) +static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snapshot_delete *d) { - if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) + prt_printf(out, "deleting from trees"); + darray_for_each(d->deleting_from_trees, i) + prt_printf(out, " %u", *i); + + prt_printf(out, "deleting leaves"); + darray_for_each(d->delete_leaves, i) + prt_printf(out, " %u", *i); + prt_newline(out); + + prt_printf(out, "interior"); + darray_for_each(d->delete_interior, i) + prt_printf(out, " %u->%u", i->id, i->live_child); + prt_newline(out); +} + +int __bch2_delete_dead_snapshots(struct bch_fs *c) +{ + struct snapshot_delete *d = &c->snapshot_delete; + int ret = 0; + + if 
(!mutex_trylock(&d->lock)) return 0; + if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) + goto out_unlock; + struct btree_trans *trans = bch2_trans_get(c); - snapshot_id_list delete_leaves = {}; - interior_delete_list delete_interior = {}; - int ret = 0; /* * For every snapshot node: If we have no live children and it's not * pointed to by a subvolume, delete it: */ + d->running = true; + d->pos = BBPOS_MIN; + ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, - check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior)); + check_should_delete_snapshot(trans, k)); if (!bch2_err_matches(ret, EROFS)) bch_err_msg(c, ret, "walking snapshots"); if (ret) goto err; - if (!delete_leaves.nr && !delete_interior.nr) + if (!d->delete_leaves.nr && !d->delete_interior.nr) goto err; { struct printbuf buf = PRINTBUF; - prt_printf(&buf, "deleting leaves"); - darray_for_each(delete_leaves, i) - prt_printf(&buf, " %u", *i); - - prt_printf(&buf, " interior"); - darray_for_each(delete_interior, i) - prt_printf(&buf, " %u->%u", i->id, i->live_child); + bch2_snapshot_delete_nodes_to_text(&buf, d); ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf)); printbuf_exit(&buf); @@ -1585,29 +1853,15 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) goto err; } - for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { - struct disk_reservation res = { 0 }; - - if (!btree_type_has_snapshots(btree)) - continue; - - ret = for_each_btree_key_commit(trans, iter, - btree, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - &res, NULL, BCH_TRANS_COMMIT_no_enospc, - delete_dead_snapshots_process_key(trans, &iter, k, - &delete_leaves, - &delete_interior)); - - bch2_disk_reservation_put(c, &res); - - if (!bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "deleting keys from dying snapshots"); - if (ret) - goto err; - } + ret = !bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2) + ? 
delete_dead_snapshot_keys_v2(trans) + : delete_dead_snapshot_keys_v1(trans); + if (!bch2_err_matches(ret, EROFS)) + bch_err_msg(c, ret, "deleting keys from dying snapshots"); + if (ret) + goto err; - darray_for_each(delete_leaves, i) { + darray_for_each(d->delete_leaves, i) { ret = commit_do(trans, NULL, NULL, 0, bch2_snapshot_node_delete(trans, *i)); if (!bch2_err_matches(ret, EROFS)) @@ -1624,11 +1878,11 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior)); + bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &d->delete_interior)); if (ret) goto err; - darray_for_each(delete_interior, i) { + darray_for_each(d->delete_interior, i) { ret = commit_do(trans, NULL, NULL, 0, bch2_snapshot_node_delete(trans, i->id)); if (!bch2_err_matches(ret, EROFS)) @@ -1637,33 +1891,68 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) goto err; } err: - darray_exit(&delete_interior); - darray_exit(&delete_leaves); + mutex_lock(&d->progress_lock); + darray_exit(&d->deleting_from_trees); + darray_exit(&d->delete_interior); + darray_exit(&d->delete_leaves); + d->running = false; + mutex_unlock(&d->progress_lock); bch2_trans_put(trans); + + bch2_recovery_pass_set_no_ratelimit(c, BCH_RECOVERY_PASS_check_snapshots); +out_unlock: + mutex_unlock(&d->lock); if (!bch2_err_matches(ret, EROFS)) bch_err_fn(c, ret); return ret; } +int bch2_delete_dead_snapshots(struct bch_fs *c) +{ + if (!c->opts.auto_snapshot_deletion) + return 0; + + return __bch2_delete_dead_snapshots(c); +} + void bch2_delete_dead_snapshots_work(struct work_struct *work) { - struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete.work); set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name); bch2_delete_dead_snapshots(c); - bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots); } void bch2_delete_dead_snapshots_async(struct bch_fs *c) { - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots)) + if (!c->opts.auto_snapshot_deletion) + return; + + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_delete_dead_snapshots)) return; BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); - if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); + if (!queue_work(system_long_wq, &c->snapshot_delete.work)) + enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots); +} + +void bch2_snapshot_delete_status_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct snapshot_delete *d = &c->snapshot_delete; + + if (!d->running) { + prt_str(out, "(not running)"); + return; + } + + mutex_lock(&d->progress_lock); + bch2_snapshot_delete_nodes_to_text(out, d); + + bch2_bbpos_to_text(out, d->pos); + mutex_unlock(&d->progress_lock); } int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, @@ -1704,7 +1993,7 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct return 0; struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_DELETED(snap.v) || + if (BCH_SNAPSHOT_WILL_DELETE(snap.v) || interior_snapshot_needs_delete(snap)) set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags); @@ -1733,10 +2022,6 @@ int bch2_snapshots_read(struct bch_fs *c) 
BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) && test_bit(BCH_FS_may_go_rw, &c->flags)); - if (bch2_err_matches(ret, EIO) || - (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))) - ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots); - return ret; } @@ -1744,3 +2029,11 @@ void bch2_fs_snapshots_exit(struct bch_fs *c) { kvfree(rcu_dereference_protected(c->snapshots, true)); } + +void bch2_fs_snapshots_init_early(struct bch_fs *c) +{ + INIT_WORK(&c->snapshot_delete.work, bch2_delete_dead_snapshots_work); + mutex_init(&c->snapshot_delete.lock); + mutex_init(&c->snapshot_delete.progress_lock); + mutex_init(&c->snapshots_unlinked_lock); +} diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 81180181d7c9..6766bf673ed9 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -46,12 +46,9 @@ static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) { - rcu_read_lock(); + guard(rcu)(); const struct snapshot_t *s = snapshot_t(c, id); - id = s ? s->tree : 0; - rcu_read_unlock(); - - return id; + return s ? s->tree : 0; } static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) @@ -62,11 +59,8 @@ static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) { - rcu_read_lock(); - id = __bch2_snapshot_parent_early(c, id); - rcu_read_unlock(); - - return id; + guard(rcu)(); + return __bch2_snapshot_parent_early(c, id); } static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) @@ -88,61 +82,53 @@ static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) { - rcu_read_lock(); - id = __bch2_snapshot_parent(c, id); - rcu_read_unlock(); - - return id; + guard(rcu)(); + return __bch2_snapshot_parent(c, id); } static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) { - rcu_read_lock(); + guard(rcu)(); while (n--) id = __bch2_snapshot_parent(c, id); - rcu_read_unlock(); - return id; } -u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *, u32); +u32 bch2_snapshot_oldest_subvol(struct bch_fs *, u32, snapshot_id_list *); u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32); static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) { - u32 parent; + guard(rcu)(); - rcu_read_lock(); + u32 parent; while ((parent = __bch2_snapshot_parent(c, id))) id = parent; - rcu_read_unlock(); - return id; } -static inline bool __bch2_snapshot_exists(struct bch_fs *c, u32 id) +static inline enum snapshot_id_state __bch2_snapshot_id_state(struct bch_fs *c, u32 id) { const struct snapshot_t *s = snapshot_t(c, id); - return s ? s->live : 0; + return s ? s->state : SNAPSHOT_ID_empty; } -static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) +static inline enum snapshot_id_state bch2_snapshot_id_state(struct bch_fs *c, u32 id) { - rcu_read_lock(); - bool ret = __bch2_snapshot_exists(c, id); - rcu_read_unlock(); + guard(rcu)(); + return __bch2_snapshot_id_state(c, id); +} - return ret; +static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) +{ + return bch2_snapshot_id_state(c, id) == SNAPSHOT_ID_live; } static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) { - rcu_read_lock(); + guard(rcu)(); const struct snapshot_t *s = snapshot_t(c, id); - int ret = s ? 
s->children[0] : -BCH_ERR_invalid_snapshot_node; - rcu_read_unlock(); - - return ret; + return s ? s->children[0] : -BCH_ERR_invalid_snapshot_node; } static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) @@ -155,13 +141,8 @@ static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) { - u32 depth; - - rcu_read_lock(); - depth = parent ? snapshot_t(c, parent)->depth + 1 : 0; - rcu_read_unlock(); - - return depth; + guard(rcu)(); + return parent ? snapshot_t(c, parent)->depth + 1 : 0; } bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); @@ -175,20 +156,14 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) { - rcu_read_lock(); + guard(rcu)(); const struct snapshot_t *t = snapshot_t(c, id); - bool ret = t && (t->children[0]|t->children[1]) != 0; - rcu_read_unlock(); - - return ret; + return t && (t->children[0]|t->children[1]) != 0; } static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) { - darray_for_each(*s, i) - if (*i == id) - return true; - return false; + return darray_find(*s, id) != NULL; } static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) @@ -241,10 +216,38 @@ int bch2_snapshot_node_create(struct btree_trans *, u32, int bch2_check_snapshot_trees(struct bch_fs *); int bch2_check_snapshots(struct bch_fs *); int bch2_reconstruct_snapshots(struct bch_fs *); -int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); + +int __bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); + +static inline int bch2_check_key_has_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + return likely(bch2_snapshot_exists(trans->c, k.k->p.snapshot)) + ? 0 + : __bch2_check_key_has_snapshot(trans, iter, k); +} + +int __bch2_get_snapshot_overwrites(struct btree_trans *, + enum btree_id, struct bpos, + snapshot_id_list *); + +/* + * Get a list of snapshot IDs that have overwritten a given key: + */ +static inline int bch2_get_snapshot_overwrites(struct btree_trans *trans, + enum btree_id btree, struct bpos pos, + snapshot_id_list *s) +{ + darray_init(s); + + return bch2_snapshot_has_children(trans->c, pos.snapshot) + ? 
__bch2_get_snapshot_overwrites(trans, btree, pos, s) + : 0; + +} int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); -void bch2_delete_dead_snapshots_work(struct work_struct *); int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos); @@ -259,7 +262,14 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, return __bch2_key_has_snapshot_overwrites(trans, id, pos); } +int __bch2_delete_dead_snapshots(struct bch_fs *); +int bch2_delete_dead_snapshots(struct bch_fs *); +void bch2_delete_dead_snapshots_work(struct work_struct *); +void bch2_delete_dead_snapshots_async(struct bch_fs *); +void bch2_snapshot_delete_status_to_text(struct printbuf *, struct bch_fs *); + int bch2_snapshots_read(struct bch_fs *); void bch2_fs_snapshots_exit(struct bch_fs *); +void bch2_fs_snapshots_init_early(struct bch_fs *); #endif /* _BCACHEFS_SNAPSHOT_H */ diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h index aabcd3a74cd9..9bccae1f3590 100644 --- a/fs/bcachefs/snapshot_format.h +++ b/fs/bcachefs/snapshot_format.h @@ -15,10 +15,10 @@ struct bch_snapshot { bch_le128 btime; }; -LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) - +LE32_BITMASK(BCH_SNAPSHOT_WILL_DELETE, struct bch_snapshot, flags, 0, 1) /* True if a subvolume points to this snapshot node: */ LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) +LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 2, 3) /* * Snapshot trees: diff --git a/fs/bcachefs/snapshot_types.h b/fs/bcachefs/snapshot_types.h new file mode 100644 index 000000000000..0ab698f13e5c --- /dev/null +++ b/fs/bcachefs/snapshot_types.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SNAPSHOT_TYPES_H +#define _BCACHEFS_SNAPSHOT_TYPES_H + +#include "bbpos_types.h" +#include "darray.h" +#include "subvolume_types.h" + +typedef DARRAY(u32) snapshot_id_list; + +#define IS_ANCESTOR_BITMAP 128 + +struct snapshot_t { + enum snapshot_id_state { + SNAPSHOT_ID_empty, + SNAPSHOT_ID_live, + SNAPSHOT_ID_deleted, + } state; + u32 parent; + u32 skip[3]; + u32 depth; + u32 children[2]; + u32 subvol; /* Nonzero only if a subvolume points to this node: */ + u32 tree; + unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; +}; + +struct snapshot_table { + struct rcu_head rcu; + size_t nr; +#ifndef RUST_BINDGEN + DECLARE_FLEX_ARRAY(struct snapshot_t, s); +#else + struct snapshot_t s[0]; +#endif +}; + +struct snapshot_interior_delete { + u32 id; + u32 live_child; +}; +typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; + +struct snapshot_delete { + struct mutex lock; + struct work_struct work; + + struct mutex progress_lock; + snapshot_id_list deleting_from_trees; + snapshot_id_list delete_leaves; + interior_delete_list delete_interior; + + bool running; + struct bbpos pos; +}; + +#endif /* _BCACHEFS_SNAPSHOT_TYPES_H */ diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index a90bf7b8a2b4..71b735a85026 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -31,14 +31,15 @@ static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dir } } -static noinline int fsck_rename_dirent(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c_dirent old) +static int bch2_fsck_rename_dirent(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info 
*hash_info, + struct bkey_s_c_dirent old, + bool *updated_before_k_pos) { struct qstr old_name = bch2_dirent_get_name(old); - struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); + struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64)); int ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; @@ -47,28 +48,39 @@ static noinline int fsck_rename_dirent(struct btree_trans *trans, dirent_copy_target(new, old); new->k.p = old.k->p; + char *renamed_buf = bch2_trans_kmalloc(trans, old_name.len + 20); + ret = PTR_ERR_OR_ZERO(renamed_buf); + if (ret) + return ret; + for (unsigned i = 0; i < 1000; i++) { - unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", - old_name.len, old_name.name, i); - unsigned u64s = BKEY_U64s + dirent_val_u64s(len, 0); + new->k.u64s = BKEY_U64s_MAX; - if (u64s > U8_MAX) - return -EINVAL; + struct qstr renamed_name = (struct qstr) QSTR_INIT(renamed_buf, + sprintf(renamed_buf, "%.*s.fsck_renamed-%u", + old_name.len, old_name.name, i)); - new->k.u64s = u64s; + ret = bch2_dirent_init_name(new, hash_info, &renamed_name, NULL); + if (ret) + return ret; ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, (subvol_inum) { 0, old.k->p.inode }, old.k->p.snapshot, &new->k_i, - BTREE_UPDATE_internal_snapshot_node); - if (!bch2_err_matches(ret, EEXIST)) + BTREE_UPDATE_internal_snapshot_node| + STR_HASH_must_create); + if (ret && !bch2_err_matches(ret, EEXIST)) + break; + if (!ret) { + if (bpos_lt(new->k.p, old.k->p)) + *updated_before_k_pos = true; break; + } } - if (ret) - return ret; - - return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); + ret = ret ?: bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); + bch_err_fn(trans->c, ret); + return ret; } static noinline int hash_pick_winner(struct btree_trans *trans, @@ -101,17 +113,25 @@ static noinline int hash_pick_winner(struct btree_trans *trans, } } -static int repair_inode_hash_info(struct btree_trans *trans, - struct bch_inode_unpacked *snapshot_root) +/* + * str_hash lookups across snapshots break in wild ways if hash_info in + * different snapshot versions doesn't match - so if we find one mismatch, check + * them all + */ +int bch2_repair_inode_hash_info(struct btree_trans *trans, + struct bch_inode_unpacked *snapshot_root) { + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + bool need_commit = false; int ret = 0; - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, - SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot - 1), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != snapshot_root->bi_inum) + for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, + POS(0, snapshot_root->bi_inum), + BTREE_ITER_all_snapshots, k, ret) { + if (bpos_ge(k.k->p, SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot))) break; if (!bkey_is_inode(k.k)) continue; @@ -121,19 +141,72 @@ static int repair_inode_hash_info(struct btree_trans *trans, if (ret) break; - if (fsck_err_on(inode.bi_hash_seed != snapshot_root->bi_hash_seed || - INODE_STR_HASH(&inode) != INODE_STR_HASH(snapshot_root), - trans, inode_snapshot_mismatch, - "inode hash info in different snapshots don't match")) { + if (inode.bi_hash_seed == snapshot_root->bi_hash_seed && + INODE_STR_HASH(&inode) == INODE_STR_HASH(snapshot_root)) { +#ifdef CONFIG_BCACHEFS_DEBUG + struct bch_hash_info hash1 = bch2_hash_info_init(c, snapshot_root); + struct bch_hash_info hash2 = 
bch2_hash_info_init(c, &inode); + + BUG_ON(hash1.type != hash2.type || + memcmp(&hash1.siphash_key, + &hash2.siphash_key, + sizeof(hash1.siphash_key))); +#endif + continue; + } + + printbuf_reset(&buf); + prt_printf(&buf, "inode %llu hash info in snapshots %u %u don't match\n", + snapshot_root->bi_inum, + inode.bi_snapshot, + snapshot_root->bi_snapshot); + + bch2_prt_str_hash_type(&buf, INODE_STR_HASH(&inode)); + prt_printf(&buf, " %llx\n", inode.bi_hash_seed); + + bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root)); + prt_printf(&buf, " %llx", snapshot_root->bi_hash_seed); + + if (fsck_err(trans, inode_snapshot_mismatch, "%s", buf.buf)) { inode.bi_hash_seed = snapshot_root->bi_hash_seed; SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root)); - ret = __bch2_fsck_write_inode(trans, &inode) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; - break; + + ret = __bch2_fsck_write_inode(trans, &inode); + if (ret) + break; + need_commit = true; } } + + if (ret) + goto err; + + if (!need_commit) { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "inode %llu hash info mismatch with root, but mismatch not found\n", + snapshot_root->bi_inum); + + prt_printf(&buf, "root snapshot %u ", snapshot_root->bi_snapshot); + bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root)); + prt_printf(&buf, " %llx\n", snapshot_root->bi_hash_seed); +#if 0 + prt_printf(&buf, "vs snapshot %u ", hash_info->inum_snapshot); + bch2_prt_str_hash_type(&buf, hash_info->type); + prt_printf(&buf, " %llx %llx", hash_info->siphash_key.k0, hash_info->siphash_key.k1); +#endif + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + ret = bch_err_throw(c, fsck_repair_unimplemented); + goto err; + } + + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; +err: fsck_err: + printbuf_exit(&buf); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -145,46 +218,121 @@ fsck_err: static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, struct bch_hash_info *hash_info) { + struct bch_inode_unpacked snapshot_root; + int ret = bch2_inode_find_snapshot_root(trans, inum, &snapshot_root); + if (ret) + return ret; + + struct bch_hash_info hash_root = bch2_hash_info_init(trans->c, &snapshot_root); + if (hash_info->type != hash_root.type || + memcmp(&hash_info->siphash_key, + &hash_root.siphash_key, + sizeof(hash_root.siphash_key))) + ret = bch2_repair_inode_hash_info(trans, &snapshot_root); + + return ret; +} + +/* Put a str_hash key in its proper location, checking for duplicates */ +int bch2_str_hash_repair_key(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc *desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c k, + struct btree_iter *dup_iter, struct bkey_s_c dup_k, + bool *updated_before_k_pos) +{ struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + bool free_snapshots_seen = false; int ret = 0; - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, U32_MAX), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) - break; - if (bkey_is_inode(k.k)) - goto found; + if (!s) { + s = bch2_trans_kmalloc(trans, sizeof(*s)); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + goto out; + + s->pos = k_iter->pos; + darray_init(&s->ids); + + ret = bch2_get_snapshot_overwrites(trans, 
desc->btree_id, k_iter->pos, &s->ids); + if (ret) + goto out; + + free_snapshots_seen = true; } - bch_err(c, "%s(): inum %llu not found", __func__, inum); - ret = -BCH_ERR_fsck_repair_unimplemented; - goto err; -found:; - struct bch_inode_unpacked inode; - ret = bch2_inode_unpack(k, &inode); - if (ret) - goto err; - struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); - if (hash_info->type != hash2.type || - memcmp(&hash_info->siphash_key, &hash2.siphash_key, sizeof(hash2.siphash_key))) { - ret = repair_inode_hash_info(trans, &inode); - if (!ret) { - bch_err(c, "inode hash info mismatch with root, but mismatch not found\n" - "%u %llx %llx\n" - "%u %llx %llx", - hash_info->type, - hash_info->siphash_key.k0, - hash_info->siphash_key.k1, - hash2.type, - hash2.siphash_key.k0, - hash2.siphash_key.k1); - ret = -BCH_ERR_fsck_repair_unimplemented; + if (!dup_k.k) { + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto out; + + dup_k = bch2_hash_set_or_get_in_snapshot(trans, dup_iter, *desc, hash_info, + (subvol_inum) { 0, new->k.p.inode }, + new->k.p.snapshot, new, + STR_HASH_must_create| + BTREE_ITER_with_updates| + BTREE_UPDATE_internal_snapshot_node); + ret = bkey_err(dup_k); + if (ret) + goto out; + if (dup_k.k) + goto duplicate_entries; + + if (bpos_lt(new->k.p, k.k->p)) + *updated_before_k_pos = true; + + ret = bch2_insert_snapshot_whiteouts(trans, desc->btree_id, + k_iter->pos, new->k.p) ?: + bch2_hash_delete_at(trans, *desc, hash_info, k_iter, + BTREE_ITER_with_updates| + BTREE_UPDATE_internal_snapshot_node) ?: + bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_commit; + } else { +duplicate_entries: + ret = hash_pick_winner(trans, *desc, hash_info, k, dup_k); + if (ret < 0) + goto out; + + if (!fsck_err(trans, hash_table_key_duplicate, + "duplicate hash table keys%s:\n%s", + ret != 2 ? 
"" : ", both point to valid inodes", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), + prt_newline(&buf), + bch2_bkey_val_to_text(&buf, c, dup_k), + buf.buf))) + goto out; + + switch (ret) { + case 0: + ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); + break; + case 1: + ret = bch2_hash_delete_at(trans, *desc, hash_info, dup_iter, 0); + break; + case 2: + ret = bch2_fsck_rename_dirent(trans, s, *desc, hash_info, + bkey_s_c_to_dirent(k), + updated_before_k_pos) ?: + bch2_hash_delete_at(trans, *desc, hash_info, k_iter, + BTREE_ITER_with_updates); + goto out; } + + ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: + -BCH_ERR_transaction_restart_commit; } -err: - bch2_trans_iter_exit(trans, &iter); +out: +fsck_err: + bch2_trans_iter_exit(trans, dup_iter); + printbuf_exit(&buf); + if (free_snapshots_seen) + darray_exit(&s->ids); return ret; } @@ -192,7 +340,8 @@ int __bch2_str_hash_check_key(struct btree_trans *trans, struct snapshots_seen *s, const struct bch_hash_desc *desc, struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c hash_k) + struct btree_iter *k_iter, struct bkey_s_c hash_k, + bool *updated_before_k_pos) { struct bch_fs *c = trans->c; struct btree_iter iter = {}; @@ -206,24 +355,31 @@ int __bch2_str_hash_check_key(struct btree_trans *trans, for_each_btree_key_norestart(trans, iter, desc->btree_id, SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), - BTREE_ITER_slots, k, ret) { + BTREE_ITER_slots| + BTREE_ITER_with_updates, k, ret) { if (bkey_eq(k.k->p, hash_k.k->p)) break; if (k.k->type == desc->key_type && - !desc->cmp_bkey(k, hash_k)) - goto duplicate_entries; + !desc->cmp_bkey(k, hash_k)) { + ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, + hash_info) ?: + bch2_str_hash_repair_key(trans, s, desc, hash_info, + k_iter, hash_k, + &iter, k, updated_before_k_pos); + break; + } - if (bkey_deleted(k.k)) { - bch2_trans_iter_exit(trans, &iter); + if (bkey_deleted(k.k)) goto bad_hash; - } } -out: bch2_trans_iter_exit(trans, &iter); +out: +fsck_err: printbuf_exit(&buf); return ret; bad_hash: + bch2_trans_iter_exit(trans, &iter); /* * Before doing any repair, check hash_info itself: */ @@ -232,64 +388,12 @@ bad_hash: goto out; if (fsck_err(trans, hash_table_key_wrong_offset, - "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", - bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); - if (IS_ERR(new)) - return PTR_ERR(new); - - k = bch2_hash_set_or_get_in_snapshot(trans, &iter, *desc, hash_info, - (subvol_inum) { 0, hash_k.k->p.inode }, - hash_k.k->p.snapshot, new, - STR_HASH_must_create| - BTREE_ITER_with_updates| - BTREE_UPDATE_internal_snapshot_node); - ret = bkey_err(k); - if (ret) - goto out; - if (k.k) - goto duplicate_entries; - - ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; - goto out; - } -fsck_err: - goto out; -duplicate_entries: - ret = hash_pick_winner(trans, *desc, hash_info, hash_k, k); - if (ret < 0) - goto out; - - if (!fsck_err(trans, hash_table_key_duplicate, - "duplicate hash table keys%s:\n%s", - ret != 2 ? 
"" : ", both point to valid inodes", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, hash_k), - prt_newline(&buf), - bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) - goto out; - - switch (ret) { - case 0: - ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); - break; - case 1: - ret = bch2_hash_delete_at(trans, *desc, hash_info, &iter, 0); - break; - case 2: - ret = fsck_rename_dirent(trans, s, *desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: - bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); - goto out; - } - - ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: - -BCH_ERR_transaction_restart_nested; + "hash table key at wrong offset: should be at %llu\n%s", + hash, + (bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) + ret = bch2_str_hash_repair_key(trans, s, desc, hash_info, + k_iter, hash_k, + &iter, bkey_s_c_null, + updated_before_k_pos); goto out; } diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 0c1a00539bd1..79d51aef70aa 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -32,6 +32,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) } struct bch_hash_info { + u32 inum_snapshot; u8 type; struct unicode_map *cf_encoding; /* @@ -45,11 +46,12 @@ static inline struct bch_hash_info bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) { struct bch_hash_info info = { - .type = INODE_STR_HASH(bi), + .inum_snapshot = bi->bi_snapshot, + .type = INODE_STR_HASH(bi), #ifdef CONFIG_UNICODE - .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL, + .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL, #endif - .siphash_key = { .k0 = bi->bi_hash_seed } + .siphash_key = { .k0 = bi->bi_hash_seed } }; if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { @@ -259,6 +261,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, struct bkey_i *insert, enum btree_iter_update_trigger_flags flags) { + struct bch_fs *c = trans->c; struct btree_iter slot = {}; struct bkey_s_c k; bool found = false; @@ -286,7 +289,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, } if (!ret) - ret = -BCH_ERR_ENOSPC_str_hash_create; + ret = bch_err_throw(c, ENOSPC_str_hash_create); out: bch2_trans_iter_exit(trans, &slot); bch2_trans_iter_exit(trans, iter); @@ -298,7 +301,7 @@ not_found: bch2_trans_iter_exit(trans, &slot); return k; } else if (!found && (flags & STR_HASH_must_replace)) { - ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; + ret = bch_err_throw(c, ENOENT_str_hash_set_must_replace); } else { if (!found && slot.path) swap(*iter, slot); @@ -326,7 +329,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans, return ret; if (k.k) { bch2_trans_iter_exit(trans, &iter); - return -BCH_ERR_EEXIST_str_hash_set; + return bch_err_throw(trans->c, EEXIST_str_hash_set); } return 0; @@ -392,18 +395,30 @@ int bch2_hash_delete(struct btree_trans *trans, return ret; } +int bch2_repair_inode_hash_info(struct btree_trans *, struct bch_inode_unpacked *); + struct snapshots_seen; +int bch2_str_hash_repair_key(struct btree_trans *, + struct snapshots_seen *, + const struct bch_hash_desc *, + struct bch_hash_info *, + struct btree_iter *, struct bkey_s_c, + struct btree_iter *, struct bkey_s_c, + bool *); + int __bch2_str_hash_check_key(struct btree_trans *, struct snapshots_seen *, const struct bch_hash_desc *, struct bch_hash_info *, - struct btree_iter *, struct bkey_s_c); + struct btree_iter *, struct bkey_s_c, + bool *); static inline int 
bch2_str_hash_check_key(struct btree_trans *trans, struct snapshots_seen *s, const struct bch_hash_desc *desc, struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c hash_k) + struct btree_iter *k_iter, struct bkey_s_c hash_k, + bool *updated_before_k_pos) { if (hash_k.k->type != desc->key_type) return 0; @@ -411,7 +426,8 @@ static inline int bch2_str_hash_check_key(struct btree_trans *trans, if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset)) return 0; - return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k); + return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k, + updated_before_k_pos); } #endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index d0209f7658bb..020587449123 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "btree_key_cache.h" #include "btree_update.h" +#include "enumerated_ref.h" #include "errcode.h" #include "error.h" #include "fs.h" @@ -14,6 +15,22 @@ static int bch2_subvolume_delete(struct btree_trans *, u32); +static int bch2_subvolume_missing(struct bch_fs *c, u32 subvolid) +{ + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "missing subvolume %u", subvolid); + bool print = bch2_count_fsck_err(c, subvol_missing, &buf); + + int ret = bch2_run_explicit_recovery_pass(c, &buf, + BCH_RECOVERY_PASS_check_inodes, 0); + if (print) + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + return ret; +} + static struct bpos subvolume_children_pos(struct bkey_s_c k) { if (k.k->type != KEY_TYPE_subvolume) @@ -45,7 +62,7 @@ static int check_subvol(struct btree_trans *trans, ret = bch2_snapshot_lookup(trans, snapid, &snapshot); if (bch2_err_matches(ret, ENOENT)) - return bch2_run_explicit_recovery_pass(c, + return bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; if (ret) return ret; @@ -113,10 +130,20 @@ static int check_subvol(struct btree_trans *trans, "subvolume %llu points to missing subvolume root %llu:%u", k.k->p.offset, le64_to_cpu(subvol.v->inode), le32_to_cpu(subvol.v->snapshot))) { - ret = bch2_subvolume_delete(trans, iter->pos.offset); - bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); - ret = ret ?: -BCH_ERR_transaction_restart_nested; - goto err; + /* + * Recreate - any contents that are still disconnected + * will then get reattached under lost+found + */ + bch2_inode_init_early(c, &inode); + bch2_inode_init_late(c, &inode, bch2_current_time(c), + 0, 0, S_IFDIR|0700, 0, NULL); + inode.bi_inum = le64_to_cpu(subvol.v->inode); + inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot); + inode.bi_subvol = k.k->p.offset; + inode.bi_parent_subvol = le32_to_cpu(subvol.v->fs_path_parent); + ret = __bch2_fsck_write_inode(trans, &inode); + if (ret) + goto err; } } else { goto err; @@ -124,13 +151,9 @@ static int check_subvol(struct btree_trans *trans, if (!BCH_SUBVOLUME_SNAP(subvol.v)) { u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot)); - u32 snapshot_tree; - struct bch_snapshot_tree st; - - rcu_read_lock(); - snapshot_tree = snapshot_t(c, snapshot_root)->tree; - rcu_read_unlock(); + u32 snapshot_tree = bch2_snapshot_tree(c, snapshot_root); + struct bch_snapshot_tree st; ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, @@ -242,6 +265,13 @@ void bch2_subvolume_to_text(struct printbuf *out, struct 
bch_fs *c, prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent)); prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent)); } + + if (BCH_SUBVOLUME_RO(s.v)) + prt_printf(out, " ro"); + if (BCH_SUBVOLUME_SNAP(s.v)) + prt_printf(out, " snapshot"); + if (BCH_SUBVOLUME_UNLINKED(s.v)) + prt_printf(out, " unlinked"); } static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set) @@ -292,9 +322,8 @@ bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol), BTREE_ITER_cached| BTREE_ITER_with_updates, subvolume, s); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) && - inconsistent_if_not_found, - trans->c, "missing subvolume %u", subvol); + if (bch2_err_matches(ret, ENOENT) && inconsistent_if_not_found) + ret = bch2_subvolume_missing(trans->c, subvol) ?: ret; return ret; } @@ -344,8 +373,8 @@ int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, subvolume); ret = bkey_err(subvol); - bch2_fs_inconsistent_on(warn && bch2_err_matches(ret, ENOENT), trans->c, - "missing subvolume %u", subvolid); + if (bch2_err_matches(ret, ENOENT)) + ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret; if (likely(!ret)) *snapid = le32_to_cpu(subvol.v->snapshot); @@ -418,8 +447,8 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) BTREE_ITER_cached|BTREE_ITER_intent, subvolume); int ret = bkey_err(subvol); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, - "missing subvolume %u", subvolid); + if (bch2_err_matches(ret, ENOENT)) + ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret; if (ret) goto err; @@ -470,22 +499,23 @@ err: static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) { - return bch2_subvolumes_reparent(trans, subvolid) ?: + int ret = bch2_subvolumes_reparent(trans, subvolid) ?: commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, __bch2_subvolume_delete(trans, subvolid)); + + bch2_recovery_pass_set_no_ratelimit(trans->c, BCH_RECOVERY_PASS_check_subvols); + return ret; } static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, snapshot_wait_for_pagecache_and_delete_work); - snapshot_id_list s; - u32 *id; int ret = 0; while (!ret) { mutex_lock(&c->snapshots_unlinked_lock); - s = c->snapshots_unlinked; + snapshot_id_list s = c->snapshots_unlinked; darray_init(&c->snapshots_unlinked); mutex_unlock(&c->snapshots_unlinked_lock); @@ -494,7 +524,7 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor bch2_evict_subvolume_inodes(c, &s); - for (id = s.data; id < s.data + s.nr; id++) { + darray_for_each(s, id) { ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id)); bch_err_msg(c, ret, "deleting subvolume %u", *id); if (ret) @@ -504,7 +534,7 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor darray_exit(&s); } - bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache); } struct subvolume_unlink_hook { @@ -527,11 +557,11 @@ static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans if (ret) return ret; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache)) return -EROFS; if (!queue_work(c->write_ref_wq, 
&c->snapshot_wait_for_pagecache_and_delete_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache); return 0; } @@ -555,11 +585,10 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) BTREE_ID_subvolumes, POS(0, subvolid), BTREE_ITER_cached, subvolume); ret = PTR_ERR_OR_ZERO(n); - if (unlikely(ret)) { - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, - "missing subvolume %u", subvolid); + if (bch2_err_matches(ret, ENOENT)) + ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret; + if (unlikely(ret)) return ret; - } SET_BCH_SUBVOLUME_UNLINKED(&n->v, true); n->v.fs_path_parent = 0; @@ -584,7 +613,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, ret = bch2_bkey_get_empty_slot(trans, &dst_iter, BTREE_ID_subvolumes, POS(0, U32_MAX)); if (ret == -BCH_ERR_ENOSPC_btree_slot) - ret = -BCH_ERR_ENOSPC_subvolume_create; + ret = bch_err_throw(c, ENOSPC_subvolume_create); if (ret) return ret; @@ -598,11 +627,10 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, BTREE_ID_subvolumes, POS(0, src_subvolid), BTREE_ITER_cached, subvolume); ret = PTR_ERR_OR_ZERO(src_subvol); - if (unlikely(ret)) { - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "subvolume %u not found", src_subvolid); + if (bch2_err_matches(ret, ENOENT)) + ret = bch2_subvolume_missing(trans->c, src_subvolid) ?: ret; + if (unlikely(ret)) goto err; - } parent = le32_to_cpu(src_subvol->v.snapshot); } @@ -691,8 +719,9 @@ static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) return ret; if (!bkey_is_inode(k.k)) { - bch_err(trans->c, "root inode not found"); - ret = -BCH_ERR_ENOENT_inode; + struct bch_fs *c = trans->c; + bch_err(c, "root inode not found"); + ret = bch_err_throw(c, ENOENT_inode); goto err; } @@ -716,11 +745,8 @@ int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) return ret; } -int bch2_fs_subvolumes_init(struct bch_fs *c) +void bch2_fs_subvolumes_init_early(struct bch_fs *c) { - INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work, bch2_subvolume_wait_for_pagecache_and_delete); - mutex_init(&c->snapshots_unlinked_lock); - return 0; } diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index f640c1e3d639..075f55e25c70 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -77,15 +77,12 @@ bch2_btree_iter_peek_in_subvolume_max_type(struct btree_trans *trans, struct btr _end, _subvolid, _flags, _k, _do); \ }) -int bch2_delete_dead_snapshots(struct bch_fs *); -void bch2_delete_dead_snapshots_async(struct bch_fs *); - int bch2_subvolume_unlink(struct btree_trans *, u32); int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool); int bch2_initialize_subvolumes(struct bch_fs *); int bch2_fs_upgrade_for_subvolumes(struct bch_fs *); -int bch2_fs_subvolumes_init(struct bch_fs *); +void bch2_fs_subvolumes_init_early(struct bch_fs *); #endif /* _BCACHEFS_SUBVOLUME_H */ diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index 1549d6daf7af..9d634b906dcd 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -2,33 +2,6 @@ #ifndef _BCACHEFS_SUBVOLUME_TYPES_H #define _BCACHEFS_SUBVOLUME_TYPES_H -#include "darray.h" - -typedef DARRAY(u32) snapshot_id_list; - -#define IS_ANCESTOR_BITMAP 128 - -struct snapshot_t { - bool live; - u32 parent; - u32 skip[3]; - u32 depth; - u32 children[2]; 
- u32 subvol; /* Nonzero only if a subvolume points to this node: */ - u32 tree; - unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; -}; - -struct snapshot_table { - struct rcu_head rcu; - size_t nr; -#ifndef RUST_BINDGEN - DECLARE_FLEX_ARRAY(struct snapshot_t, s); -#else - struct snapshot_t s[0]; -#endif -}; - typedef struct { /* we can't have padding in this struct: */ u64 subvol; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index cb5d960aed92..6c2e1d647403 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -87,7 +87,8 @@ int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version v struct printbuf buf = PRINTBUF; prt_str(&buf, "requested incompat feature "); bch2_version_to_text(&buf, version); - prt_str(&buf, " currently not enabled"); + prt_str(&buf, " currently not enabled, allowed up to "); + bch2_version_to_text(&buf, version); prt_printf(&buf, "\n set version_upgrade=incompat to enable"); bch_notice(c, "%s", buf.buf); @@ -260,11 +261,11 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb, /* XXX: we're not checking that offline device have enough space */ - for_each_online_member(c, ca) { + for_each_online_member(c, ca, BCH_DEV_READ_REF_sb_field_resize) { struct bch_sb_handle *dev_sb = &ca->disk_sb; if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) { - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_sb_field_resize); return NULL; } } @@ -384,7 +385,6 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) int bch2_sb_validate(struct bch_sb *sb, u64 read_offset, enum bch_validate_flags flags, struct printbuf *out) { - struct bch_sb_field_members_v1 *mi; enum bch_opt_id opt_id; int ret; @@ -468,6 +468,9 @@ int bch2_sb_validate(struct bch_sb *sb, u64 read_offset, SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb)); } + if (sb->nr_devices > 1) + SET_BCH_SB_MULTI_DEVICE(sb, true); + if (!flags) { /* * Been seeing a bug where these are getting inexplicably @@ -536,14 +539,17 @@ int bch2_sb_validate(struct bch_sb *sb, u64 read_offset, } } + struct bch_sb_field *mi = + bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v2) ?: + bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v1); + /* members must be validated first: */ - mi = bch2_sb_field_get(sb, members_v1); if (!mi) { prt_printf(out, "Invalid superblock: member info area missing"); return -BCH_ERR_invalid_sb_members_missing; } - ret = bch2_sb_field_validate(sb, &mi->field, flags, out); + ret = bch2_sb_field_validate(sb, mi, flags, out); if (ret) return ret; @@ -612,11 +618,15 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.features = le64_to_cpu(src->features[0]); c->sb.compat = le64_to_cpu(src->compat[0]); + c->sb.multi_device = BCH_SB_MULTI_DEVICE(src); memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent)); struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext); if (ext) { + c->sb.recovery_passes_required = + bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent, sizeof(c->sb.errors_silent) * 8); c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data); @@ -961,7 +971,7 @@ static void write_super_endio(struct bio *bio) } closure_put(&ca->fs->sb_write); - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); } static void read_back_super(struct bch_fs *c, struct bch_dev *ca) @@ -979,7 +989,7 @@ 
static void read_back_super(struct bch_fs *c, struct bch_dev *ca) this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio)); - percpu_ref_get(&ca->io_ref[READ]); + enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); closure_bio_submit(bio, &c->sb_write); } @@ -1005,7 +1015,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], bio_sectors(bio)); - percpu_ref_get(&ca->io_ref[READ]); + enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); closure_bio_submit(bio, &c->sb_write); } @@ -1022,7 +1032,7 @@ int bch2_write_super(struct bch_fs *c) trace_and_count(c, write_super, c, _RET_IP_); - if (c->opts.very_degraded) + if (c->opts.degraded == BCH_DEGRADED_very) degraded_flags |= BCH_FORCE_IF_LOST; lockdep_assert_held(&c->sb_lock); @@ -1037,13 +1047,13 @@ int bch2_write_super(struct bch_fs *c) * For now, we expect to be able to call write_super() when we're not * yet RW: */ - for_each_online_member(c, ca) { + for_each_online_member(c, ca, BCH_DEV_READ_REF_write_super) { ret = darray_push(&online_devices, ca); if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) { - percpu_ref_put(&ca->io_ref[READ]); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); goto out; } - percpu_ref_get(&ca->io_ref[READ]); + enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); } /* Make sure we're using the new magic numbers: */ @@ -1102,7 +1112,7 @@ int bch2_write_super(struct bch_fs *c) prt_str(&buf, ")"); bch2_fs_fatal_error(c, ": %s", buf.buf); printbuf_exit(&buf); - ret = -BCH_ERR_sb_not_downgraded; + ret = bch_err_throw(c, sb_not_downgraded); goto out; } @@ -1132,7 +1142,7 @@ int bch2_write_super(struct bch_fs *c) if (c->opts.errors != BCH_ON_ERROR_continue && c->opts.errors != BCH_ON_ERROR_fix_safe) { - ret = -BCH_ERR_erofs_sb_err; + ret = bch_err_throw(c, erofs_sb_err); bch2_fs_fatal_error(c, "%s", buf.buf); } else { bch_err(c, "%s", buf.buf); @@ -1151,7 +1161,7 @@ int bch2_write_super(struct bch_fs *c) ca->disk_sb.seq); bch2_fs_fatal_error(c, "%s", buf.buf); printbuf_exit(&buf); - ret = -BCH_ERR_erofs_sb_err; + ret = bch_err_throw(c, erofs_sb_err); } } @@ -1205,12 +1215,12 @@ int bch2_write_super(struct bch_fs *c) !can_mount_with_written), c, ": Unable to write superblock to sufficient devices (from %ps)", (void *) _RET_IP_)) - ret = -BCH_ERR_erofs_sb_err; + ret = bch_err_throw(c, erofs_sb_err); out: /* Make new options visible after they're persistent: */ bch2_sb_update(c); darray_for_each(online_devices, ca) - percpu_ref_put(&(*ca)->io_ref[READ]); + enumerated_ref_put(&(*ca)->io_ref[READ], BCH_DEV_READ_REF_write_super); darray_exit(&online_devices); printbuf_exit(&err); return ret; @@ -1270,6 +1280,31 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat) } } +void bch2_sb_upgrade_incompat(struct bch_fs *c) +{ + mutex_lock(&c->sb_lock); + if (c->sb.version == c->sb.version_incompat_allowed) + goto unlock; + + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "Now allowing incompatible features up to "); + bch2_version_to_text(&buf, c->sb.version); + prt_str(&buf, ", previously allowed up to "); + bch2_version_to_text(&buf, c->sb.version_incompat_allowed); + prt_newline(&buf); + + bch_notice(c, "%s", buf.buf); + printbuf_exit(&buf); + + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); + SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, + 
max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), c->sb.version)); + bch2_write_super(c); +unlock: + mutex_unlock(&c->sb_lock); +} + static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f, enum bch_validate_flags flags, struct printbuf *err) { diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 78f708a6fbcd..a3b7a90f2533 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -107,6 +107,7 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) bool bch2_check_version_downgrade(struct bch_fs *); void bch2_sb_upgrade(struct bch_fs *, unsigned, bool); +void bch2_sb_upgrade_incompat(struct bch_fs *); void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, struct bch_sb_field *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 84a37d971ffd..397a69da5a75 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -10,6 +10,8 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "async_objs.h" +#include "backpointers.h" #include "bkey_sort.h" #include "btree_cache.h" #include "btree_gc.h" @@ -28,6 +30,7 @@ #include "disk_accounting.h" #include "disk_groups.h" #include "ec.h" +#include "enumerated_ref.h" #include "errcode.h" #include "error.h" #include "fs.h" @@ -48,6 +51,7 @@ #include "quota.h" #include "rebalance.h" #include "recovery.h" +#include "recovery_passes.h" #include "replicas.h" #include "sb-clean.h" #include "sb-counters.h" @@ -75,14 +79,32 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>"); MODULE_DESCRIPTION("bcachefs filesystem"); -const char * const bch2_fs_flag_strs[] = { +typedef DARRAY(struct bch_sb_handle) bch_sb_handles; + #define x(n) #n, +const char * const bch2_fs_flag_strs[] = { BCH_FS_FLAGS() -#undef x NULL }; -void bch2_print_str(struct bch_fs *c, const char *str) +const char * const bch2_write_refs[] = { + BCH_WRITE_REFS() + NULL +}; + +const char * const bch2_dev_read_refs[] = { + BCH_DEV_READ_REFS() + NULL +}; + +const char * const bch2_dev_write_refs[] = { + BCH_DEV_WRITE_REFS() + NULL +}; +#undef x + +static void __bch2_print_str(struct bch_fs *c, const char *prefix, + const char *str, bool nonblocking) { #ifdef __KERNEL__ struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); @@ -92,7 +114,17 @@ void bch2_print_str(struct bch_fs *c, const char *str) return; } #endif - bch2_print_string_as_lines(KERN_ERR, str); + bch2_print_string_as_lines(KERN_ERR, str, nonblocking); +} + +void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str) +{ + __bch2_print_str(c, prefix, str, false); +} + +void bch2_print_str_nonblocking(struct bch_fs *c, const char *prefix, const char *str) +{ + __bch2_print_str(c, prefix, str, true); } __printf(2, 0) @@ -183,26 +215,21 @@ static int bch2_dev_alloc(struct bch_fs *, unsigned); static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); static void bch2_dev_io_ref_stop(struct bch_dev *, int); static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); +static int bch2_fs_init_rw(struct bch_fs *); struct bch_fs *bch2_dev_to_fs(dev_t dev) { - struct bch_fs *c; - - mutex_lock(&bch_fs_list_lock); - rcu_read_lock(); + guard(mutex)(&bch_fs_list_lock); + guard(rcu)(); + struct bch_fs *c; list_for_each_entry(c, &bch_fs_list, list) for_each_member_device_rcu(c, ca, NULL) if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) { closure_get(&c->cl); - goto found; + return c; } - c = NULL; -found: - rcu_read_unlock(); - 
mutex_unlock(&bch_fs_list_lock); - - return c; + return NULL; } static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid) @@ -297,15 +324,13 @@ static void __bch2_fs_read_only(struct bch_fs *c) } } -#ifndef BCH_WRITE_REF_DEBUG -static void bch2_writes_disabled(struct percpu_ref *writes) +static void bch2_writes_disabled(struct enumerated_ref *writes) { struct bch_fs *c = container_of(writes, struct bch_fs, writes); set_bit(BCH_FS_write_disable_complete, &c->flags); wake_up(&bch2_read_only_wait); } -#endif void bch2_fs_read_only(struct bch_fs *c) { @@ -323,12 +348,7 @@ void bch2_fs_read_only(struct bch_fs *c) * writes will return -EROFS: */ set_bit(BCH_FS_going_ro, &c->flags); -#ifndef BCH_WRITE_REF_DEBUG - percpu_ref_kill(&c->writes); -#else - for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) - bch2_write_ref_put(c, i); -#endif + enumerated_ref_stop_async(&c->writes); /* * If we're not doing an emergency shutdown, we want to wait on @@ -366,7 +386,7 @@ void bch2_fs_read_only(struct bch_fs *c) !test_bit(BCH_FS_emergency_ro, &c->flags) && test_bit(BCH_FS_started, &c->flags) && test_bit(BCH_FS_clean_shutdown, &c->flags) && - c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) { + c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) { BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty)); BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); @@ -412,6 +432,30 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) return ret; } +static bool __bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out, + bool locked) +{ + bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); + + if (!locked) + bch2_journal_halt(&c->journal); + else + bch2_journal_halt_locked(&c->journal); + bch2_fs_read_only_async(c); + wake_up(&bch2_read_only_wait); + + if (ret) + prt_printf(out, "emergency read only at seq %llu\n", + journal_cur_seq(&c->journal)); + + return ret; +} + +bool bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out) +{ + return __bch2_fs_emergency_read_only2(c, out, false); +} + bool bch2_fs_emergency_read_only_locked(struct bch_fs *c) { bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); @@ -429,9 +473,17 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); + if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) + return bch_err_throw(c, erofs_no_alloc_info); + if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) { bch_err(c, "cannot go rw, unfixed btree errors"); - return -BCH_ERR_erofs_unfixed_errors; + return bch_err_throw(c, erofs_unfixed_errors); + } + + if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { + bch_err(c, "cannot go rw, filesystem is an unresized image file"); + return bch_err_throw(c, erofs_filesystem_full); } if (test_bit(BCH_FS_rw, &c->flags)) @@ -439,16 +491,23 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch_info(c, "going read-write"); + ret = bch2_fs_init_rw(c); + if (ret) + goto err; + ret = bch2_sb_members_v2_init(c); if (ret) goto err; clear_bit(BCH_FS_clean_shutdown, &c->flags); - __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) { - bch2_dev_allocator_add(c, ca); - percpu_ref_reinit(&ca->io_ref[WRITE]); - } + scoped_guard(rcu) + for_each_online_member_rcu(c, ca) + if (ca->mi.state == BCH_MEMBER_STATE_rw) { + bch2_dev_allocator_add(c, ca); + enumerated_ref_start(&ca->io_ref[WRITE]); + } + bch2_recalc_capacity(c); /* @@ -474,14 +533,7 @@ static int 
__bch2_fs_read_write(struct bch_fs *c, bool early) set_bit(BCH_FS_rw, &c->flags); set_bit(BCH_FS_was_rw, &c->flags); -#ifndef BCH_WRITE_REF_DEBUG - percpu_ref_reinit(&c->writes); -#else - for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) { - BUG_ON(atomic_long_read(&c->writes[i])); - atomic_long_inc(&c->writes[i]); - } -#endif + enumerated_ref_start(&c->writes); ret = bch2_copygc_start(c); if (ret) { @@ -512,10 +564,13 @@ int bch2_fs_read_write(struct bch_fs *c) { if (c->opts.recovery_pass_last && c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay) - return -BCH_ERR_erofs_norecovery; + return bch_err_throw(c, erofs_norecovery); if (c->opts.nochanges) - return -BCH_ERR_erofs_nochanges; + return bch_err_throw(c, erofs_nochanges); + + if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) + return bch_err_throw(c, erofs_no_alloc_info); return __bch2_fs_read_write(c, false); } @@ -543,35 +598,37 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_find_btree_nodes_exit(&c->found_btree_nodes); bch2_free_pending_node_rewrites(c); bch2_free_fsck_errs(c); - bch2_fs_accounting_exit(c); - bch2_fs_sb_errors_exit(c); - bch2_fs_counters_exit(c); + bch2_fs_vfs_exit(c); bch2_fs_snapshots_exit(c); + bch2_fs_sb_errors_exit(c); + bch2_fs_replicas_exit(c); + bch2_fs_rebalance_exit(c); bch2_fs_quota_exit(c); + bch2_fs_nocow_locking_exit(c); + bch2_fs_journal_exit(&c->journal); bch2_fs_fs_io_direct_exit(c); bch2_fs_fs_io_buffered_exit(c); bch2_fs_fsio_exit(c); - bch2_fs_vfs_exit(c); - bch2_fs_ec_exit(c); - bch2_fs_encryption_exit(c); - bch2_fs_nocow_locking_exit(c); bch2_fs_io_write_exit(c); bch2_fs_io_read_exit(c); + bch2_fs_encryption_exit(c); + bch2_fs_ec_exit(c); + bch2_fs_counters_exit(c); + bch2_fs_compress_exit(c); + bch2_io_clock_exit(&c->io_clock[WRITE]); + bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_buckets_waiting_for_journal_exit(c); - bch2_fs_btree_interior_update_exit(c); + bch2_fs_btree_write_buffer_exit(c); bch2_fs_btree_key_cache_exit(&c->btree_key_cache); - bch2_fs_btree_cache_exit(c); bch2_fs_btree_iter_exit(c); - bch2_fs_replicas_exit(c); - bch2_fs_journal_exit(&c->journal); - bch2_io_clock_exit(&c->io_clock[WRITE]); - bch2_io_clock_exit(&c->io_clock[READ]); - bch2_fs_compress_exit(c); - bch2_fs_btree_gc_exit(c); + bch2_fs_btree_interior_update_exit(c); + bch2_fs_btree_cache_exit(c); + bch2_fs_accounting_exit(c); + bch2_fs_async_obj_exit(c); bch2_journal_keys_put_initial(c); bch2_find_btree_nodes_exit(&c->found_btree_nodes); + BUG_ON(atomic_read(&c->journal_keys.ref)); - bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); if (c->online_reserved) { u64 v = percpu_u64_get(c->online_reserved); @@ -587,9 +644,7 @@ static void __bch2_fs_free(struct bch_fs *c) mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); mempool_exit(&c->fill_iter); -#ifndef BCH_WRITE_REF_DEBUG - percpu_ref_exit(&c->writes); -#endif + enumerated_ref_exit(&c->writes); kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); @@ -601,8 +656,8 @@ static void __bch2_fs_free(struct bch_fs *c) destroy_workqueue(c->btree_read_complete_wq); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); - if (c->btree_io_complete_wq) - destroy_workqueue(c->btree_io_complete_wq); + if (c->btree_write_complete_wq) + destroy_workqueue(c->btree_write_complete_wq); if (c->btree_update_wq) destroy_workqueue(c->btree_update_wq); @@ -628,6 +683,12 @@ void __bch2_fs_stop(struct bch_fs *c) bch2_fs_read_only(c); up_write(&c->state_lock); + for (unsigned i = 0; i < 
c->sb.nr_devices; i++) { + struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); + if (ca) + bch2_dev_io_ref_stop(ca, READ); + } + for_each_member_device(c, ca) bch2_dev_unlink(ca); @@ -656,8 +717,6 @@ void __bch2_fs_stop(struct bch_fs *c) void bch2_fs_free(struct bch_fs *c) { - unsigned i; - mutex_lock(&bch_fs_list_lock); list_del(&c->list); mutex_unlock(&bch_fs_list_lock); @@ -665,7 +724,7 @@ void bch2_fs_free(struct bch_fs *c) closure_sync(&c->cl); closure_debug_destroy(&c->cl); - for (i = 0; i < c->sb.nr_devices; i++) { + for (unsigned i = 0; i < c->sb.nr_devices; i++) { struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); if (ca) { @@ -693,9 +752,10 @@ static int bch2_fs_online(struct bch_fs *c) lockdep_assert_held(&bch_fs_list_lock); - if (__bch2_uuid_to_fs(c->sb.uuid)) { + if (c->sb.multi_device && + __bch2_uuid_to_fs(c->sb.uuid)) { bch_err(c, "filesystem UUID already open"); - return -EINVAL; + return bch_err_throw(c, filesystem_uuid_already_open); } ret = bch2_fs_chardev_init(c); @@ -706,7 +766,9 @@ static int bch2_fs_online(struct bch_fs *c) bch2_fs_debug_init(c); - ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: + ret = (c->sb.multi_device + ? kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) + : kobject_add(&c->kobj, NULL, "%s", c->name)) ?: kobject_add(&c->internal, &c->kobj, "internal") ?: kobject_add(&c->opts_dir, &c->kobj, "options") ?: #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT @@ -737,7 +799,37 @@ err: return ret; } -static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) +static int bch2_fs_init_rw(struct bch_fs *c) +{ + if (test_bit(BCH_FS_rw_init_done, &c->flags)) + return 0; + + if (!(c->btree_update_wq = alloc_workqueue("bcachefs", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || + !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write_complete", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", + WQ_FREEZABLE, 0))) + return bch_err_throw(c, ENOMEM_fs_other_alloc); + + int ret = bch2_fs_btree_interior_update_init(c) ?: + bch2_fs_btree_write_buffer_init(c) ?: + bch2_fs_fs_io_buffered_init(c) ?: + bch2_fs_io_write_init(c) ?: + bch2_fs_journal_init(&c->journal); + if (ret) + return ret; + + set_bit(BCH_FS_rw_init_done, &c->flags); + return 0; +} + +static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, + bch_sb_handles *sbs) { struct bch_fs *c; struct printbuf name = PRINTBUF; @@ -750,7 +842,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto out; } - c->stdio = (void *)(unsigned long) opts.stdio; + c->stdio = (void *)(unsigned long) opts->stdio; __module_get(THIS_MODULE); @@ -774,24 +866,29 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) refcount_set(&c->ro_ref, 1); init_waitqueue_head(&c->ro_ref_wait); - spin_lock_init(&c->recovery_pass_lock); - sema_init(&c->online_fsck_mutex, 1); for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); - bch2_fs_copygc_init(c); - bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); - bch2_fs_btree_iter_init_early(c); - bch2_fs_btree_interior_update_init_early(c); - bch2_fs_journal_keys_init(c); bch2_fs_allocator_background_init(c); 
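For context: bch2_fs_init_rw() above gates the write-path workqueues and subsystem setup behind a BCH_FS_rw_init_done flag and is called from __bch2_fs_read_write(), while bch2_fs_alloc() is left running only the cheap *_init_early() hooks. A minimal, self-contained sketch of that lazy-init split (all names below are invented for illustration, not bcachefs API):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_fs {
	bool  rw_init_done;
	char *write_buf;	/* stands in for the RW-only workqueues etc. */
};

/* cheap setup that cannot fail: always done when the fs object is allocated */
static void demo_fs_init_early(struct demo_fs *c)
{
	c->rw_init_done = false;
	c->write_buf = NULL;
}

/* heavier allocations, deferred until the filesystem actually goes read-write */
static int demo_fs_init_rw(struct demo_fs *c)
{
	if (c->rw_init_done)
		return 0;

	c->write_buf = malloc(4096);
	if (!c->write_buf)
		return -ENOMEM;

	c->rw_init_done = true;
	return 0;
}

static int demo_fs_read_write(struct demo_fs *c)
{
	/* analogous to __bch2_fs_read_write() calling bch2_fs_init_rw() first */
	int ret = demo_fs_init_rw(c);
	if (ret)
		return ret;
	/* ... bring allocator, copygc, etc. online here ... */
	return 0;
}

int main(void)
{
	struct demo_fs c;

	demo_fs_init_early(&c);
	printf("going rw: %d\n", demo_fs_read_write(&c));
	printf("going rw again (init skipped): %d\n", demo_fs_read_write(&c));
	free(c.write_buf);
	return 0;
}

The design point visible in the hunks above is that a filesystem object can now be allocated and mounted read-only without paying for write-side resources.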
bch2_fs_allocator_foreground_init(c); - bch2_fs_rebalance_init(c); - bch2_fs_quota_init(c); + bch2_fs_btree_cache_init_early(&c->btree_cache); + bch2_fs_btree_gc_init_early(c); + bch2_fs_btree_interior_update_init_early(c); + bch2_fs_btree_iter_init_early(c); + bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); + bch2_fs_btree_write_buffer_init_early(c); + bch2_fs_copygc_init(c); bch2_fs_ec_init_early(c); + bch2_fs_journal_init_early(&c->journal); + bch2_fs_journal_keys_init(c); bch2_fs_move_init(c); + bch2_fs_nocow_locking_init_early(c); + bch2_fs_quota_init(c); + bch2_fs_recovery_passes_init(c); bch2_fs_sb_errors_init_early(c); + bch2_fs_snapshots_init_early(c); + bch2_fs_subvolumes_init_early(c); INIT_LIST_HEAD(&c->list); @@ -817,8 +914,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; - bch2_fs_btree_cache_init_early(&c->btree_cache); - mutex_init(&c->sectors_available_lock); ret = percpu_init_rwsem(&c->mark_lock); @@ -832,14 +927,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (ret) goto err; - pr_uuid(&name, c->sb.user_uuid.b); - ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; - if (ret) - goto err; - - strscpy(c->name, name.buf, sizeof(c->name)); - printbuf_exit(&name); - /* Compat: */ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) @@ -854,7 +941,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (ret) goto err; - bch2_opts_apply(&c->opts, opts); + bch2_opts_apply(&c->opts, *opts); + + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + c->opts.block_size > PAGE_SIZE) { + bch_err(c, "cannot mount bs > ps filesystem without CONFIG_TRANSPARENT_HUGEPAGE"); + ret = -EINVAL; + goto err; + } c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; if (c->opts.inodes_use_key_cache) @@ -870,26 +964,26 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto err; } + if (c->sb.multi_device) + pr_uuid(&name, c->sb.user_uuid.b); + else + prt_bdevname(&name, sbs->data[0].bdev); + + ret = name.allocation_failure ? 
-BCH_ERR_ENOMEM_fs_name_alloc : 0; + if (ret) + goto err; + + strscpy(c->name, name.buf, sizeof(c->name)); + printbuf_exit(&name); + iter_size = sizeof(struct sort_iter) + (btree_blocks(c) + 1) * 2 * sizeof(struct sort_iter_set); - if (!(c->btree_update_wq = alloc_workqueue("bcachefs", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || - !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || - !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || - !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", + if (!(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || - !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || - !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", - WQ_FREEZABLE, 0)) || -#ifndef BCH_WRITE_REF_DEBUG - percpu_ref_init(&c->writes, bch2_writes_disabled, - PERCPU_REF_INIT_DEAD, GFP_KERNEL) || -#endif + enumerated_ref_init(&c->writes, BCH_WRITE_REF_NR, + bch2_writes_disabled) || mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || bioset_init(&c->btree_bio, 1, max(offsetof(struct btree_read_bio, bio), @@ -901,33 +995,28 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, c->opts.btree_node_size) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) { - ret = -BCH_ERR_ENOMEM_fs_other_alloc; + ret = bch_err_throw(c, ENOMEM_fs_other_alloc); goto err; } - ret = bch2_fs_counters_init(c) ?: - bch2_fs_sb_errors_init(c) ?: - bch2_io_clock_init(&c->io_clock[READ]) ?: - bch2_io_clock_init(&c->io_clock[WRITE]) ?: - bch2_fs_journal_init(&c->journal) ?: - bch2_fs_btree_iter_init(c) ?: + ret = + bch2_fs_async_obj_init(c) ?: bch2_fs_btree_cache_init(c) ?: + bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: - bch2_fs_btree_interior_update_init(c) ?: - bch2_fs_btree_gc_init(c) ?: bch2_fs_buckets_waiting_for_journal_init(c) ?: - bch2_fs_btree_write_buffer_init(c) ?: - bch2_fs_subvolumes_init(c) ?: - bch2_fs_io_read_init(c) ?: - bch2_fs_io_write_init(c) ?: - bch2_fs_nocow_locking_init(c) ?: - bch2_fs_encryption_init(c) ?: + bch2_io_clock_init(&c->io_clock[READ]) ?: + bch2_io_clock_init(&c->io_clock[WRITE]) ?: bch2_fs_compress_init(c) ?: + bch2_fs_counters_init(c) ?: bch2_fs_ec_init(c) ?: - bch2_fs_vfs_init(c) ?: + bch2_fs_encryption_init(c) ?: bch2_fs_fsio_init(c) ?: - bch2_fs_fs_io_buffered_init(c) ?: - bch2_fs_fs_io_direct_init(c); + bch2_fs_fs_io_direct_init(c) ?: + bch2_fs_io_read_init(c) ?: + bch2_fs_rebalance_init(c) ?: + bch2_fs_sb_errors_init(c) ?: + bch2_fs_vfs_init(c); if (ret) goto err; @@ -942,10 +1031,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) ret = -EINVAL; goto err; } - bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u", - unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); #else if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) { printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n"); @@ -1013,6 +1098,11 @@ static void print_mount_opts(struct bch_fs *c) bch2_version_to_text(&p, c->sb.version_incompat_allowed); } + if (c->opts.verbose) { + prt_printf(&p, "\n features: 
"); + prt_bitflags(&p, bch2_sb_features, c->sb.features); + } + bch_info(c, "%s", p.buf); printbuf_exit(&p); } @@ -1020,19 +1110,18 @@ static void print_mount_opts(struct bch_fs *c) static bool bch2_fs_may_start(struct bch_fs *c) { struct bch_dev *ca; - unsigned i, flags = 0; + unsigned flags = 0; - if (c->opts.very_degraded) + switch (c->opts.degraded) { + case BCH_DEGRADED_very: flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; - - if (c->opts.degraded) + break; + case BCH_DEGRADED_yes: flags |= BCH_FORCE_IF_DEGRADED; - - if (!c->opts.degraded && - !c->opts.very_degraded) { + break; + default: mutex_lock(&c->sb_lock); - - for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { + for (unsigned i = 0; i < c->disk_sb.sb->nr_devices; i++) { if (!bch2_member_exists(c->disk_sb.sb, i)) continue; @@ -1046,9 +1135,10 @@ static bool bch2_fs_may_start(struct bch_fs *c) } } mutex_unlock(&c->sb_lock); + break; } - return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); + return bch2_have_enough_devs(c, c->online_devs, flags, true); } int bch2_fs_start(struct bch_fs *c) @@ -1058,8 +1148,15 @@ int bch2_fs_start(struct bch_fs *c) print_mount_opts(c); +#ifdef CONFIG_UNICODE + bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u", + unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); +#endif + if (!bch2_fs_may_start(c)) - return -BCH_ERR_insufficient_devices_to_start; + return bch_err_throw(c, insufficient_devices_to_start); down_write(&c->state_lock); mutex_lock(&c->sb_lock); @@ -1070,7 +1167,7 @@ int bch2_fs_start(struct bch_fs *c) sizeof(struct bch_sb_field_ext) / sizeof(u64))) { mutex_unlock(&c->sb_lock); up_write(&c->state_lock); - ret = -BCH_ERR_ENOSPC_sb; + ret = bch_err_throw(c, ENOSPC_sb); goto err; } @@ -1081,13 +1178,20 @@ int bch2_fs_start(struct bch_fs *c) goto err; } - for_each_online_member(c, ca) - bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now); + scoped_guard(rcu) + for_each_online_member_rcu(c, ca) + bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = + cpu_to_le64(now); + /* + * Dno't write superblock yet: recovery might have to downgrade + */ mutex_unlock(&c->sb_lock); - for_each_rw_member(c, ca) - bch2_dev_allocator_add(c, ca); + scoped_guard(rcu) + for_each_online_member_rcu(c, ca) + if (ca->mi.state == BCH_MEMBER_STATE_rw) + bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); up_write(&c->state_lock); @@ -1100,12 +1204,12 @@ int bch2_fs_start(struct bch_fs *c) if (ret) goto err; - ret = bch2_opts_check_may_set(c); + ret = bch2_opts_hooks_pre_set(c); if (ret) goto err; if (bch2_fs_init_fault("fs_start")) { - ret = -BCH_ERR_injected_fs_start; + ret = bch_err_throw(c, injected_fs_start); goto err; } @@ -1132,11 +1236,11 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); if (le16_to_cpu(sb->block_size) != block_sectors(c)) - return -BCH_ERR_mismatched_block_size; + return bch_err_throw(c, mismatched_block_size); if (le16_to_cpu(m.bucket_size) < BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) - return -BCH_ERR_bucket_size_too_small; + return bch_err_throw(c, bucket_size_too_small); return 0; } @@ -1234,11 +1338,14 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw) { - if (!percpu_ref_is_zero(&ca->io_ref[rw])) { - reinit_completion(&ca->io_ref_completion[rw]); - percpu_ref_kill(&ca->io_ref[rw]); - 
wait_for_completion(&ca->io_ref_completion[rw]); - } + if (rw == READ) + clear_bit(ca->dev_idx, ca->fs->online_devs.d); + + if (!enumerated_ref_is_zero(&ca->io_ref[rw])) + enumerated_ref_stop(&ca->io_ref[rw], + rw == READ + ? bch2_dev_read_refs + : bch2_dev_write_refs); } static void bch2_dev_release(struct kobject *kobj) @@ -1250,8 +1357,8 @@ static void bch2_dev_release(struct kobject *kobj) static void bch2_dev_free(struct bch_dev *ca) { - WARN_ON(!percpu_ref_is_zero(&ca->io_ref[WRITE])); - WARN_ON(!percpu_ref_is_zero(&ca->io_ref[READ])); + WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE])); + WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[READ])); cancel_work_sync(&ca->io_error_work); @@ -1260,6 +1367,9 @@ static void bch2_dev_free(struct bch_dev *ca) if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); + bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch); + bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty); + bch2_free_super(&ca->disk_sb); bch2_dev_allocator_background_exit(ca); bch2_dev_journal_exit(ca); @@ -1271,8 +1381,8 @@ static void bch2_dev_free(struct bch_dev *ca) bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); - percpu_ref_exit(&ca->io_ref[WRITE]); - percpu_ref_exit(&ca->io_ref[READ]); + enumerated_ref_exit(&ca->io_ref[WRITE]); + enumerated_ref_exit(&ca->io_ref[READ]); #ifndef CONFIG_BCACHEFS_DEBUG percpu_ref_exit(&ca->ref); #endif @@ -1284,7 +1394,7 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) lockdep_assert_held(&c->state_lock); - if (percpu_ref_is_zero(&ca->io_ref[READ])) + if (enumerated_ref_is_zero(&ca->io_ref[READ])) return; __bch2_dev_read_only(c, ca); @@ -1306,20 +1416,6 @@ static void bch2_dev_ref_complete(struct percpu_ref *ref) } #endif -static void bch2_dev_io_ref_read_complete(struct percpu_ref *ref) -{ - struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref[READ]); - - complete(&ca->io_ref_completion[READ]); -} - -static void bch2_dev_io_ref_write_complete(struct percpu_ref *ref) -{ - struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref[WRITE]); - - complete(&ca->io_ref_completion[WRITE]); -} - static void bch2_dev_unlink(struct bch_dev *ca) { struct kobject *b; @@ -1381,8 +1477,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, kobject_init(&ca->kobj, &bch2_dev_ktype); init_completion(&ca->ref_completion); - init_completion(&ca->io_ref_completion[READ]); - init_completion(&ca->io_ref_completion[WRITE]); INIT_WORK(&ca->io_error_work, bch2_io_error_work); @@ -1406,12 +1500,13 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, atomic_long_set(&ca->ref, 1); #endif + mutex_init(&ca->bucket_backpointer_mismatch.lock); + mutex_init(&ca->bucket_backpointer_empty.lock); + bch2_dev_allocator_background_init(ca); - if (percpu_ref_init(&ca->io_ref[READ], bch2_dev_io_ref_read_complete, - PERCPU_REF_INIT_DEAD, GFP_KERNEL) || - percpu_ref_init(&ca->io_ref[WRITE], bch2_dev_io_ref_write_complete, - PERCPU_REF_INIT_DEAD, GFP_KERNEL) || + if (enumerated_ref_init(&ca->io_ref[READ], BCH_DEV_READ_REF_NR, NULL) || + enumerated_ref_init(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_NR, NULL) || !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) || bch2_dev_buckets_alloc(c, ca) || !(ca->io_done = alloc_percpu(*ca->io_done))) @@ -1428,7 +1523,9 @@ static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, { ca->dev_idx = dev_idx; __set_bit(ca->dev_idx, ca->self.d); - scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); + 
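For context: the percpu_ref to enumerated_ref conversion running through these hunks tags every io_ref get/put with an enum (BCH_DEV_READ_REF_*, BCH_DEV_WRITE_REF_*), so enumerated_ref_to_text() can report which call sites still hold references. A compilable toy version of that idea, with invented names standing in for the real interface:

#include <stdio.h>

enum demo_dev_read_ref {
	DEMO_DEV_READ_REF_write_super,
	DEMO_DEV_READ_REF_sb_field_resize,
	DEMO_DEV_READ_REF_NR,
};

static const char * const demo_dev_read_refs[] = {
	"write_super",
	"sb_field_resize",
};

struct demo_enumerated_ref {
	long counts[DEMO_DEV_READ_REF_NR];	/* one counter per named holder */
};

static void demo_ref_get(struct demo_enumerated_ref *r, enum demo_dev_read_ref i)
{
	r->counts[i]++;
}

static void demo_ref_put(struct demo_enumerated_ref *r, enum demo_dev_read_ref i)
{
	r->counts[i]--;
}

/* like enumerated_ref_to_text(): show which call sites still hold references */
static void demo_ref_to_text(const struct demo_enumerated_ref *r)
{
	for (int i = 0; i < DEMO_DEV_READ_REF_NR; i++)
		printf("%-24s %ld\n", demo_dev_read_refs[i], r->counts[i]);
}

int main(void)
{
	struct demo_enumerated_ref r = { { 0 } };

	demo_ref_get(&r, DEMO_DEV_READ_REF_write_super);
	demo_ref_to_text(&r);	/* a leaked ref shows up under its name */
	demo_ref_put(&r, DEMO_DEV_READ_REF_write_super);
	return 0;
}

This is why the completion-based io_ref shutdown helpers are dropped above: stopping and dumping the reference now goes through the enumerated_ref helpers and the bch2_dev_read_refs/bch2_dev_write_refs name tables.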
+ if (!ca->name[0]) + scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); ca->fs = c; rcu_assign_pointer(c->devs[ca->dev_idx], ca); @@ -1454,7 +1551,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) bch2_dev_attach(c, ca, dev_idx); return 0; err: - return -BCH_ERR_ENOMEM_dev_alloc; + return bch_err_throw(c, ENOMEM_dev_alloc); } static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) @@ -1464,22 +1561,27 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) if (bch2_dev_is_online(ca)) { bch_err(ca, "already have device online in slot %u", sb->sb->dev_idx); - return -BCH_ERR_device_already_online; + return bch_err_throw(ca->fs, device_already_online); } if (get_capacity(sb->bdev->bd_disk) < ca->mi.bucket_size * ca->mi.nbuckets) { bch_err(ca, "cannot online: device too small"); - return -BCH_ERR_device_size_too_small; + return bch_err_throw(ca->fs, device_size_too_small); } - BUG_ON(!percpu_ref_is_zero(&ca->io_ref[READ])); - BUG_ON(!percpu_ref_is_zero(&ca->io_ref[WRITE])); + BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[READ])); + BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE])); ret = bch2_dev_journal_init(ca, sb->sb); if (ret) return ret; + struct printbuf name = PRINTBUF; + prt_bdevname(&name, sb->bdev); + strscpy(ca->name, name.buf, sizeof(ca->name)); + printbuf_exit(&name); + /* Commit: */ ca->disk_sb = *sb; memset(sb, 0, sizeof(*sb)); @@ -1493,7 +1595,7 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) ca->dev = ca->disk_sb.bdev->bd_dev; - percpu_ref_reinit(&ca->io_ref[READ]); + enumerated_ref_start(&ca->io_ref[READ]); return 0; } @@ -1517,16 +1619,9 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) if (ret) return ret; - bch2_dev_sysfs_online(c, ca); - - struct printbuf name = PRINTBUF; - prt_bdevname(&name, ca->disk_sb.bdev); + set_bit(ca->dev_idx, c->online_devs.d); - if (c->sb.nr_devices == 1) - strscpy(c->name, name.buf, sizeof(c->name)); - strscpy(ca->name, name.buf, sizeof(ca->name)); - - printbuf_exit(&name); + bch2_dev_sysfs_online(c, ca); bch2_rebalance_wakeup(c); return 0; @@ -1578,7 +1673,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, return true; /* do we have enough devices to read from? 
*/ - new_online_devs = bch2_online_devs(c); + new_online_devs = c->online_devs; __clear_bit(ca->dev_idx, new_online_devs.d); return bch2_have_enough_devs(c, new_online_devs, flags, false); @@ -1608,8 +1703,8 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - if (percpu_ref_is_zero(&ca->io_ref[WRITE])) - percpu_ref_reinit(&ca->io_ref[WRITE]); + if (enumerated_ref_is_zero(&ca->io_ref[WRITE])) + enumerated_ref_start(&ca->io_ref[WRITE]); bch2_dev_do_discards(ca); } @@ -1624,7 +1719,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, return 0; if (!bch2_dev_state_allowed(c, ca, new_state, flags)) - return -BCH_ERR_device_state_not_allowed; + return bch_err_throw(c, device_state_not_allowed); if (new_state != BCH_MEMBER_STATE_rw) __bch2_dev_read_only(c, ca); @@ -1663,6 +1758,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) { struct bch_member *m; unsigned dev_idx = ca->dev_idx, data; + bool fast_device_removal = !bch2_request_incompat_feature(c, + bcachefs_metadata_version_fast_device_removal); int ret; down_write(&c->state_lock); @@ -1675,17 +1772,31 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot remove without losing data"); - ret = -BCH_ERR_device_state_not_allowed; + ret = bch_err_throw(c, device_state_not_allowed); goto err; } __bch2_dev_read_only(c, ca); - ret = bch2_dev_data_drop(c, ca->dev_idx, flags); - bch_err_msg(ca, ret, "bch2_dev_data_drop()"); + ret = fast_device_removal + ? bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags) + : (bch2_dev_data_drop(c, ca->dev_idx, flags) ?: + bch2_dev_remove_stripes(c, ca->dev_idx, flags)); if (ret) goto err; + /* Check if device still has data before blowing away alloc info */ + struct bch_dev_usage usage = bch2_dev_usage_read(ca); + for (unsigned i = 0; i < BCH_DATA_NR; i++) + if (!data_type_is_empty(i) && + !data_type_is_hidden(i) && + usage.buckets[i]) { + bch_err(ca, "Remove failed: still has data (%s, %llu buckets)", + __bch2_data_types[i], usage.buckets[i]); + ret = -EBUSY; + goto err; + } + ret = bch2_dev_remove_alloc(c, ca); bch_err_msg(ca, ret, "bch2_dev_remove_alloc()"); if (ret) @@ -1749,7 +1860,11 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) */ mutex_lock(&c->sb_lock); m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); - memset(&m->uuid, 0, sizeof(m->uuid)); + + if (fast_device_removal) + m->uuid = BCH_SB_MEMBER_DELETED_UUID; + else + memset(&m->uuid, 0, sizeof(m->uuid)); bch2_write_super(c); @@ -1759,7 +1874,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) err: if (test_bit(BCH_FS_rw, &c->flags) && ca->mi.state == BCH_MEMBER_STATE_rw && - !percpu_ref_is_zero(&ca->io_ref[READ])) + !enumerated_ref_is_zero(&ca->io_ref[READ])) __bch2_dev_read_write(c, ca); up_write(&c->state_lock); return ret; @@ -1769,11 +1884,11 @@ err: int bch2_dev_add(struct bch_fs *c, const char *path) { struct bch_opts opts = bch2_opts_empty(); - struct bch_sb_handle sb; + struct bch_sb_handle sb = {}; struct bch_dev *ca = NULL; struct printbuf errbuf = PRINTBUF; struct printbuf label = PRINTBUF; - int ret; + int ret = 0; ret = bch2_read_super(path, &opts, &sb); bch_err_msg(c, ret, "reading super"); @@ -1790,6 +1905,20 @@ int bch2_dev_add(struct bch_fs *c, const char *path) } } + if (list_empty(&c->list)) { + mutex_lock(&bch_fs_list_lock); + if 
(__bch2_uuid_to_fs(c->sb.uuid)) + ret = bch_err_throw(c, filesystem_uuid_already_open); + else + list_add(&c->list, &bch_fs_list); + mutex_unlock(&bch_fs_list_lock); + + if (ret) { + bch_err(c, "filesystem UUID already open"); + goto err; + } + } + ret = bch2_dev_may_add(sb.sb, c); if (ret) goto err; @@ -1806,6 +1935,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) down_write(&c->state_lock); mutex_lock(&c->sb_lock); + SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true); ret = bch2_sb_from_fs(c, ca); bch_err_msg(c, ret, "setting up new superblock"); @@ -1821,6 +1951,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto err_unlock; } unsigned dev_idx = ret; + ret = 0; /* success: */ @@ -1840,27 +1971,29 @@ int bch2_dev_add(struct bch_fs *c, const char *path) bch2_write_super(c); mutex_unlock(&c->sb_lock); - ret = bch2_dev_usage_init(ca, false); - if (ret) - goto err_late; + if (test_bit(BCH_FS_started, &c->flags)) { + ret = bch2_dev_usage_init(ca, false); + if (ret) + goto err_late; - ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); - bch_err_msg(ca, ret, "marking new superblock"); - if (ret) - goto err_late; + ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); + bch_err_msg(ca, ret, "marking new superblock"); + if (ret) + goto err_late; - ret = bch2_fs_freespace_init(c); - bch_err_msg(ca, ret, "initializing free space"); - if (ret) - goto err_late; + ret = bch2_fs_freespace_init(c); + bch_err_msg(ca, ret, "initializing free space"); + if (ret) + goto err_late; - if (ca->mi.state == BCH_MEMBER_STATE_rw) - __bch2_dev_read_write(c, ca); + if (ca->mi.state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); - ret = bch2_dev_journal_alloc(ca, false); - bch_err_msg(c, ret, "allocating journal"); - if (ret) - goto err_late; + ret = bch2_dev_journal_alloc(ca, false); + bch_err_msg(c, ret, "allocating journal"); + if (ret) + goto err_late; + } up_write(&c->state_lock); out: @@ -1962,7 +2095,7 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot offline required disk"); up_write(&c->state_lock); - return -BCH_ERR_device_state_not_allowed; + return bch_err_throw(c, device_state_not_allowed); } __bch2_dev_offline(c, ca); @@ -1971,6 +2104,18 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) return 0; } +static int __bch2_dev_resize_alloc(struct bch_dev *ca, u64 old_nbuckets, u64 new_nbuckets) +{ + struct bch_fs *c = ca->fs; + u64 v[3] = { new_nbuckets - old_nbuckets, 0, 0 }; + + return bch2_trans_commit_do(ca->fs, NULL, NULL, 0, + bch2_disk_accounting_mod2(trans, false, v, dev_data_type, + .dev = ca->dev_idx, + .data_type = BCH_DATA_free)) ?: + bch2_dev_freespace_init(c, ca, old_nbuckets, new_nbuckets); +} + int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { struct bch_member *m; @@ -1989,7 +2134,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) { bch_err(ca, "New device size too big (%llu greater than max %u)", nbuckets, BCH_MEMBER_NBUCKETS_MAX); - ret = -BCH_ERR_device_size_too_big; + ret = bch_err_throw(c, device_size_too_big); goto err; } @@ -1997,7 +2142,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) get_capacity(ca->disk_sb.bdev->bd_disk) < ca->mi.bucket_size * nbuckets) { bch_err(ca, "New size larger than device"); - ret = -BCH_ERR_device_size_too_small; + ret = bch_err_throw(c, 
device_size_too_small); goto err; } @@ -2018,13 +2163,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) mutex_unlock(&c->sb_lock); if (ca->mi.freespace_initialized) { - u64 v[3] = { nbuckets - old_nbuckets, 0, 0 }; - - ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0, - bch2_disk_accounting_mod2(trans, false, v, dev_data_type, - .dev = ca->dev_idx, - .data_type = BCH_DATA_free)) ?: - bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); + ret = __bch2_dev_resize_alloc(ca, old_nbuckets, nbuckets); if (ret) goto err; } @@ -2035,6 +2174,49 @@ err: return ret; } +int bch2_fs_resize_on_mount(struct bch_fs *c) +{ + for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_resize_on_mount) { + u64 old_nbuckets = ca->mi.nbuckets; + u64 new_nbuckets = div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk), + ca->mi.bucket_size); + + if (ca->mi.resize_on_mount && + new_nbuckets > ca->mi.nbuckets) { + bch_info(ca, "resizing to size %llu", new_nbuckets * ca->mi.bucket_size); + int ret = bch2_dev_buckets_resize(c, ca, new_nbuckets); + bch_err_fn(ca, ret); + if (ret) { + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_fs_resize_on_mount); + up_write(&c->state_lock); + return ret; + } + + mutex_lock(&c->sb_lock); + struct bch_member *m = + bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + m->nbuckets = cpu_to_le64(new_nbuckets); + SET_BCH_MEMBER_RESIZE_ON_MOUNT(m, false); + + c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_small_image)); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + if (ca->mi.freespace_initialized) { + ret = __bch2_dev_resize_alloc(ca, old_nbuckets, new_nbuckets); + if (ret) { + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_fs_resize_on_mount); + up_write(&c->state_lock); + return ret; + } + } + } + } + return 0; +} + /* return with ref on ca->ref: */ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) { @@ -2095,20 +2277,32 @@ static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise) if (!ca) goto unlock; - if (bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, BCH_FORCE_IF_DEGRADED)) { + bool dev = bch2_dev_state_allowed(c, ca, + BCH_MEMBER_STATE_failed, + BCH_FORCE_IF_DEGRADED); + + if (!dev && sb) { + if (!surprise) + sync_filesystem(sb); + shrink_dcache_sb(sb); + evict_inodes(sb); + } + + struct printbuf buf = PRINTBUF; + __bch2_log_msg_start(ca->name, &buf); + + prt_printf(&buf, "offline from block layer"); + + if (dev) { __bch2_dev_offline(c, ca); } else { - if (sb) { - if (!surprise) - sync_filesystem(sb); - shrink_dcache_sb(sb); - evict_inodes(sb); - } - bch2_journal_flush(&c->journal); - bch2_fs_emergency_read_only(c); + bch2_fs_emergency_read_only2(c, &buf); } + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + bch2_dev_put(ca); unlock: if (sb) @@ -2151,10 +2345,10 @@ static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time)); } -struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, - struct bch_opts opts) +struct bch_fs *bch2_fs_open(darray_const_str *devices, + struct bch_opts *opts) { - DARRAY(struct bch_sb_handle) sbs = { 0 }; + bch_sb_handles sbs = {}; struct bch_fs *c = NULL; struct bch_sb_handle *best = NULL; struct printbuf errbuf = PRINTBUF; @@ -2163,27 +2357,27 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, if (!try_module_get(THIS_MODULE)) return ERR_PTR(-ENODEV); - if (!nr_devices) { + if (!devices->nr) { ret = -EINVAL; goto err; } - ret 
= darray_make_room(&sbs, nr_devices); + ret = darray_make_room(&sbs, devices->nr); if (ret) goto err; - for (unsigned i = 0; i < nr_devices; i++) { + darray_for_each(*devices, i) { struct bch_sb_handle sb = { NULL }; - ret = bch2_read_super(devices[i], &opts, &sb); + ret = bch2_read_super(*i, opts, &sb); if (ret) goto err; BUG_ON(darray_push(&sbs, sb)); } - if (opts.nochanges && !opts.read_only) { - ret = -BCH_ERR_erofs_nochanges; + if (opts->nochanges && !opts->read_only) { + ret = bch_err_throw(c, erofs_nochanges); goto err_print; } @@ -2192,7 +2386,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, best = sb; darray_for_each_reverse(sbs, sb) { - ret = bch2_dev_in_fs(best, sb, &opts); + ret = bch2_dev_in_fs(best, sb, opts); if (ret == -BCH_ERR_device_has_been_removed || ret == -BCH_ERR_device_splitbrain) { @@ -2207,7 +2401,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, goto err_print; } - c = bch2_fs_alloc(best->sb, opts); + c = bch2_fs_alloc(best->sb, opts, &sbs); ret = PTR_ERR_OR_ZERO(c); if (ret) goto err; @@ -2236,7 +2430,7 @@ out: return c; err_print: pr_err("bch_fs_open err opening %s: %s", - devices[0], bch2_err_str(ret)); + devices->data[0], bch2_err_str(ret)); err: if (!IS_ERR_OR_NULL(c)) bch2_fs_stop(c); @@ -2273,9 +2467,45 @@ err: return -ENOMEM; } -#define BCH_DEBUG_PARAM(name, description) \ - bool bch2_##name; \ - module_param_named(name, bch2_##name, bool, 0644); \ +#define BCH_DEBUG_PARAM(name, description) DEFINE_STATIC_KEY_FALSE(bch2_##name); +BCH_DEBUG_PARAMS_ALL() +#undef BCH_DEBUG_PARAM + +static int bch2_param_set_static_key_t(const char *val, const struct kernel_param *kp) +{ + /* Match bool exactly, by re-using it. */ + struct static_key *key = kp->arg; + struct kernel_param boolkp = *kp; + bool v; + int ret; + + boolkp.arg = &v; + + ret = param_set_bool(val, &boolkp); + if (ret) + return ret; + if (v) + static_key_enable(key); + else + static_key_disable(key); + return 0; +} + +static int bch2_param_get_static_key_t(char *buffer, const struct kernel_param *kp) +{ + struct static_key *key = kp->arg; + return sprintf(buffer, "%c\n", static_key_enabled(key) ? 
'N' : 'Y'); +} + +static const struct kernel_param_ops bch2_param_ops_static_key_t = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = bch2_param_set_static_key_t, + .get = bch2_param_get_static_key_t, +}; + +#define BCH_DEBUG_PARAM(name, description) \ + module_param_cb(name, &bch2_param_ops_static_key_t, &bch2_##name.key, 0644);\ + __MODULE_PARM_TYPE(name, "static_key_t"); \ MODULE_PARM_DESC(name, description); BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 23533bce5709..dc52f06cb2b9 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -9,6 +9,9 @@ #include <linux/math64.h> extern const char * const bch2_fs_flag_strs[]; +extern const char * const bch2_write_refs[]; +extern const char * const bch2_dev_read_refs[]; +extern const char * const bch2_dev_write_refs[]; struct bch_fs *bch2_dev_to_fs(dev_t); struct bch_fs *bch2_uuid_to_fs(__uuid_t); @@ -29,18 +32,22 @@ int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); bool bch2_fs_emergency_read_only(struct bch_fs *); +bool bch2_fs_emergency_read_only2(struct bch_fs *, struct printbuf *); + bool bch2_fs_emergency_read_only_locked(struct bch_fs *); void bch2_fs_read_only(struct bch_fs *); int bch2_fs_read_write(struct bch_fs *); int bch2_fs_read_write_early(struct bch_fs *); +int bch2_fs_resize_on_mount(struct bch_fs *); + void __bch2_fs_stop(struct bch_fs *); void bch2_fs_free(struct bch_fs *); void bch2_fs_stop(struct bch_fs *); int bch2_fs_start(struct bch_fs *); -struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); +struct bch_fs *bch2_fs_open(darray_const_str *, struct bch_opts *); extern const struct blk_holder_ops bch2_sb_handle_bdev_ops; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 82ee333ddd21..05848375cea2 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -25,6 +25,8 @@ #include "disk_accounting.h" #include "disk_groups.h" #include "ec.h" +#include "enumerated_ref.h" +#include "error.h" #include "inode.h" #include "journal.h" #include "journal_reclaim.h" @@ -34,7 +36,9 @@ #include "nocow_locking.h" #include "opts.h" #include "rebalance.h" +#include "recovery_passes.h" #include "replicas.h" +#include "sb-errors.h" #include "super-io.h" #include "tests.h" @@ -141,12 +145,16 @@ do { \ write_attribute(trigger_gc); write_attribute(trigger_discards); write_attribute(trigger_invalidates); +write_attribute(trigger_journal_commit); write_attribute(trigger_journal_flush); write_attribute(trigger_journal_writes); write_attribute(trigger_btree_cache_shrink); write_attribute(trigger_btree_key_cache_shrink); -write_attribute(trigger_freelist_wakeup); write_attribute(trigger_btree_updates); +write_attribute(trigger_freelist_wakeup); +write_attribute(trigger_recalc_capacity); +write_attribute(trigger_delete_dead_snapshots); +write_attribute(trigger_emergency_read_only); read_attribute(gc_gens_pos); read_attribute(uuid); @@ -168,6 +176,7 @@ read_attribute(btree_write_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); +read_attribute(errors); read_attribute(journal_debug); read_attribute(btree_cache); read_attribute(btree_key_cache); @@ -176,25 +185,9 @@ read_attribute(open_buckets); read_attribute(open_buckets_partial); read_attribute(nocow_lock_table); -#ifdef BCH_WRITE_REF_DEBUG +read_attribute(read_refs); read_attribute(write_refs); -static const char * const bch2_write_refs[] = { -#define x(n) #n, - BCH_WRITE_REFS() -#undef x - NULL -}; - 
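For context: the debug parameters above move from plain bools to static keys, with module_param_cb() routing writes through a set hook that reuses the bool parser and then calls static_key_enable()/static_key_disable(). (The matching get hook in the hunk above prints 'N' when the key is enabled, which looks inverted relative to the usual param_get_bool convention.) A self-contained sketch of the set/get callback shape, with invented names and a plain bool standing in for the static key:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct demo_param_ops {
	int  (*set)(const char *val, void *arg);
	void (*get)(char *buf, size_t len, void *arg);
};

/* reuse a bool parser, the way the patch reuses param_set_bool() */
static int demo_parse_bool(const char *val, bool *res)
{
	if (!strcmp(val, "1") || !strcmp(val, "y") || !strcmp(val, "Y"))
		*res = true;
	else if (!strcmp(val, "0") || !strcmp(val, "n") || !strcmp(val, "N"))
		*res = false;
	else
		return -1;
	return 0;
}

/* set hook: parse as bool, then flip the key (static_key_enable/disable in the kernel) */
static int demo_set_key(const char *val, void *arg)
{
	bool v;
	int ret = demo_parse_bool(val, &v);
	if (ret)
		return ret;
	*(bool *) arg = v;
	return 0;
}

/* get hook: report the current state as Y/N */
static void demo_get_key(char *buf, size_t len, void *arg)
{
	snprintf(buf, len, "%c\n", *(bool *) arg ? 'Y' : 'N');
}

static const struct demo_param_ops demo_ops = {
	.set = demo_set_key,
	.get = demo_get_key,
};

int main(void)
{
	bool key = false;
	char buf[4];

	demo_ops.set("Y", &key);
	demo_ops.get(buf, sizeof(buf), &key);
	fputs(buf, stdout);	/* prints "Y" once the key is enabled */
	return 0;
}

The benefit of the static-key form is that the debug checks cost nothing on the hot path until the parameter is actually turned on.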
-static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c) -{ - bch2_printbuf_tabstop_push(out, 24); - - for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) - prt_printf(out, "%s\t%li\n", bch2_write_refs[i], atomic_long_read(&c->writes[i])); -} -#endif - read_attribute(internal_uuid); read_attribute(disk_groups); @@ -212,6 +205,8 @@ read_attribute(copy_gc_wait); sysfs_pd_controller_attribute(rebalance); read_attribute(rebalance_status); +read_attribute(snapshot_delete_status); +read_attribute(recovery_status); read_attribute(new_stripes); @@ -334,6 +329,12 @@ SHOW(bch2_fs) if (attr == &sysfs_rebalance_status) bch2_rebalance_status_to_text(out, c); + if (attr == &sysfs_snapshot_delete_status) + bch2_snapshot_delete_status_to_text(out, c); + + if (attr == &sysfs_recovery_status) + bch2_recovery_pass_status_to_text(out, c); + /* Debugging: */ if (attr == &sysfs_journal_debug) @@ -357,6 +358,9 @@ SHOW(bch2_fs) if (attr == &sysfs_compression_stats) bch2_compression_stats_to_text(out, c); + if (attr == &sysfs_errors) + bch2_fs_errors_to_text(out, c); + if (attr == &sysfs_new_stripes) bch2_new_stripes_to_text(out, c); @@ -369,10 +373,8 @@ SHOW(bch2_fs) if (attr == &sysfs_moving_ctxts) bch2_fs_moving_ctxts_to_text(out, c); -#ifdef BCH_WRITE_REF_DEBUG if (attr == &sysfs_write_refs) - bch2_write_refs_to_text(out, c); -#endif + enumerated_ref_to_text(out, &c->writes, bch2_write_refs); if (attr == &sysfs_nocow_lock_table) bch2_nocow_locks_to_text(out, &c->nocow_locks); @@ -405,7 +407,7 @@ STORE(bch2_fs) if (attr == &sysfs_trigger_btree_updates) queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs)) return -EROFS; if (attr == &sysfs_trigger_btree_cache_shrink) { @@ -434,6 +436,9 @@ STORE(bch2_fs) if (attr == &sysfs_trigger_invalidates) bch2_do_invalidates(c); + if (attr == &sysfs_trigger_journal_commit) + bch2_journal_flush(&c->journal); + if (attr == &sysfs_trigger_journal_flush) { bch2_journal_flush_all_pins(&c->journal); bch2_journal_meta(&c->journal); @@ -445,6 +450,25 @@ STORE(bch2_fs) if (attr == &sysfs_trigger_freelist_wakeup) closure_wake_up(&c->freelist_wait); + if (attr == &sysfs_trigger_recalc_capacity) { + down_read(&c->state_lock); + bch2_recalc_capacity(c); + up_read(&c->state_lock); + } + + if (attr == &sysfs_trigger_delete_dead_snapshots) + __bch2_delete_dead_snapshots(c); + + if (attr == &sysfs_trigger_emergency_read_only) { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "shutdown by sysfs\n"); + bch2_fs_emergency_read_only2(c, &buf); + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + } + #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -465,7 +489,7 @@ STORE(bch2_fs) size = ret; } #endif - bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs); return size; } SYSFS_OPS(bch2_fs); @@ -476,8 +500,11 @@ struct attribute *bch2_fs_files[] = { &sysfs_btree_write_stats, &sysfs_rebalance_status, + &sysfs_snapshot_delete_status, + &sysfs_recovery_status, &sysfs_compression_stats, + &sysfs_errors, #ifdef CONFIG_BCACHEFS_TESTS &sysfs_perf_test, @@ -558,9 +585,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_new_stripes, &sysfs_open_buckets, &sysfs_open_buckets_partial, -#ifdef BCH_WRITE_REF_DEBUG &sysfs_write_refs, -#endif &sysfs_nocow_lock_table, &sysfs_io_timers_read, 
&sysfs_io_timers_write, @@ -568,12 +593,16 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_gc, &sysfs_trigger_discards, &sysfs_trigger_invalidates, + &sysfs_trigger_journal_commit, &sysfs_trigger_journal_flush, &sysfs_trigger_journal_writes, &sysfs_trigger_btree_cache_shrink, &sysfs_trigger_btree_key_cache_shrink, - &sysfs_trigger_freelist_wakeup, &sysfs_trigger_btree_updates, + &sysfs_trigger_freelist_wakeup, + &sysfs_trigger_recalc_capacity, + &sysfs_trigger_delete_dead_snapshots, + &sysfs_trigger_emergency_read_only, &sysfs_gc_gens_pos, @@ -626,7 +655,7 @@ static ssize_t sysfs_opt_store(struct bch_fs *c, * We don't need to take c->writes for correctness, but it eliminates an * unsightly error message in the dmesg log when we're RO: */ - if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))) + if (unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs))) return -EROFS; char *tmp = kstrdup(buf, GFP_KERNEL); @@ -637,40 +666,34 @@ static ssize_t sysfs_opt_store(struct bch_fs *c, u64 v; ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?: - bch2_opt_check_may_set(c, ca, id, v); + bch2_opt_hook_pre_set(c, ca, id, v); kfree(tmp); if (ret < 0) goto err; - bch2_opt_set_sb(c, ca, opt, v); - bch2_opt_set_by_id(&c->opts, id, v); - - if (v && - (id == Opt_background_target || - (id == Opt_foreground_target && !c->opts.background_target) || - id == Opt_background_compression || - (id == Opt_compression && !c->opts.background_compression))) - bch2_set_rebalance_needs_scan(c, 0); - - if (v && id == Opt_rebalance_enabled) - bch2_rebalance_wakeup(c); + bool is_sb = opt->get_sb || opt->get_member; + bool changed = false; - if (v && id == Opt_copygc_enabled) - bch2_copygc_wakeup(c); + if (is_sb) { + changed = bch2_opt_set_sb(c, ca, opt, v); + } else if (!ca) { + changed = bch2_opt_get_by_id(&c->opts, id) != v; + } else { + /* device options that aren't superblock options aren't + * supported */ + BUG(); + } - if (id == Opt_discard && !ca) { - mutex_lock(&c->sb_lock); - for_each_member_device(c, ca) - opt->set_member(bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx), v); + if (!ca) + bch2_opt_set_by_id(&c->opts, id, v); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } + if (changed) + bch2_opt_hook_post_set(c, ca, 0, &c->opts, id); ret = size; err: - bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs); return ret; } @@ -821,6 +844,12 @@ SHOW(bch2_dev) if (opt_id >= 0) return sysfs_opt_show(c, ca, opt_id, out); + if (attr == &sysfs_read_refs) + enumerated_ref_to_text(out, &ca->io_ref[READ], bch2_dev_read_refs); + + if (attr == &sysfs_write_refs) + enumerated_ref_to_text(out, &ca->io_ref[WRITE], bch2_dev_write_refs); + return 0; } @@ -876,6 +905,9 @@ struct attribute *bch2_dev_files[] = { /* debug: */ &sysfs_alloc_debug, &sysfs_open_buckets, + + &sysfs_read_refs, + &sysfs_write_refs, NULL }; diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 519d00d62ae7..dc09532796af 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -199,6 +199,50 @@ DECLARE_EVENT_CLASS(bio, (unsigned long long)__entry->sector, __entry->nr_sector) ); +/* errors */ + +TRACE_EVENT(error_throw, + TP_PROTO(struct bch_fs *c, int bch_err, unsigned long ip), + TP_ARGS(c, bch_err, ip), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(int, err ) + __array(char, err_str, 32 ) + __array(char, ip, 32 ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->err = bch_err; + strscpy(__entry->err_str, bch2_err_str(bch_err), 
sizeof(__entry->err_str)); + snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); + ), + + TP_printk("%d,%d %s ret %s", MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ip, __entry->err_str) +); + +TRACE_EVENT(error_downcast, + TP_PROTO(int bch_err, int std_err, unsigned long ip), + TP_ARGS(bch_err, std_err, ip), + + TP_STRUCT__entry( + __array(char, bch_err, 32 ) + __array(char, std_err, 32 ) + __array(char, ip, 32 ) + ), + + TP_fast_assign( + strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err)); + strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err)); + snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); + ), + + TP_printk("%s ret %s -> %s %s", __entry->ip, + __entry->bch_err, __entry->std_err, __entry->ip) +); + /* disk_accounting.c */ TRACE_EVENT(accounting_mem_insert, @@ -339,6 +383,11 @@ DEFINE_EVENT(bio, io_read_reuse_race, TP_ARGS(bio) ); +DEFINE_EVENT(bio, io_read_fail_and_poison, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + /* ec.c */ TRACE_EVENT(stripe_create, @@ -1122,51 +1171,9 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, TP_ARGS(trans, caller_ip, path) ); -TRACE_EVENT(trans_restart_upgrade, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path, - unsigned old_locks_want, - unsigned new_locks_want, - struct get_locks_fail *f), - TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(u8, btree_id ) - __field(u8, old_locks_want ) - __field(u8, new_locks_want ) - __field(u8, level ) - __field(u32, path_seq ) - __field(u32, node_seq ) - TRACE_BPOS_entries(pos) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->btree_id = path->btree_id; - __entry->old_locks_want = old_locks_want; - __entry->new_locks_want = new_locks_want; - __entry->level = f->l; - __entry->path_seq = path->l[f->l].lock_seq; - __entry->node_seq = IS_ERR_OR_NULL(f->b) ? 
0 : f->b->c.lock.seq; - TRACE_BPOS_assign(pos, path->pos) - ), - - TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u", - __entry->trans_fn, - (void *) __entry->caller_ip, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->old_locks_want, - __entry->new_locks_want, - __entry->level, - __entry->path_seq, - __entry->node_seq) +DEFINE_EVENT(fs_str, trans_restart_upgrade, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(trans_str, trans_restart_relock, @@ -1468,23 +1475,19 @@ DEFINE_EVENT(fs_str, data_update, TP_ARGS(c, str) ); -TRACE_EVENT(error_downcast, - TP_PROTO(int bch_err, int std_err, unsigned long ip), - TP_ARGS(bch_err, std_err, ip), - - TP_STRUCT__entry( - __array(char, bch_err, 32 ) - __array(char, std_err, 32 ) - __array(char, ip, 32 ) - ), +DEFINE_EVENT(fs_str, io_move_pred, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); - TP_fast_assign( - strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err)); - strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err)); - snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); - ), +DEFINE_EVENT(fs_str, io_move_created_rebalance, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); - TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip) +DEFINE_EVENT(fs_str, io_move_evacuate_bucket, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); #ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 87af551692f4..dc3817f545fa 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -252,8 +252,18 @@ void bch2_prt_u64_base2(struct printbuf *out, u64 v) bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1); } -static void __bch2_print_string_as_lines(const char *prefix, const char *lines, - bool nonblocking) +static bool string_is_spaces(const char *str) +{ + while (*str) { + if (*str != ' ') + return false; + str++; + } + return true; +} + +void bch2_print_string_as_lines(const char *prefix, const char *lines, + bool nonblocking) { bool locked = false; const char *p; @@ -272,6 +282,9 @@ static void __bch2_print_string_as_lines(const char *prefix, const char *lines, while (*lines) { p = strchrnul(lines, '\n'); + if (!*p && string_is_spaces(lines)) + break; + printk("%s%.*s\n", prefix, (int) (p - lines), lines); if (!*p) break; @@ -281,16 +294,6 @@ static void __bch2_print_string_as_lines(const char *prefix, const char *lines, console_unlock(); } -void bch2_print_string_as_lines(const char *prefix, const char *lines) -{ - return __bch2_print_string_as_lines(prefix, lines, false); -} - -void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines) -{ - return __bch2_print_string_as_lines(prefix, lines, true); -} - int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr, gfp_t gfp) { @@ -725,6 +728,16 @@ void bch2_corrupt_bio(struct bio *bio) } #endif +void bch2_bio_to_text(struct printbuf *out, struct bio *bio) +{ + prt_printf(out, "bi_remaining:\t%u\n", + atomic_read(&bio->__bi_remaining)); + prt_printf(out, "bi_end_io:\t%ps\n", + bio->bi_end_io); + prt_printf(out, "bi_status:\t%u\n", + bio->bi_status); +} + #if 0 void eytzinger1_test(void) { @@ -1003,14 +1016,14 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) return ret; } -void bch2_darray_str_exit(darray_str *d) +void 
bch2_darray_str_exit(darray_const_str *d) { darray_for_each(*d, i) kfree(*i); darray_exit(d); } -int bch2_split_devs(const char *_dev_name, darray_str *ret) +int bch2_split_devs(const char *_dev_name, darray_const_str *ret) { darray_init(ret); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 3e52c7f8ddd2..0a4b1d433621 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -14,8 +14,10 @@ #include <linux/log2.h> #include <linux/percpu.h> #include <linux/preempt.h> +#include <linux/random.h> #include <linux/ratelimit.h> #include <linux/slab.h> +#include <linux/sort.h> #include <linux/vmalloc.h> #include <linux/workqueue.h> @@ -55,15 +57,16 @@ static inline size_t buf_pages(void *p, size_t len) PAGE_SIZE); } -static inline void *bch2_kvmalloc(size_t n, gfp_t flags) +static inline void *bch2_kvmalloc_noprof(size_t n, gfp_t flags) { void *p = unlikely(n >= INT_MAX) - ? vmalloc(n) - : kvmalloc(n, flags & ~__GFP_ZERO); + ? vmalloc_noprof(n) + : kvmalloc_noprof(n, flags & ~__GFP_ZERO); if (p && (flags & __GFP_ZERO)) memset(p, 0, n); return p; } +#define bch2_kvmalloc(...) alloc_hooks(bch2_kvmalloc_noprof(__VA_ARGS__)) #define init_heap(heap, _size, gfp) \ ({ \ @@ -211,8 +214,7 @@ u64 bch2_read_flag_list(const char *, const char * const[]); void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned); void bch2_prt_u64_base2(struct printbuf *, u64); -void bch2_print_string_as_lines(const char *prefix, const char *lines); -void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines); +void bch2_print_string_as_lines(const char *, const char *, bool); typedef DARRAY(unsigned long) bch_stacktrace; int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t); @@ -419,6 +421,8 @@ static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio) #define bch2_maybe_corrupt_bio(...) 
do {} while (0) #endif +void bch2_bio_to_text(struct printbuf *, struct bio *); + static inline void memcpy_u64s_small(void *dst, const void *src, unsigned u64s) { @@ -669,8 +673,6 @@ static inline void percpu_memset(void __percpu *p, int c, size_t bytes) u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); -#define cmp_int(l, r) ((l > r) - (l < r)) - static inline int u8_cmp(u8 l, u8 r) { return cmp_int(l, r); @@ -688,8 +690,8 @@ static inline bool qstr_eq(const struct qstr l, const struct qstr r) return l.len == r.len && !memcmp(l.name, r.name, l.len); } -void bch2_darray_str_exit(darray_str *); -int bch2_split_devs(const char *, darray_str *); +void bch2_darray_str_exit(darray_const_str *); +int bch2_split_devs(const char *, darray_const_str *); #ifdef __KERNEL__ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index e6be32003f3b..627f153798c6 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -38,7 +38,7 @@ static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); return bch2_xattr_hash(info, - &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len)); + &X_SEARCH(x.v->x_type, x.v->x_name_and_value, x.v->x_name_len)); } static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) @@ -48,7 +48,7 @@ static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) return l.v->x_type != r->type || l.v->x_name_len != r->name.len || - memcmp(l.v->x_name, r->name.name, r->name.len); + memcmp(l.v->x_name_and_value, r->name.name, r->name.len); } static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) @@ -58,7 +58,7 @@ static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) return l.v->x_type != r.v->x_type || l.v->x_name_len != r.v->x_name_len || - memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len); + memcmp(l.v->x_name_and_value, r.v->x_name_and_value, r.v->x_name_len); } const struct bch_hash_desc bch2_xattr_hash_desc = { @@ -96,7 +96,7 @@ int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k, c, xattr_invalid_type, "invalid type (%u)", xattr.v->x_type); - bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), + bkey_fsck_err_on(memchr(xattr.v->x_name_and_value, '\0', xattr.v->x_name_len), c, xattr_name_invalid_chars, "xattr name has invalid characters"); fsck_err: @@ -120,13 +120,13 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, unsigned name_len = xattr.v->x_name_len; unsigned val_len = le16_to_cpu(xattr.v->x_val_len); unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) - - offsetof(struct bch_xattr, x_name); + offsetof(struct bch_xattr, x_name_and_value); val_len = min_t(int, val_len, max_name_val_bytes - name_len); name_len = min(name_len, max_name_val_bytes); prt_printf(out, "%.*s:%.*s", - name_len, xattr.v->x_name, + name_len, xattr.v->x_name_and_value, val_len, (char *) xattr_val(xattr.v)); if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS || @@ -176,6 +176,11 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, if (ret) return ret; + /* + * Besides the ctime update, extents, dirents and xattrs updates require + * that an inode update also happens - to ensure that if a key exists in + * one of those btrees with a given snapshot ID an inode is also present + */ inode_u->bi_ctime = bch2_current_time(c); ret = bch2_inode_write(trans, &inode_iter, inode_u); @@ -202,7 +207,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, xattr->v.x_type = type; xattr->v.x_name_len = namelen; xattr->v.x_val_len = 
cpu_to_le16(size); - memcpy(xattr->v.x_name, name, namelen); + memcpy(xattr->v.x_name_and_value, name, namelen); memcpy(xattr_val(&xattr->v), value, size); ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, @@ -270,7 +275,7 @@ static int bch2_xattr_emit(struct dentry *dentry, if (!prefix) return 0; - return __bch2_xattr_emit(prefix, xattr->x_name, xattr->x_name_len, buf); + return __bch2_xattr_emit(prefix, xattr->x_name_and_value, xattr->x_name_len, buf); } static int bch2_xattr_list_bcachefs(struct bch_fs *c, @@ -529,7 +534,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, if (ret < 0) goto err_class_exit; - ret = bch2_opt_check_may_set(c, NULL, opt_id, v); + ret = bch2_opt_hook_pre_set(c, NULL, opt_id, v); if (ret < 0) goto err_class_exit; diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index 132fbbd15a66..1139bf345f70 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -18,12 +18,12 @@ void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) { - return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) + + return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name_and_value) + name_len + val_len, sizeof(u64)); } #define xattr_val(_xattr) \ - ((void *) (_xattr)->x_name + (_xattr)->x_name_len) + ((void *) (_xattr)->x_name_and_value + (_xattr)->x_name_len) struct xattr_search_key { u8 type; diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h index 67426e33d04e..4121b78d9a92 100644 --- a/fs/bcachefs/xattr_format.h +++ b/fs/bcachefs/xattr_format.h @@ -16,10 +16,10 @@ struct bch_xattr { /* * x_name contains the name and value counted by * x_name_len + x_val_len. The introduction of - * __counted_by(x_name_len) caused a false positive + * __counted_by(x_name_len) previously caused a false positive * detection of an out of bounds write. */ - __u8 x_name[]; + __u8 x_name_and_value[]; } __packed __aligned(8); #endif /* _BCACHEFS_XATTR_FORMAT_H */ |
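
Note: the first hunk in this section registers the BCH_DEBUG_PARAM() knobs as writable module parameters backed by static keys, via module_param_cb() with a custom kernel_param_ops whose KERNEL_PARAM_OPS_FL_NOARG flag lets the parameter be given with no "=value". The stand-alone sketch below shows only that registration pattern for a plain bool; everything named example_* is hypothetical, and the static-key plumbing (bch2_param_set_static_key_t / bch2_param_get_static_key_t) is not reproduced.

/*
 * Hypothetical module showing the module_param_cb() + custom
 * kernel_param_ops pattern used for the BCH_DEBUG_PARAM knobs.
 * This is a plain bool, not the static_key-backed bcachefs version.
 */
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kernel.h>

static bool example_flag;

static int example_flag_set(const char *val, const struct kernel_param *kp)
{
	/* with KERNEL_PARAM_OPS_FL_NOARG, val may be NULL: treat as "true" */
	if (!val)
		val = "1";
	return kstrtobool(val, kp->arg);
}

static int example_flag_get(char *buffer, const struct kernel_param *kp)
{
	return sprintf(buffer, "%c\n", *(bool *) kp->arg ? 'Y' : 'N');
}

static const struct kernel_param_ops example_flag_ops = {
	.flags	= KERNEL_PARAM_OPS_FL_NOARG,
	.set	= example_flag_set,
	.get	= example_flag_get,
};

module_param_cb(example_flag, &example_flag_ops, &example_flag, 0644);
MODULE_PARM_DESC(example_flag, "example boolean knob, writable at runtime");

MODULE_LICENSE("GPL");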
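
Note: the sysfs.c hunks replace the BCH_WRITE_REF_DEBUG-only write-reference accounting (an array of atomic_long_t counters indexed by the BCH_WRITE_REFS() names, dumped by the removed bch2_write_refs_to_text()) with the new enumerated_ref helpers, which the per-device read_refs/write_refs files also use for ca->io_ref[READ]/[WRITE]. The sketch below is a minimal userspace model of the idea — one counter per named reason, so a debug file can report who still holds a "writes" reference — and is not the actual fs/bcachefs/enumerated_ref.[ch] implementation; the interaction with going read-only in particular is simplified.

/*
 * Illustrative model of per-reason reference counting: one counter per
 * enumerated reason, so a "write_refs" file can show which subsystems
 * still hold references.  Sketch only, not the kernel implementation.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum write_ref {			/* stands in for the BCH_WRITE_REFS() x-macro */
	WRITE_REF_sysfs,
	WRITE_REF_journal,
	WRITE_REF_nr,
};

static const char * const write_ref_names[] = {
	"sysfs",
	"journal",
};

struct enumerated_refs {
	bool		stopping;	/* set when the fs goes read-only */
	atomic_long	refs[WRITE_REF_nr];
};

static bool enumerated_ref_tryget(struct enumerated_refs *r, enum write_ref i)
{
	/* real code must order this check against the stop path; sketch only */
	if (r->stopping)
		return false;
	atomic_fetch_add(&r->refs[i], 1);
	return true;
}

static void enumerated_ref_put(struct enumerated_refs *r, enum write_ref i)
{
	atomic_fetch_sub(&r->refs[i], 1);
}

static void enumerated_ref_to_text(struct enumerated_refs *r,
				   const char * const names[])
{
	for (int i = 0; i < WRITE_REF_nr; i++)
		printf("%-12s %ld\n", names[i], atomic_load(&r->refs[i]));
}

int main(void)
{
	struct enumerated_refs writes = {0};

	if (enumerated_ref_tryget(&writes, WRITE_REF_sysfs)) {
		/* ... work that requires the fs to stay writable ... */
		enumerated_ref_to_text(&writes, write_ref_names);
		enumerated_ref_put(&writes, WRITE_REF_sysfs);
	}
	return 0;
}

In the patched sysfs.c the same shape appears as enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs) / enumerated_ref_put(...) around the store path and enumerated_ref_to_text(out, &c->writes, bch2_write_refs) in the show path, so the per-reason accounting is always available rather than only under BCH_WRITE_REF_DEBUG.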
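
Note: in util.c, the __bch2_print_string_as_lines() wrapper pair collapses into a single bch2_print_string_as_lines(prefix, lines, nonblocking), and the loop now uses string_is_spaces() to drop a final chunk containing nothing but spaces. Below is a userspace approximation of that splitting loop, assuming glibc's strchrnul() and using printf() in place of printk() and the console locking.

/*
 * Sketch of the line-splitting loop: print a multi-line buffer one line at
 * a time with a prefix, skipping a trailing all-spaces chunk (as
 * string_is_spaces() does).  Userspace approximation only.
 */
#define _GNU_SOURCE
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool string_is_spaces(const char *str)
{
	while (*str) {
		if (*str != ' ')
			return false;
		str++;
	}
	return true;
}

static void print_string_as_lines(const char *prefix, const char *lines)
{
	const char *p;

	while (*lines) {
		p = strchrnul(lines, '\n');	/* end of line, or end of string */
		if (!*p && string_is_spaces(lines))
			break;			/* drop a trailing all-spaces chunk */

		printf("%s%.*s\n", prefix, (int) (p - lines), lines);
		if (!*p)
			break;
		lines = p + 1;
	}
}

int main(void)
{
	print_string_as_lines("bcachefs: ", "error one\nerror two\n   ");
	return 0;
}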
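
Note: the xattr hunks rename the bch_xattr flexible array from x_name to x_name_and_value, making explicit that the name (x_name_len bytes, not NUL-terminated) is stored immediately followed by the value (x_val_len bytes): xattr_val() points at x_name_and_value + x_name_len, and xattr_val_u64s() rounds offsetof(struct bch_xattr, x_name_and_value) + name_len + val_len up to whole u64s. The standalone model below mirrors only that layout arithmetic; the struct here is simplified and is not the on-disk bch_xattr definition.

/*
 * Model of the bch_xattr name+value packing: one flexible array holds the
 * name (x_name_len bytes) immediately followed by the value (x_val_len
 * bytes), and sizes are rounded up to whole u64s.  Layout simplified.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct xattr_model {
	uint8_t		x_type;
	uint8_t		x_name_len;
	uint16_t	x_val_len;
	uint8_t		x_name_and_value[];	/* name then value, no NUL */
};

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* u64s needed for header + name + value, as xattr_val_u64s() computes */
static size_t xattr_val_u64s(unsigned name_len, unsigned val_len)
{
	return DIV_ROUND_UP(offsetof(struct xattr_model, x_name_and_value) +
			    name_len + val_len, sizeof(uint64_t));
}

/* the value lives right after the name, as the xattr_val() macro computes */
static void *xattr_val(struct xattr_model *x)
{
	return x->x_name_and_value + x->x_name_len;
}

int main(void)
{
	const char *name = "user.checksum", *val = "crc32c";
	unsigned name_len = strlen(name), val_len = strlen(val);

	size_t bytes = xattr_val_u64s(name_len, val_len) * sizeof(uint64_t);
	struct xattr_model *x = calloc(1, bytes);
	if (!x)
		return 1;

	x->x_name_len = name_len;
	x->x_val_len  = val_len;
	memcpy(x->x_name_and_value, name, name_len);
	memcpy(xattr_val(x), val, val_len);

	printf("%zu u64s, name %.*s val %.*s\n",
	       xattr_val_u64s(name_len, val_len),
	       (int) x->x_name_len, x->x_name_and_value,
	       (int) x->x_val_len, (char *) xattr_val(x));
	free(x);
	return 0;
}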