author		Linus Torvalds <torvalds@linux-foundation.org>	2025-05-15 14:20:48 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2025-05-15 14:20:48 -0700
commit		fee3e843b309444f48157e2188efa6818bae85cf
tree		2603b082adbe58a8b63caaddb498f1b12d484807
parent		4d0be1aa26b7dba4960c37d9f8d695eb513bb04d
parent		9c09e59cc55cdf7feb29971fd792fc1947010b79

Merge tag 'bcachefs-2025-05-15' of git://evilpiepirate.org/bcachefs
Pull bcachefs fixes from Kent Overstreet:
"The main user reported ones are:
- Fix a btree iterator locking inconsistency that's been causing us
to go emergency read-only in evacuate: "Fix broken btree_path lock
invariants in next_node()"
- Minor btree node cache reclaim tweak that should help with OOMs:
don't set btree nodes as accessed on fill
- Fix a bch2_bkey_clear_rebalance() issue that was causing rebalance
to do needless work"
* tag 'bcachefs-2025-05-15' of git://evilpiepirate.org/bcachefs:
bcachefs: fix wrong arg to fsck_err()
bcachefs: Fix missing commit in backpointer to missing target
bcachefs: Fix accidental O(n^2) in fiemap
bcachefs: Fix set_should_be_locked() call in peek_slot()
bcachefs: Fix self deadlock
bcachefs: Don't set btree nodes as accessed on fill
bcachefs: Fix livelock in journal_entry_open()
bcachefs: Fix broken btree_path lock invariants in next_node()
bcachefs: Don't strip rebalance_opts from indirect extents
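A note on the btree node cache tweak ("Don't set btree nodes as accessed on fill"): reclaim of cached btree nodes is a second-chance scheme driven by a per-node accessed bit, so marking a node accessed at fill time lets a node that was read in but never looked at again survive a reclaim pass. The fix moves the set_btree_node_accessed() call from the fill path to the lookup path (see the btree_cache.c hunks below). A minimal standalone sketch of the general second-chance pattern follows; illustrative types and names only, not the bcachefs API:

#include <stdbool.h>
#include <stdio.h>

struct cache_node {
	bool accessed;	/* set on lookup; cleared by the reclaim pass */
	bool resident;
};

/* The fix: call this on lookup, not when the node is first filled. */
static void node_touch(struct cache_node *n)
{
	n->accessed = true;
}

/* One second-chance pass: clear accessed bits, evict unreferenced nodes. */
static int reclaim_pass(struct cache_node *nodes, int nr)
{
	int freed = 0;

	for (int i = 0; i < nr; i++) {
		if (!nodes[i].resident)
			continue;
		if (nodes[i].accessed) {
			nodes[i].accessed = false;	/* second chance */
			continue;
		}
		nodes[i].resident = false;		/* evict */
		freed++;
	}
	return freed;
}

int main(void)
{
	struct cache_node nodes[2] = {
		{ .resident = true },	/* filled, never used again: evictable */
		{ .resident = true },	/* touched below: survives one pass */
	};

	node_touch(&nodes[1]);
	printf("freed %d node(s)\n", reclaim_pass(nodes, 2));	/* freed 1 */
	return 0;
}

If the fill path also set the bit, the never-used node would survive the first pass too, which is exactly the memory pressure the fix addresses.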
-rw-r--r--	fs/bcachefs/backpointers.c	| 117
-rw-r--r--	fs/bcachefs/btree_cache.c	|   9
-rw-r--r--	fs/bcachefs/btree_iter.c	|  22
-rw-r--r--	fs/bcachefs/disk_accounting.c	|  17
-rw-r--r--	fs/bcachefs/disk_accounting.h	|  16
-rw-r--r--	fs/bcachefs/fs.c		|   4
-rw-r--r--	fs/bcachefs/fsck.c		|   2
-rw-r--r--	fs/bcachefs/journal_reclaim.c	|  17
-rw-r--r--	fs/bcachefs/rebalance.c		|   2

9 files changed, 140 insertions(+), 66 deletions(-)
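One pattern is worth spelling out before the diff. The journal_reclaim.c change splits should_discard_bucket() into a __should_discard_bucket() variant that assumes j->lock is held, plus a locking wrapper, so that bch2_journal_space_available() (which, as the diff implies, already runs under j->lock) can reuse the predicate without re-taking the lock. A minimal sketch of that lock-split convention, with a pthread mutex standing in for the kernel spinlock and hypothetical names throughout:

#include <pthread.h>
#include <stdbool.h>

struct journal_stub {
	pthread_mutex_t lock;
	int free_buckets;
};

/* Double-underscore variant: caller must already hold j->lock. */
static bool __should_discard(struct journal_stub *j, int min_free)
{
	return j->free_buckets < min_free;
}

/* Locking wrapper for callers that don't hold the lock. */
static bool should_discard(struct journal_stub *j, int min_free)
{
	pthread_mutex_lock(&j->lock);
	bool ret = __should_discard(j, min_free);
	pthread_mutex_unlock(&j->lock);
	return ret;
}

int main(void)
{
	struct journal_stub j = { PTHREAD_MUTEX_INITIALIZER, 2 };

	bool a = should_discard(&j, 4);		/* unlocked caller */

	pthread_mutex_lock(&j.lock);		/* caller already under the lock */
	bool b = __should_discard(&j, 4);
	pthread_mutex_unlock(&j.lock);

	return (a && b) ? 0 : 1;
}

Keeping the locking decision at the call site is what lets the new can_discard computation run in a context that already holds the lock without deadlocking on it.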
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index ff26bb515150..5f195d2280a4 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -192,7 +192,8 @@ static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans,
 static int backpointer_target_not_found(struct btree_trans *trans,
 					struct bkey_s_c_backpointer bp,
 					struct bkey_s_c target_k,
-					struct bkey_buf *last_flushed)
+					struct bkey_buf *last_flushed,
+					bool commit)
 {
 	struct bch_fs *c = trans->c;
 	struct printbuf buf = PRINTBUF;
@@ -228,18 +229,77 @@ static int backpointer_target_not_found(struct btree_trans *trans,
 	}
 
 	if (fsck_err(trans, backpointer_to_missing_ptr,
-		     "%s", buf.buf))
+		     "%s", buf.buf)) {
 		ret = bch2_backpointer_del(trans, bp.k->p);
+		if (ret || !commit)
+			goto out;
+
+		/*
+		 * Normally, on transaction commit from inside a transaction,
+		 * we'll return -BCH_ERR_transaction_restart_nested, since a
+		 * transaction commit invalidates pointers given out by peek().
+		 *
+		 * However, since we're updating a write buffer btree, if we
+		 * return a transaction restart and loop we won't see that the
+		 * backpointer has been deleted without an additional write
+		 * buffer flush - and those are expensive.
+		 *
+		 * So we're relying on the caller immediately advancing to the
+		 * next backpointer and starting a new transaction immediately
+		 * after backpointer_get_key() returns NULL:
+		 */
+		ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+	}
+out:
 fsck_err:
 	printbuf_exit(&buf);
 	return ret;
 }
 
-struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
-					 struct bkey_s_c_backpointer bp,
-					 struct btree_iter *iter,
-					 unsigned iter_flags,
-					 struct bkey_buf *last_flushed)
+static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans,
+						 struct bkey_s_c_backpointer bp,
+						 struct btree_iter *iter,
+						 struct bkey_buf *last_flushed,
+						 bool commit)
+{
+	struct bch_fs *c = trans->c;
+
+	BUG_ON(!bp.v->level);
+
+	bch2_trans_node_iter_init(trans, iter,
+				  bp.v->btree_id,
+				  bp.v->pos,
+				  0,
+				  bp.v->level - 1,
+				  0);
+	struct btree *b = bch2_btree_iter_peek_node(trans, iter);
+	if (IS_ERR_OR_NULL(b))
+		goto err;
+
+	BUG_ON(b->c.level != bp.v->level - 1);
+
+	if (extent_matches_bp(c, bp.v->btree_id, bp.v->level,
+			      bkey_i_to_s_c(&b->key), bp))
+		return b;
+
+	if (btree_node_will_make_reachable(b)) {
+		b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
+	} else {
+		int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key),
+						       last_flushed, commit);
+		b = ret ? ERR_PTR(ret) : NULL;
+	}
+err:
+	bch2_trans_iter_exit(trans, iter);
+	return b;
+}
+
+static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans,
+						  struct bkey_s_c_backpointer bp,
+						  struct btree_iter *iter,
+						  unsigned iter_flags,
+						  struct bkey_buf *last_flushed,
+						  bool commit)
 {
 	struct bch_fs *c = trans->c;
 
@@ -277,10 +337,10 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
 	bch2_trans_iter_exit(trans, iter);
 
 	if (!bp.v->level) {
-		int ret = backpointer_target_not_found(trans, bp, k, last_flushed);
+		int ret = backpointer_target_not_found(trans, bp, k, last_flushed, commit);
 		return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
 	} else {
-		struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed);
+		struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit);
 		if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node))
 			return bkey_s_c_null;
 		if (IS_ERR_OR_NULL(b))
@@ -295,35 +355,16 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
 					struct btree_iter *iter,
 					struct bkey_buf *last_flushed)
 {
-	struct bch_fs *c = trans->c;
-
-	BUG_ON(!bp.v->level);
-
-	bch2_trans_node_iter_init(trans, iter,
-				  bp.v->btree_id,
-				  bp.v->pos,
-				  0,
-				  bp.v->level - 1,
-				  0);
-	struct btree *b = bch2_btree_iter_peek_node(trans, iter);
-	if (IS_ERR_OR_NULL(b))
-		goto err;
-
-	BUG_ON(b->c.level != bp.v->level - 1);
-
-	if (extent_matches_bp(c, bp.v->btree_id, bp.v->level,
-			      bkey_i_to_s_c(&b->key), bp))
-		return b;
+	return __bch2_backpointer_get_node(trans, bp, iter, last_flushed, true);
+}
 
-	if (btree_node_will_make_reachable(b)) {
-		b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
-	} else {
-		int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed);
-		b = ret ? ERR_PTR(ret) : NULL;
-	}
-err:
-	bch2_trans_iter_exit(trans, iter);
-	return b;
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
+					 struct bkey_s_c_backpointer bp,
+					 struct btree_iter *iter,
+					 unsigned iter_flags,
+					 struct bkey_buf *last_flushed)
+{
+	return __bch2_backpointer_get_key(trans, bp, iter, iter_flags, last_flushed, true);
 }
 
 static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k,
@@ -521,7 +562,7 @@ check_existing_bp:
 	struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k);
 
 	struct bkey_s_c other_extent =
-		bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL);
+		__bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL, false);
 	ret = bkey_err(other_extent);
 	if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
 		ret = 0;
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 9b80201c7982..899891295797 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -852,7 +852,6 @@ out:
 	b->sib_u64s[1]		= 0;
 	b->whiteout_u64s	= 0;
 	bch2_btree_keys_init(b);
-	set_btree_node_accessed(b);
 
 	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
 			       start_time);
@@ -1286,6 +1285,10 @@ lock_node:
 			six_unlock_read(&b->c.lock);
 			goto retry;
 		}
+
+		/* avoid atomic set bit if it's not needed: */
+		if (!btree_node_accessed(b))
+			set_btree_node_accessed(b);
 	}
 
 	/* XXX: waiting on IO with btree locks held: */
@@ -1301,10 +1304,6 @@ lock_node:
 		prefetch(p + L1_CACHE_BYTES * 2);
 	}
 
-	/* avoid atomic set bit if it's not needed: */
-	if (!btree_node_accessed(b))
-		set_btree_node_accessed(b);
-
 	if (unlikely(btree_node_read_error(b))) {
 		six_unlock_read(&b->c.lock);
 		b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 59fa527ac685..a873ec1baf58 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -1971,6 +1971,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_iter *iter)
 		return NULL;
 	}
 
+	/*
+	 * We don't correctly handle nodes with extra intent locks here:
+	 * downgrade so we don't violate locking invariants
+	 */
+	bch2_btree_path_downgrade(trans, path);
+
 	if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
 		__bch2_btree_path_unlock(trans, path);
 		path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
@@ -2743,7 +2749,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter)
 	ret = trans_maybe_inject_restart(trans, _RET_IP_);
 	if (unlikely(ret)) {
 		k = bkey_s_c_err(ret);
-		goto out_no_locked;
+		goto out;
 	}
 
 	/* extents can't span inode numbers: */
@@ -2763,13 +2769,15 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter)
 	ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
 	if (unlikely(ret)) {
 		k = bkey_s_c_err(ret);
-		goto out_no_locked;
+		goto out;
 	}
 
 	struct btree_path *path = btree_iter_path(trans, iter);
 	if (unlikely(!btree_path_node(path, path->level)))
 		return bkey_s_c_null;
 
+	btree_path_set_should_be_locked(trans, path);
+
 	if ((iter->flags & BTREE_ITER_cached) ||
 	    !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
 		k = bkey_s_c_null;
@@ -2790,12 +2798,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter)
 			if (!bkey_err(k))
 				iter->k = *k.k;
 			/* We're not returning a key from iter->path: */
-			goto out_no_locked;
+			goto out;
 		}
 
-		k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k);
+		k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
 		if (unlikely(!k.k))
-			goto out_no_locked;
+			goto out;
 
 		if (unlikely(k.k->type == KEY_TYPE_whiteout &&
 			     (iter->flags & BTREE_ITER_filter_snapshots) &&
@@ -2833,7 +2841,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter)
 		}
 
 		if (unlikely(bkey_err(k)))
-			goto out_no_locked;
+			goto out;
 
 		next = k.k ? bkey_start_pos(k.k) : POS_MAX;
 
@@ -2855,8 +2863,6 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter)
 		}
 	}
 out:
-	btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
-out_no_locked:
 	bch2_btree_iter_verify_entry_exit(iter);
 	bch2_btree_iter_verify(trans, iter);
 	ret = bch2_btree_iter_verify_ret(trans, iter, k);
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index b007319b72e9..1f0422bfae35 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -376,6 +376,19 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
 	return ret;
 }
 
+int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a,
+				      enum bch_accounting_mode mode)
+{
+	struct bch_replicas_padded r;
+
+	if (mode != BCH_ACCOUNTING_read &&
+	    accounting_to_replicas(&r.e, a.k->p) &&
+	    !bch2_replicas_marked_locked(c, &r.e))
+		return -BCH_ERR_btree_insert_need_mark_replicas;
+
+	return __bch2_accounting_mem_insert(c, a);
+}
+
 static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e)
 {
 	for (unsigned i = 0; i < e->nr_counters; i++)
@@ -583,7 +596,7 @@ int bch2_gc_accounting_done(struct bch_fs *c)
 				accounting_key_init(&k_i.k, &acc_k, src_v, nr);
 
 				bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k),
-							       BCH_ACCOUNTING_normal);
+							       BCH_ACCOUNTING_normal, true);
 
 				preempt_disable();
 				struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
@@ -612,7 +625,7 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
 	percpu_down_read(&c->mark_lock);
 	int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k),
-						 BCH_ACCOUNTING_read);
+						 BCH_ACCOUNTING_read, false);
 	percpu_up_read(&c->mark_lock);
 	return ret;
 }
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
index abb1f6206fe9..d557b99b3c0a 100644
--- a/fs/bcachefs/disk_accounting.h
+++ b/fs/bcachefs/disk_accounting.h
@@ -136,6 +136,7 @@ enum bch_accounting_mode {
 };
 
 int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
+int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
 void bch2_accounting_mem_gc(struct bch_fs *);
 
 static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc)
@@ -150,7 +151,8 @@ static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc)
  */
 static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
 						 struct bkey_s_c_accounting a,
-						 enum bch_accounting_mode mode)
+						 enum bch_accounting_mode mode,
+						 bool write_locked)
 {
 	struct bch_fs *c = trans->c;
 	struct bch_accounting_mem *acc = &c->accounting;
@@ -189,7 +191,11 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
 
 	while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
 				      accounting_pos_cmp, &a.k->p)) >= acc->k.nr) {
-		int ret = bch2_accounting_mem_insert(c, a, mode);
+		int ret = 0;
+		if (unlikely(write_locked))
+			ret = bch2_accounting_mem_insert_locked(c, a, mode);
+		else
+			ret = bch2_accounting_mem_insert(c, a, mode);
 		if (ret)
 			return ret;
 	}
@@ -206,7 +212,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
 static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)
 {
 	percpu_down_read(&trans->c->mark_lock);
-	int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal);
+	int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal, false);
 	percpu_up_read(&trans->c->mark_lock);
 	return ret;
 }
@@ -259,7 +265,7 @@ static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans,
 	EBUG_ON(bversion_zero(a->k.bversion));
 
 	return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))
-		? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal)
+		? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal, false)
 		: 0;
 }
 
@@ -271,7 +277,7 @@ static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans,
 		struct bkey_s_accounting a = accounting_i_to_s(a_i);
 
 		bch2_accounting_neg(a);
-		bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal);
+		bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal, false);
 		bch2_accounting_neg(a);
 	}
 }
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index b6801861c66f..4b742e62255b 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1429,7 +1429,9 @@ static int bch2_next_fiemap_extent(struct btree_trans *trans,
 	if (ret)
 		goto err;
 
-	ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, end, cur);
+	u64 pagecache_end = k.k ? max(start, bkey_start_offset(k.k)) : end;
+
+	ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, pagecache_end, cur);
 	if (ret)
 		goto err;
 
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 7b25cedd3e40..71d428f376a5 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -2446,7 +2446,7 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
 	u32 parent = le32_to_cpu(s.v->fs_path_parent);
 
 	if (darray_u32_has(&subvol_path, parent)) {
-		if (fsck_err(c, subvol_loop, "subvolume loop"))
+		if (fsck_err(trans, subvol_loop, "subvolume loop"))
 			ret = reattach_subvol(trans, s);
 		break;
 	}
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 976464d8a695..cc00b0fc40d8 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -17,6 +17,8 @@
 #include <linux/kthread.h>
 #include <linux/sched/mm.h>
 
+static bool __should_discard_bucket(struct journal *, struct journal_device *);
+
 /* Free space calculations: */
 
 static unsigned journal_space_from(struct journal_device *ja,
@@ -203,8 +205,7 @@ void bch2_journal_space_available(struct journal *j)
 		    ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
 			ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
 
-		if (ja->discard_idx != ja->dirty_idx_ondisk)
-			can_discard = true;
+		can_discard |= __should_discard_bucket(j, ja);
 
 		max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
 		nr_online++;
@@ -264,13 +265,19 @@ out:
 
 /* Discards - last part of journal reclaim: */
 
-static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+static bool __should_discard_bucket(struct journal *j, struct journal_device *ja)
 {
-	spin_lock(&j->lock);
 	unsigned min_free = max(4, ja->nr / 8);
 
-	bool ret = bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) < min_free &&
+	return bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) <
+		min_free &&
 		ja->discard_idx != ja->dirty_idx_ondisk;
+}
+
+static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+{
+	spin_lock(&j->lock);
+	bool ret = __should_discard_bucket(j, ja);
 	spin_unlock(&j->lock);
 
 	return ret;
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 4ccdfc1f34aa..623273556aa9 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -309,7 +309,7 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
 					    struct btree_iter *iter,
 					    struct bkey_s_c k)
 {
-	if (!bch2_bkey_rebalance_opts(k))
+	if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k))
 		return 0;
 
 	struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
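Finally, the fiemap fix above: bch2_next_fiemap_extent() used to scan the pagecache over the remainder of the request range for every extent, so a file with n extents did O(n^2) total scanning. The new pagecache_end clamps each scan to the start of the next on-disk extent, so each byte of the range is walked once. A sketch of just that clamping logic (hypothetical helper name; only the ternary mirrors the patch):

#include <stdint.h>

/*
 * If there is a next on-disk extent, stop the pagecache scan where it
 * begins (but never before 'start'); otherwise scan to the end of the
 * requested range.  Mirrors: k.k ? max(start, bkey_start_offset(k.k)) : end
 */
static uint64_t pagecache_scan_end(uint64_t start, uint64_t end,
				   const uint64_t *next_extent_start)
{
	if (next_extent_start)
		return *next_extent_start > start ? *next_extent_start : start;
	return end;
}

int main(void)
{
	uint64_t next = 4096;
	return pagecache_scan_end(0, 1048576, &next) == 4096 ? 0 : 1;
}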