summaryrefslogtreecommitdiff
path: root/fs/bcachefs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/bcachefs')
-rw-r--r--fs/bcachefs/Kconfig8
-rw-r--r--fs/bcachefs/Makefile4
-rw-r--r--fs/bcachefs/alloc_background.c238
-rw-r--r--fs/bcachefs/alloc_background.h10
-rw-r--r--fs/bcachefs/alloc_foreground.c614
-rw-r--r--fs/bcachefs/alloc_foreground.h77
-rw-r--r--fs/bcachefs/alloc_types.h16
-rw-r--r--fs/bcachefs/async_objs.c132
-rw-r--r--fs/bcachefs/async_objs.h44
-rw-r--r--fs/bcachefs/async_objs_types.h25
-rw-r--r--fs/bcachefs/backpointers.c292
-rw-r--r--fs/bcachefs/backpointers.h17
-rw-r--r--fs/bcachefs/bcachefs.h289
-rw-r--r--fs/bcachefs/bcachefs_format.h30
-rw-r--r--fs/bcachefs/bkey.c47
-rw-r--r--fs/bcachefs/bkey.h4
-rw-r--r--fs/bcachefs/bkey_methods.c2
-rw-r--r--fs/bcachefs/bset.c64
-rw-r--r--fs/bcachefs/bset.h22
-rw-r--r--fs/bcachefs/btree_cache.c198
-rw-r--r--fs/bcachefs/btree_gc.c89
-rw-r--r--fs/bcachefs/btree_gc.h3
-rw-r--r--fs/bcachefs/btree_io.c375
-rw-r--r--fs/bcachefs/btree_io.h12
-rw-r--r--fs/bcachefs/btree_iter.c355
-rw-r--r--fs/bcachefs/btree_iter.h116
-rw-r--r--fs/bcachefs/btree_journal_iter.c19
-rw-r--r--fs/bcachefs/btree_key_cache.c64
-rw-r--r--fs/bcachefs/btree_locking.c250
-rw-r--r--fs/bcachefs/btree_locking.h72
-rw-r--r--fs/bcachefs/btree_node_scan.c20
-rw-r--r--fs/bcachefs/btree_trans_commit.c115
-rw-r--r--fs/bcachefs/btree_types.h33
-rw-r--r--fs/bcachefs/btree_update.c133
-rw-r--r--fs/bcachefs/btree_update.h82
-rw-r--r--fs/bcachefs/btree_update_interior.c154
-rw-r--r--fs/bcachefs/btree_update_interior.h6
-rw-r--r--fs/bcachefs/btree_write_buffer.c26
-rw-r--r--fs/bcachefs/btree_write_buffer.h1
-rw-r--r--fs/bcachefs/buckets.c220
-rw-r--r--fs/bcachefs/buckets.h12
-rw-r--r--fs/bcachefs/buckets_waiting_for_journal.c3
-rw-r--r--fs/bcachefs/chardev.c9
-rw-r--r--fs/bcachefs/checksum.c30
-rw-r--r--fs/bcachefs/checksum.h2
-rw-r--r--fs/bcachefs/clock.c47
-rw-r--r--fs/bcachefs/clock.h1
-rw-r--r--fs/bcachefs/compress.c24
-rw-r--r--fs/bcachefs/darray.h59
-rw-r--r--fs/bcachefs/data_update.c379
-rw-r--r--fs/bcachefs/data_update.h15
-rw-r--r--fs/bcachefs/debug.c115
-rw-r--r--fs/bcachefs/debug.h20
-rw-r--r--fs/bcachefs/dirent.c182
-rw-r--r--fs/bcachefs/dirent.h16
-rw-r--r--fs/bcachefs/disk_accounting.c149
-rw-r--r--fs/bcachefs/disk_accounting.h18
-rw-r--r--fs/bcachefs/disk_groups.c152
-rw-r--r--fs/bcachefs/ec.c302
-rw-r--r--fs/bcachefs/ec.h9
-rw-r--r--fs/bcachefs/ec_types.h7
-rw-r--r--fs/bcachefs/enumerated_ref.c144
-rw-r--r--fs/bcachefs/enumerated_ref.h66
-rw-r--r--fs/bcachefs/enumerated_ref_types.h19
-rw-r--r--fs/bcachefs/errcode.c4
-rw-r--r--fs/bcachefs/errcode.h22
-rw-r--r--fs/bcachefs/error.c186
-rw-r--r--fs/bcachefs/error.h27
-rw-r--r--fs/bcachefs/extent_update.c67
-rw-r--r--fs/bcachefs/extent_update.h2
-rw-r--r--fs/bcachefs/extents.c183
-rw-r--r--fs/bcachefs/extents.h3
-rw-r--r--fs/bcachefs/extents_types.h1
-rw-r--r--fs/bcachefs/fast_list.c156
-rw-r--r--fs/bcachefs/fast_list.h41
-rw-r--r--fs/bcachefs/fs-io-buffered.c30
-rw-r--r--fs/bcachefs/fs-io-direct.c7
-rw-r--r--fs/bcachefs/fs-io-pagecache.c2
-rw-r--r--fs/bcachefs/fs-io.c34
-rw-r--r--fs/bcachefs/fs-ioctl.c18
-rw-r--r--fs/bcachefs/fs.c60
-rw-r--r--fs/bcachefs/fsck.c513
-rw-r--r--fs/bcachefs/fsck.h6
-rw-r--r--fs/bcachefs/inode.c214
-rw-r--r--fs/bcachefs/inode.h44
-rw-r--r--fs/bcachefs/inode_format.h7
-rw-r--r--fs/bcachefs/io_misc.c2
-rw-r--r--fs/bcachefs/io_read.c340
-rw-r--r--fs/bcachefs/io_read.h25
-rw-r--r--fs/bcachefs/io_write.c84
-rw-r--r--fs/bcachefs/io_write.h28
-rw-r--r--fs/bcachefs/io_write_types.h32
-rw-r--r--fs/bcachefs/journal.c191
-rw-r--r--fs/bcachefs/journal.h8
-rw-r--r--fs/bcachefs/journal_io.c446
-rw-r--r--fs/bcachefs/journal_io.h1
-rw-r--r--fs/bcachefs/journal_reclaim.c75
-rw-r--r--fs/bcachefs/journal_sb.c2
-rw-r--r--fs/bcachefs/journal_seq_blacklist.c14
-rw-r--r--fs/bcachefs/journal_seq_blacklist.h1
-rw-r--r--fs/bcachefs/journal_types.h2
-rw-r--r--fs/bcachefs/lru.c6
-rw-r--r--fs/bcachefs/migrate.c121
-rw-r--r--fs/bcachefs/migrate.h3
-rw-r--r--fs/bcachefs/move.c331
-rw-r--r--fs/bcachefs/move.h17
-rw-r--r--fs/bcachefs/move_types.h8
-rw-r--r--fs/bcachefs/movinggc.c227
-rw-r--r--fs/bcachefs/movinggc.h5
-rw-r--r--fs/bcachefs/namei.c279
-rw-r--r--fs/bcachefs/namei.h7
-rw-r--r--fs/bcachefs/nocow_locking.c4
-rw-r--r--fs/bcachefs/nocow_locking.h2
-rw-r--r--fs/bcachefs/opts.c170
-rw-r--r--fs/bcachefs/opts.h38
-rw-r--r--fs/bcachefs/printbuf.h8
-rw-r--r--fs/bcachefs/quota.c6
-rw-r--r--fs/bcachefs/rebalance.c241
-rw-r--r--fs/bcachefs/rebalance.h14
-rw-r--r--fs/bcachefs/rebalance_types.h6
-rw-r--r--fs/bcachefs/recovery.c140
-rw-r--r--fs/bcachefs/recovery.h3
-rw-r--r--fs/bcachefs/recovery_passes.c649
-rw-r--r--fs/bcachefs/recovery_passes.h31
-rw-r--r--fs/bcachefs/recovery_passes_format.h106
-rw-r--r--fs/bcachefs/recovery_passes_types.h93
-rw-r--r--fs/bcachefs/reflink.c14
-rw-r--r--fs/bcachefs/replicas.c35
-rw-r--r--fs/bcachefs/sb-counters_format.h3
-rw-r--r--fs/bcachefs/sb-downgrade.c11
-rw-r--r--fs/bcachefs/sb-errors.c22
-rw-r--r--fs/bcachefs/sb-errors.h1
-rw-r--r--fs/bcachefs/sb-errors_format.h6
-rw-r--r--fs/bcachefs/sb-members.c98
-rw-r--r--fs/bcachefs/sb-members.h90
-rw-r--r--fs/bcachefs/sb-members_format.h6
-rw-r--r--fs/bcachefs/sb-members_types.h1
-rw-r--r--fs/bcachefs/six.c7
-rw-r--r--fs/bcachefs/snapshot.c621
-rw-r--r--fs/bcachefs/snapshot.h116
-rw-r--r--fs/bcachefs/snapshot_format.h4
-rw-r--r--fs/bcachefs/snapshot_types.h57
-rw-r--r--fs/bcachefs/str_hash.c368
-rw-r--r--fs/bcachefs/str_hash.h34
-rw-r--r--fs/bcachefs/subvolume.c108
-rw-r--r--fs/bcachefs/subvolume.h5
-rw-r--r--fs/bcachefs/subvolume_types.h27
-rw-r--r--fs/bcachefs/super-io.c71
-rw-r--r--fs/bcachefs/super-io.h1
-rw-r--r--fs/bcachefs/super.c738
-rw-r--r--fs/bcachefs/super.h9
-rw-r--r--fs/bcachefs/sysfs.c132
-rw-r--r--fs/bcachefs/trace.h123
-rw-r--r--fs/bcachefs/util.c41
-rw-r--r--fs/bcachefs/util.h20
-rw-r--r--fs/bcachefs/xattr.c23
-rw-r--r--fs/bcachefs/xattr.h4
-rw-r--r--fs/bcachefs/xattr_format.h4
158 files changed, 9150 insertions, 5101 deletions
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index 07709b0d7688..8cb2b9d5da96 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -103,6 +103,14 @@ config BCACHEFS_PATH_TRACEPOINTS
Enable extra tracepoints for debugging btree_path operations; we don't
normally want these enabled because they happen at very high rates.
+config BCACHEFS_TRANS_KMALLOC_TRACE
+ bool "Trace bch2_trans_kmalloc() calls"
+ depends on BCACHEFS_FS
+
+config BCACHEFS_ASYNC_OBJECT_LISTS
+ bool "Keep async objects on fast_lists for debugfs visibility"
+ depends on BCACHEFS_FS && DEBUG_FS
+
config MEAN_AND_VARIANCE_UNIT_TEST
tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
depends on KUNIT
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 9af65079374f..93c8ee5425c8 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -35,11 +35,13 @@ bcachefs-y := \
disk_accounting.o \
disk_groups.o \
ec.o \
+ enumerated_ref.o \
errcode.o \
error.o \
extents.o \
extent_update.o \
eytzinger.o \
+ fast_list.o \
fs.o \
fs-ioctl.o \
fs-io.o \
@@ -97,6 +99,8 @@ bcachefs-y := \
varint.o \
xattr.o
+bcachefs-$(CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS) += async_objs.o
+
obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
# Silence "note: xyz changed in GCC X.X" messages
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 94ea9e49aec4..b228a5a64479 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -17,10 +17,10 @@
#include "debug.h"
#include "disk_accounting.h"
#include "ec.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "lru.h"
#include "recovery.h"
-#include "trace.h"
#include "varint.h"
#include <linux/kthread.h>
@@ -308,7 +308,8 @@ int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k,
"data type inconsistency");
bkey_fsck_err_on(!a.io_time[READ] &&
- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
+ !(c->recovery.passes_to_run &
+ BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs)),
c, alloc_key_cached_but_read_time_zero,
"cached bucket with read_time == 0");
break;
@@ -335,11 +336,10 @@ void bch2_alloc_v4_swab(struct bkey_s k)
a->stripe_sectors = swab32(a->stripe_sectors);
}
-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+static inline void __bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c,
+ unsigned dev, const struct bch_alloc_v4 *a)
{
- struct bch_alloc_v4 _a;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
- struct bch_dev *ca = c ? bch2_dev_bucket_tryget_noerror(c, k.k->p) : NULL;
+ struct bch_dev *ca = c ? bch2_dev_tryget_noerror(c, dev) : NULL;
prt_newline(out);
printbuf_indent_add(out, 2);
@@ -367,6 +367,19 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
bch2_dev_put(ca);
}
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bch_alloc_v4 _a;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
+
+ __bch2_alloc_v4_to_text(out, c, k.k->p.inode, a);
+}
+
+void bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ __bch2_alloc_v4_to_text(out, c, k.k->p.inode, bkey_s_c_to_alloc_v4(k).v);
+}
+
void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
{
if (k.k->type == KEY_TYPE_alloc_v4) {
@@ -478,12 +491,27 @@ struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans,
enum btree_iter_update_trigger_flags flags)
{
struct btree_iter iter;
- struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos);
- int ret = PTR_ERR_OR_ZERO(a);
- if (ret)
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, pos,
+ BTREE_ITER_with_updates|
+ BTREE_ITER_cached|
+ BTREE_ITER_intent);
+ int ret = bkey_err(k);
+ if (unlikely(ret))
return ERR_PTR(ret);
- ret = bch2_trans_update(trans, &iter, &a->k_i, flags);
+ if ((void *) k.v >= trans->mem &&
+ (void *) k.v < trans->mem + trans->mem_top) {
+ bch2_trans_iter_exit(trans, &iter);
+ return container_of(bkey_s_c_to_alloc_v4(k).v, struct bkey_i_alloc_v4, v);
+ }
+
+ struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
+ if (IS_ERR(a)) {
+ bch2_trans_iter_exit(trans, &iter);
+ return a;
+ }
+
+ ret = bch2_trans_update_ip(trans, &iter, &a->k_i, flags, _RET_IP_);
bch2_trans_iter_exit(trans, &iter);
return unlikely(ret) ? ERR_PTR(ret) : a;
}
@@ -680,8 +708,8 @@ static int __need_discard_or_freespace_err(struct btree_trans *trans,
set ? "" : "un",
bch2_btree_id_str(btree),
buf.buf);
- if (ret == -BCH_ERR_fsck_ignore ||
- ret == -BCH_ERR_fsck_errors_not_fixed)
+ if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) ||
+ bch2_err_matches(ret, BCH_ERR_fsck_errors_not_fixed))
ret = 0;
printbuf_exit(&buf);
@@ -837,7 +865,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
if (!ca)
- return -BCH_ERR_trigger_alloc;
+ return bch_err_throw(c, trigger_alloc);
struct bch_alloc_v4 old_a_convert;
const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
@@ -913,15 +941,6 @@ int bch2_trigger_alloc(struct btree_trans *trans,
goto err;
}
- if ((flags & BTREE_TRIGGER_bucket_invalidate) &&
- old_a->cached_sectors) {
- ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx,
- -((s64) old_a->cached_sectors),
- flags & BTREE_TRIGGER_gc);
- if (ret)
- goto err;
- }
-
ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags);
if (ret)
goto err;
@@ -980,14 +999,11 @@ int bch2_trigger_alloc(struct btree_trans *trans,
}
if (new_a->gen != old_a->gen) {
- rcu_read_lock();
+ guard(rcu)();
u8 *gen = bucket_gen(ca, new.k->p.offset);
- if (unlikely(!gen)) {
- rcu_read_unlock();
+ if (unlikely(!gen))
goto invalid_bucket;
- }
*gen = new_a->gen;
- rcu_read_unlock();
}
#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; })
@@ -1013,15 +1029,12 @@ int bch2_trigger_alloc(struct btree_trans *trans,
}
if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) {
- rcu_read_lock();
+ guard(rcu)();
struct bucket *g = gc_bucket(ca, new.k->p.offset);
- if (unlikely(!g)) {
- rcu_read_unlock();
+ if (unlikely(!g))
goto invalid_bucket;
- }
g->gen_valid = 1;
g->gen = new_a->gen;
- rcu_read_unlock();
}
err:
fsck_err:
@@ -1031,7 +1044,7 @@ fsck_err:
invalid_bucket:
bch2_fs_inconsistent(c, "reference to invalid bucket\n%s",
(bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
- ret = -BCH_ERR_trigger_alloc;
+ ret = bch_err_throw(c, trigger_alloc);
goto err;
}
@@ -1097,13 +1110,12 @@ static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *buck
bucket->offset = 0;
}
- rcu_read_lock();
+ guard(rcu)();
*ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
if (*ca) {
*bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket);
bch2_dev_get(*ca);
}
- rcu_read_unlock();
return *ca != NULL;
}
@@ -1381,7 +1393,7 @@ static void check_discard_freespace_key_work(struct work_struct *work)
container_of(work, struct check_discard_freespace_key_async, work);
bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos));
- bch2_write_ref_put(w->c, BCH_WRITE_REF_check_discard_freespace_key);
+ enumerated_ref_put(&w->c->writes, BCH_WRITE_REF_check_discard_freespace_key);
kfree(w);
}
@@ -1446,7 +1458,7 @@ delete:
ret = bch2_btree_bit_mod_iter(trans, iter, false) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_commit;
+ bch_err_throw(c, transaction_restart_commit);
goto out;
} else {
/*
@@ -1458,7 +1470,7 @@ delete:
if (!w)
goto out;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_check_discard_freespace_key)) {
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_check_discard_freespace_key)) {
kfree(w);
goto out;
}
@@ -1467,6 +1479,8 @@ delete:
w->c = c;
w->pos = BBPOS(iter->btree_id, iter->pos);
queue_work(c->write_ref_wq, &w->work);
+
+ ret = 1; /* don't allocate from this bucket */
goto out;
}
}
@@ -1767,14 +1781,16 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress)
{
+ struct bch_fs *c = ca->fs;
int ret;
mutex_lock(&ca->discard_buckets_in_flight_lock);
- darray_for_each(ca->discard_buckets_in_flight, i)
- if (i->bucket == bucket) {
- ret = -BCH_ERR_EEXIST_discard_in_flight_add;
- goto out;
- }
+ struct discard_in_flight *i =
+ darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket);
+ if (i) {
+ ret = bch_err_throw(c, EEXIST_discard_in_flight_add);
+ goto out;
+ }
ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) {
.in_progress = in_progress,
@@ -1788,14 +1804,11 @@ out:
static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket)
{
mutex_lock(&ca->discard_buckets_in_flight_lock);
- darray_for_each(ca->discard_buckets_in_flight, i)
- if (i->bucket == bucket) {
- BUG_ON(!i->in_progress);
- darray_remove_item(&ca->discard_buckets_in_flight, i);
- goto found;
- }
- BUG();
-found:
+ struct discard_in_flight *i =
+ darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket);
+ BUG_ON(!i || !i->in_progress);
+
+ darray_remove_item(&ca->discard_buckets_in_flight, i);
mutex_unlock(&ca->discard_buckets_in_flight_lock);
}
@@ -1806,19 +1819,6 @@ struct discard_buckets_state {
u64 discarded;
};
-/*
- * This is needed because discard is both a filesystem option and a device
- * option, and mount options are supposed to apply to that mount and not be
- * persisted, i.e. if it's set as a mount option we can't propagate it to the
- * device.
- */
-static inline bool discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca)
-{
- return test_bit(BCH_FS_discard_mount_opt_set, &c->flags)
- ? c->opts.discard
- : ca->mi.discard;
-}
-
static int bch2_discard_one_bucket(struct btree_trans *trans,
struct bch_dev *ca,
struct btree_iter *need_discard_iter,
@@ -1882,7 +1882,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
s->discarded++;
*discard_pos_done = iter.pos;
- if (discard_opt_enabled(c, ca) && !c->opts.nochanges) {
+ if (bch2_discard_opt_enabled(c, ca) && !c->opts.nochanges) {
/*
* This works without any other locks because this is the only
* thread that removes items from the need_discard tree
@@ -1952,26 +1952,26 @@ static void bch2_do_discards_work(struct work_struct *work)
trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
bch2_err_str(ret));
- percpu_ref_put(&ca->io_ref[WRITE]);
- bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard);
}
void bch2_dev_do_discards(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard))
return;
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_dev_do_discards))
goto put_write_ref;
if (queue_work(c->write_ref_wq, &ca->discard_work))
return;
- percpu_ref_put(&ca->io_ref[WRITE]);
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards);
put_write_ref:
- bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard);
}
void bch2_do_discards(struct bch_fs *c)
@@ -2047,8 +2047,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret));
bch2_trans_put(trans);
- percpu_ref_put(&ca->io_ref[WRITE]);
- bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast);
}
static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
@@ -2058,18 +2058,18 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
if (discard_in_flight_add(ca, bucket, false))
return;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard_fast))
return;
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_discard_one_bucket_fast))
goto put_ref;
if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
return;
- percpu_ref_put(&ca->io_ref[WRITE]);
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast);
put_ref:
- bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast);
}
static int invalidate_one_bp(struct btree_trans *trans,
@@ -2180,8 +2180,11 @@ static int invalidate_one_bucket(struct btree_trans *trans,
BUG_ON(a->data_type != BCH_DATA_cached);
BUG_ON(a->dirty_sectors);
- if (!a->cached_sectors)
- bch_err(c, "invalidating empty bucket, confused");
+ if (!a->cached_sectors) {
+ bch2_check_bucket_backpointer_mismatch(trans, ca, bucket.offset,
+ true, last_flushed);
+ goto out;
+ }
unsigned cached_sectors = a->cached_sectors;
u8 gen = a->gen;
@@ -2261,27 +2264,27 @@ restart_err:
bch2_trans_iter_exit(trans, &iter);
err:
bch2_trans_put(trans);
- percpu_ref_put(&ca->io_ref[WRITE]);
bch2_bkey_buf_exit(&last_flushed, c);
- bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate);
}
void bch2_dev_do_invalidates(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_invalidate))
return;
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_do_invalidates))
goto put_ref;
if (queue_work(c->write_ref_wq, &ca->invalidate_work))
return;
- percpu_ref_put(&ca->io_ref[WRITE]);
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates);
put_ref:
- bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate);
}
void bch2_do_invalidates(struct bch_fs *c)
@@ -2392,14 +2395,16 @@ bkey_err:
int bch2_fs_freespace_init(struct bch_fs *c)
{
- int ret = 0;
- bool doing_init = false;
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image))
+ return 0;
+
/*
* We can crash during the device add path, so we need to check this on
* every mount:
*/
+ bool doing_init = false;
for_each_member_device(c, ca) {
if (ca->mi.freespace_initialized)
continue;
@@ -2409,7 +2414,7 @@ int bch2_fs_freespace_init(struct bch_fs *c)
doing_init = true;
}
- ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
+ int ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
if (ret) {
bch2_dev_put(ca);
bch_err_fn(c, ret);
@@ -2439,8 +2444,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
* We clear the LRU and need_discard btrees first so that we don't race
* with bch2_do_invalidates() and bch2_do_discards()
*/
- ret = bch2_dev_remove_stripes(c, ca->dev_idx) ?:
- bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
+ ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
BTREE_TRIGGER_norun, NULL) ?:
@@ -2503,15 +2507,15 @@ void bch2_recalc_capacity(struct bch_fs *c)
lockdep_assert_held(&c->state_lock);
- for_each_online_member(c, ca) {
- struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;
-
- ra_pages += bdi->ra_pages;
- }
+ guard(rcu)();
+ for_each_member_device_rcu(c, ca, NULL) {
+ struct block_device *bdev = READ_ONCE(ca->disk_sb.bdev);
+ if (bdev)
+ ra_pages += bdev->bd_disk->bdi->ra_pages;
- bch2_set_ra_pages(c, ra_pages);
+ if (ca->mi.state != BCH_MEMBER_STATE_rw)
+ continue;
- __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) {
u64 dev_reserve = 0;
/*
@@ -2549,6 +2553,8 @@ void bch2_recalc_capacity(struct bch_fs *c)
ca->mi.bucket_size);
}
+ bch2_set_ra_pages(c, ra_pages);
+
gc_reserve = c->opts.gc_reserve_bytes
? c->opts.gc_reserve_bytes >> 9
: div64_u64(capacity * c->opts.gc_reserve_percent, 100);
@@ -2570,7 +2576,8 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *c)
{
u64 ret = U64_MAX;
- for_each_rw_member(c, ca)
+ guard(rcu)();
+ for_each_rw_member_rcu(c, ca)
ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
return ret;
}
@@ -2578,19 +2585,31 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *c)
static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
{
struct open_bucket *ob;
- bool ret = false;
for (ob = c->open_buckets;
ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
ob++) {
- spin_lock(&ob->lock);
- if (ob->valid && !ob->on_partial_list &&
- ob->dev == ca->dev_idx)
- ret = true;
- spin_unlock(&ob->lock);
+ scoped_guard(spinlock, &ob->lock) {
+ if (ob->valid && !ob->on_partial_list &&
+ ob->dev == ca->dev_idx)
+ return true;
+ }
}
- return ret;
+ return false;
+}
+
+void bch2_dev_allocator_set_rw(struct bch_fs *c, struct bch_dev *ca, bool rw)
+{
+ /* BCH_DATA_free == all rw devs */
+
+ for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+ if (rw &&
+ (i == BCH_DATA_free ||
+ (ca->mi.data_allowed & BIT(i))))
+ set_bit(ca->dev_idx, c->rw_devs[i].d);
+ else
+ clear_bit(ca->dev_idx, c->rw_devs[i].d);
}
/* device goes ro: */
@@ -2599,9 +2618,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
lockdep_assert_held(&c->state_lock);
/* First, remove device from allocation groups: */
-
- for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
- clear_bit(ca->dev_idx, c->rw_devs[i].d);
+ bch2_dev_allocator_set_rw(c, ca, false);
c->rw_devs_change_count++;
@@ -2635,10 +2652,7 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
{
lockdep_assert_held(&c->state_lock);
- for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
- if (ca->mi.data_allowed & (1 << i))
- set_bit(ca->dev_idx, c->rw_devs[i].d);
-
+ bch2_dev_allocator_set_rw(c, ca, true);
c->rw_devs_change_count++;
}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index 34b3d6ac4fbb..0cc5adc55b6f 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -13,11 +13,9 @@
static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos)
{
- rcu_read_lock();
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode);
- bool ret = ca && bucket_valid(ca, pos.offset);
- rcu_read_unlock();
- return ret;
+ return ca && bucket_valid(ca, pos.offset);
}
static inline u64 bucket_to_u64(struct bpos bucket)
@@ -253,6 +251,7 @@ int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c,
struct bkey_validate_context);
void bch2_alloc_v4_swab(struct bkey_s);
void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+void bch2_alloc_v4_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_alloc ((struct bkey_ops) { \
.key_validate = bch2_alloc_v1_validate, \
@@ -277,7 +276,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \
.key_validate = bch2_alloc_v4_validate, \
- .val_to_text = bch2_alloc_to_text, \
+ .val_to_text = bch2_alloc_v4_to_text, \
.swab = bch2_alloc_v4_swab, \
.trigger = bch2_trigger_alloc, \
.min_val_size = 48, \
@@ -350,6 +349,7 @@ int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *);
void bch2_recalc_capacity(struct bch_fs *);
u64 bch2_min_rw_member_capacity(struct bch_fs *);
+void bch2_dev_allocator_set_rw(struct bch_fs *, struct bch_dev *, bool);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 7ec022e9361a..b375ad610acd 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -69,10 +69,9 @@ const char * const bch2_watermarks[] = {
void bch2_reset_alloc_cursors(struct bch_fs *c)
{
- rcu_read_lock();
+ guard(rcu)();
for_each_member_device_rcu(c, ca, NULL)
memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor));
- rcu_read_unlock();
}
static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
@@ -154,7 +153,7 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
{
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs)
+ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_trans_mark_dev_sbs))
return false;
return bch2_is_superblock_bucket(ca, b);
@@ -166,9 +165,8 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
ARRAY_SIZE(c->open_buckets_partial));
spin_lock(&c->freelist_lock);
- rcu_read_lock();
- bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++;
- rcu_read_unlock();
+ scoped_guard(rcu)
+ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++;
ob->on_partial_list = true;
c->open_buckets_partial[c->open_buckets_partial_nr++] =
@@ -180,11 +178,11 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
}
static inline bool may_alloc_bucket(struct bch_fs *c,
- struct bpos bucket,
- struct bucket_alloc_state *s)
+ struct alloc_request *req,
+ struct bpos bucket)
{
if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) {
- s->skipped_open++;
+ req->counters.skipped_open++;
return false;
}
@@ -193,48 +191,49 @@ static inline bool may_alloc_bucket(struct bch_fs *c,
bucket.inode, bucket.offset);
if (journal_seq_ready > c->journal.flushed_seq_ondisk) {
if (journal_seq_ready > c->journal.flushing_seq)
- s->need_journal_commit++;
- s->skipped_need_journal_commit++;
+ req->counters.need_journal_commit++;
+ req->counters.skipped_need_journal_commit++;
return false;
}
if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) {
- s->skipped_nocow++;
+ req->counters.skipped_nocow++;
return false;
}
return true;
}
-static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+static struct open_bucket *__try_alloc_bucket(struct bch_fs *c,
+ struct alloc_request *req,
u64 bucket, u8 gen,
- enum bch_watermark watermark,
- struct bucket_alloc_state *s,
struct closure *cl)
{
+ struct bch_dev *ca = req->ca;
+
if (unlikely(is_superblock_bucket(c, ca, bucket)))
return NULL;
if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
- s->skipped_nouse++;
+ req->counters.skipped_nouse++;
return NULL;
}
spin_lock(&c->freelist_lock);
- if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) {
+ if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(req->watermark))) {
if (cl)
closure_wait(&c->open_buckets_wait, cl);
track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
spin_unlock(&c->freelist_lock);
- return ERR_PTR(-BCH_ERR_open_buckets_empty);
+ return ERR_PTR(bch_err_throw(c, open_buckets_empty));
}
/* Recheck under lock: */
if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
spin_unlock(&c->freelist_lock);
- s->skipped_open++;
+ req->counters.skipped_open++;
return NULL;
}
@@ -258,16 +257,15 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
return ob;
}
-static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
- enum bch_watermark watermark,
- struct bucket_alloc_state *s,
+static struct open_bucket *try_alloc_bucket(struct btree_trans *trans,
+ struct alloc_request *req,
struct btree_iter *freespace_iter,
struct closure *cl)
{
struct bch_fs *c = trans->c;
u64 b = freespace_iter->pos.offset & ~(~0ULL << 56);
- if (!may_alloc_bucket(c, POS(ca->dev_idx, b), s))
+ if (!may_alloc_bucket(c, req, POS(req->ca->dev_idx, b)))
return NULL;
u8 gen;
@@ -277,7 +275,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
if (ret)
return NULL;
- return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl);
+ return __try_alloc_bucket(c, req, b, gen, cl);
}
/*
@@ -285,17 +283,16 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
*/
static noinline struct open_bucket *
bch2_bucket_alloc_early(struct btree_trans *trans,
- struct bch_dev *ca,
- enum bch_watermark watermark,
- struct bucket_alloc_state *s,
+ struct alloc_request *req,
struct closure *cl)
{
struct bch_fs *c = trans->c;
+ struct bch_dev *ca = req->ca;
struct btree_iter iter, citer;
struct bkey_s_c k, ck;
struct open_bucket *ob = NULL;
u64 first_bucket = ca->mi.first_bucket;
- u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
+ u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap];
u64 alloc_start = max(first_bucket, *dev_alloc_cursor);
u64 alloc_cursor = alloc_start;
int ret;
@@ -317,10 +314,10 @@ again:
if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
break;
- if (s->btree_bitmap != BTREE_BITMAP_ANY &&
- s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
+ if (req->btree_bitmap != BTREE_BITMAP_ANY &&
+ req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
- if (s->btree_bitmap == BTREE_BITMAP_YES &&
+ if (req->btree_bitmap == BTREE_BITMAP_YES &&
bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
break;
@@ -328,8 +325,8 @@ again:
round_up(bucket_to_sector(ca, bucket) + 1,
1ULL << ca->mi.btree_bitmap_shift));
bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, bucket));
- s->buckets_seen++;
- s->skipped_mi_btree_bitmap++;
+ req->counters.buckets_seen++;
+ req->counters.skipped_mi_btree_bitmap++;
continue;
}
@@ -348,11 +345,10 @@ again:
if (a->data_type != BCH_DATA_free)
goto next;
- s->buckets_seen++;
+ req->counters.buckets_seen++;
- ob = may_alloc_bucket(c, k.k->p, s)
- ? __try_alloc_bucket(c, ca, k.k->p.offset, a->gen,
- watermark, s, cl)
+ ob = may_alloc_bucket(c, req, k.k->p)
+ ? __try_alloc_bucket(c, req, k.k->p.offset, a->gen, cl)
: NULL;
next:
bch2_set_btree_iter_dontneed(trans, &citer);
@@ -378,15 +374,14 @@ next:
}
static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
- struct bch_dev *ca,
- enum bch_watermark watermark,
- struct bucket_alloc_state *s,
- struct closure *cl)
+ struct alloc_request *req,
+ struct closure *cl)
{
+ struct bch_dev *ca = req->ca;
struct btree_iter iter;
struct bkey_s_c k;
struct open_bucket *ob = NULL;
- u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
+ u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap];
u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor));
u64 alloc_cursor = alloc_start;
int ret;
@@ -402,13 +397,13 @@ again:
iter.k.size = iter.k.p.offset - iter.pos.offset;
while (iter.k.size) {
- s->buckets_seen++;
+ req->counters.buckets_seen++;
u64 bucket = iter.pos.offset & ~(~0ULL << 56);
- if (s->btree_bitmap != BTREE_BITMAP_ANY &&
- s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
+ if (req->btree_bitmap != BTREE_BITMAP_ANY &&
+ req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
- if (s->btree_bitmap == BTREE_BITMAP_YES &&
+ if (req->btree_bitmap == BTREE_BITMAP_YES &&
bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
goto fail;
@@ -418,11 +413,11 @@ again:
alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56));
bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, alloc_cursor));
- s->skipped_mi_btree_bitmap++;
+ req->counters.skipped_mi_btree_bitmap++;
goto next;
}
- ob = try_alloc_bucket(trans, ca, watermark, s, &iter, cl);
+ ob = try_alloc_bucket(trans, req, &iter, cl);
if (ob) {
if (!IS_ERR(ob))
*dev_alloc_cursor = iter.pos.offset;
@@ -453,33 +448,30 @@ fail:
return ob;
}
-static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca,
- enum bch_watermark watermark,
- enum bch_data_type data_type,
+static noinline void trace_bucket_alloc2(struct bch_fs *c,
+ struct alloc_request *req,
struct closure *cl,
- struct bch_dev_usage *usage,
- struct bucket_alloc_state *s,
struct open_bucket *ob)
{
struct printbuf buf = PRINTBUF;
printbuf_tabstop_push(&buf, 24);
- prt_printf(&buf, "dev\t%s (%u)\n", ca->name, ca->dev_idx);
- prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]);
- prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]);
+ prt_printf(&buf, "dev\t%s (%u)\n", req->ca->name, req->ca->dev_idx);
+ prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[req->watermark]);
+ prt_printf(&buf, "data type\t%s\n", __bch2_data_types[req->data_type]);
prt_printf(&buf, "blocking\t%u\n", cl != NULL);
- prt_printf(&buf, "free\t%llu\n", usage->buckets[BCH_DATA_free]);
- prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, *usage, watermark));
- prt_printf(&buf, "copygc_wait\t%lu/%lli\n",
+ prt_printf(&buf, "free\t%llu\n", req->usage.buckets[BCH_DATA_free]);
+ prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(req->ca, req->usage, req->watermark));
+ prt_printf(&buf, "copygc_wait\t%llu/%lli\n",
bch2_copygc_wait_amount(c),
c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now));
- prt_printf(&buf, "seen\t%llu\n", s->buckets_seen);
- prt_printf(&buf, "open\t%llu\n", s->skipped_open);
- prt_printf(&buf, "need journal commit\t%llu\n", s->skipped_need_journal_commit);
- prt_printf(&buf, "nocow\t%llu\n", s->skipped_nocow);
- prt_printf(&buf, "nouse\t%llu\n", s->skipped_nouse);
- prt_printf(&buf, "mi_btree_bitmap\t%llu\n", s->skipped_mi_btree_bitmap);
+ prt_printf(&buf, "seen\t%llu\n", req->counters.buckets_seen);
+ prt_printf(&buf, "open\t%llu\n", req->counters.skipped_open);
+ prt_printf(&buf, "need journal commit\t%llu\n", req->counters.skipped_need_journal_commit);
+ prt_printf(&buf, "nocow\t%llu\n", req->counters.skipped_nocow);
+ prt_printf(&buf, "nouse\t%llu\n", req->counters.skipped_nouse);
+ prt_printf(&buf, "mi_btree_bitmap\t%llu\n", req->counters.skipped_mi_btree_bitmap);
if (!IS_ERR(ob)) {
prt_printf(&buf, "allocated\t%llu\n", ob->bucket);
@@ -495,47 +487,42 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca,
/**
* bch2_bucket_alloc_trans - allocate a single bucket from a specific device
* @trans: transaction object
- * @ca: device to allocate from
- * @watermark: how important is this allocation?
- * @data_type: BCH_DATA_journal, btree, user...
+ * @req: state for the entire allocation
* @cl: if not NULL, closure to be used to wait if buckets not available
* @nowait: if true, do not wait for buckets to become available
- * @usage: for secondarily also returning the current device usage
*
* Returns: an open_bucket on success, or an ERR_PTR() on failure.
*/
static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
- struct bch_dev *ca,
- enum bch_watermark watermark,
- enum bch_data_type data_type,
- struct closure *cl,
- bool nowait,
- struct bch_dev_usage *usage)
+ struct alloc_request *req,
+ struct closure *cl,
+ bool nowait)
{
struct bch_fs *c = trans->c;
+ struct bch_dev *ca = req->ca;
struct open_bucket *ob = NULL;
bool freespace = READ_ONCE(ca->mi.freespace_initialized);
u64 avail;
- struct bucket_alloc_state s = {
- .btree_bitmap = data_type == BCH_DATA_btree,
- };
bool waiting = nowait;
+
+ req->btree_bitmap = req->data_type == BCH_DATA_btree;
+ memset(&req->counters, 0, sizeof(req->counters));
again:
- bch2_dev_usage_read_fast(ca, usage);
- avail = dev_buckets_free(ca, *usage, watermark);
+ bch2_dev_usage_read_fast(ca, &req->usage);
+ avail = dev_buckets_free(ca, req->usage, req->watermark);
- if (usage->buckets[BCH_DATA_need_discard] > avail)
+ if (req->usage.buckets[BCH_DATA_need_discard] > avail)
bch2_dev_do_discards(ca);
- if (usage->buckets[BCH_DATA_need_gc_gens] > avail)
+ if (req->usage.buckets[BCH_DATA_need_gc_gens] > avail)
bch2_gc_gens_async(c);
- if (should_invalidate_buckets(ca, *usage))
+ if (should_invalidate_buckets(ca, req->usage))
bch2_dev_do_invalidates(ca);
if (!avail) {
- if (watermark > BCH_WATERMARK_normal &&
- c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
+ if (req->watermark > BCH_WATERMARK_normal &&
+ c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations)
goto alloc;
if (cl && !waiting) {
@@ -546,7 +533,7 @@ again:
track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
- ob = ERR_PTR(-BCH_ERR_freelist_empty);
+ ob = ERR_PTR(bch_err_throw(c, freelist_empty));
goto err;
}
@@ -554,27 +541,27 @@ again:
closure_wake_up(&c->freelist_wait);
alloc:
ob = likely(freespace)
- ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl)
- : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl);
+ ? bch2_bucket_alloc_freelist(trans, req, cl)
+ : bch2_bucket_alloc_early(trans, req, cl);
- if (s.need_journal_commit * 2 > avail)
+ if (req->counters.need_journal_commit * 2 > avail)
bch2_journal_flush_async(&c->journal, NULL);
- if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) {
- s.btree_bitmap = BTREE_BITMAP_ANY;
+ if (!ob && req->btree_bitmap != BTREE_BITMAP_ANY) {
+ req->btree_bitmap = BTREE_BITMAP_ANY;
goto alloc;
}
- if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
+ if (!ob && freespace && c->recovery.pass_done < BCH_RECOVERY_PASS_check_alloc_info) {
freespace = false;
goto alloc;
}
err:
if (!ob)
- ob = ERR_PTR(-BCH_ERR_no_buckets_found);
+ ob = ERR_PTR(bch_err_throw(c, no_buckets_found));
if (!IS_ERR(ob))
- ob->data_type = data_type;
+ ob->data_type = req->data_type;
if (!IS_ERR(ob))
count_event(c, bucket_alloc);
@@ -584,7 +571,7 @@ err:
if (!IS_ERR(ob)
? trace_bucket_alloc_enabled()
: trace_bucket_alloc_fail_enabled())
- trace_bucket_alloc2(c, ca, watermark, data_type, cl, usage, &s, ob);
+ trace_bucket_alloc2(c, req, cl, ob);
return ob;
}
@@ -594,12 +581,15 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
enum bch_data_type data_type,
struct closure *cl)
{
- struct bch_dev_usage usage;
struct open_bucket *ob;
+ struct alloc_request req = {
+ .watermark = watermark,
+ .data_type = data_type,
+ .ca = ca,
+ };
bch2_trans_do(c,
- PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
- data_type, cl, false, &usage)));
+ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, cl, false)));
return ob;
}
@@ -611,18 +601,18 @@ static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
-struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
- struct dev_stripe_state *stripe,
- struct bch_devs_mask *devs)
+void bch2_dev_alloc_list(struct bch_fs *c,
+ struct dev_stripe_state *stripe,
+ struct bch_devs_mask *devs,
+ struct dev_alloc_list *ret)
{
- struct dev_alloc_list ret = { .nr = 0 };
- unsigned i;
+ ret->nr = 0;
+ unsigned i;
for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
- ret.data[ret.nr++] = i;
+ ret->data[ret->nr++] = i;
- bubble_sort(ret.data, ret.nr, dev_stripe_cmp);
- return ret;
+ bubble_sort(ret->data, ret->nr, dev_stripe_cmp);
}
static const u64 stripe_clock_hand_rescale = 1ULL << 62; /* trigger rescale at */
@@ -693,64 +683,53 @@ void bch2_dev_stripe_increment(struct bch_dev *ca,
}
static int add_new_bucket(struct bch_fs *c,
- struct open_buckets *ptrs,
- struct bch_devs_mask *devs_may_alloc,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache,
- struct open_bucket *ob)
+ struct alloc_request *req,
+ struct open_bucket *ob)
{
unsigned durability = ob_dev(c, ob)->mi.durability;
- BUG_ON(*nr_effective >= nr_replicas);
+ BUG_ON(req->nr_effective >= req->nr_replicas);
- __clear_bit(ob->dev, devs_may_alloc->d);
- *nr_effective += durability;
- *have_cache |= !durability;
+ __clear_bit(ob->dev, req->devs_may_alloc.d);
+ req->nr_effective += durability;
+ req->have_cache |= !durability;
- ob_push(c, ptrs, ob);
+ ob_push(c, &req->ptrs, ob);
- if (*nr_effective >= nr_replicas)
+ if (req->nr_effective >= req->nr_replicas)
return 1;
if (ob->ec)
return 1;
return 0;
}
-int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
- struct open_buckets *ptrs,
- struct dev_stripe_state *stripe,
- struct bch_devs_mask *devs_may_alloc,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache,
- enum bch_write_flags flags,
- enum bch_data_type data_type,
- enum bch_watermark watermark,
- struct closure *cl)
+inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
+ struct alloc_request *req,
+ struct dev_stripe_state *stripe,
+ struct closure *cl)
{
struct bch_fs *c = trans->c;
- int ret = -BCH_ERR_insufficient_devices;
+ int ret = 0;
+
+ BUG_ON(req->nr_effective >= req->nr_replicas);
- BUG_ON(*nr_effective >= nr_replicas);
+ bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc, &req->devs_sorted);
- struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc);
- darray_for_each(devs_sorted, i) {
- struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i);
- if (!ca)
+ darray_for_each(req->devs_sorted, i) {
+ req->ca = bch2_dev_tryget_noerror(c, *i);
+ if (!req->ca)
continue;
- if (!ca->mi.durability && *have_cache) {
- bch2_dev_put(ca);
+ if (!req->ca->mi.durability && req->have_cache) {
+ bch2_dev_put(req->ca);
continue;
}
- struct bch_dev_usage usage;
- struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type,
- cl, flags & BCH_WRITE_alloc_nowait, &usage);
+ struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, cl,
+ req->flags & BCH_WRITE_alloc_nowait);
if (!IS_ERR(ob))
- bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
- bch2_dev_put(ca);
+ bch2_dev_stripe_increment_inlined(req->ca, stripe, &req->usage);
+ bch2_dev_put(req->ca);
if (IS_ERR(ob)) {
ret = PTR_ERR(ob);
@@ -759,15 +738,16 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
continue;
}
- if (add_new_bucket(c, ptrs, devs_may_alloc,
- nr_replicas, nr_effective,
- have_cache, ob)) {
- ret = 0;
+ ret = add_new_bucket(c, req, ob);
+ if (ret)
break;
- }
}
- return ret;
+ if (ret == 1)
+ return 0;
+ if (ret)
+ return ret;
+ return bch_err_throw(c, insufficient_devices);
}
/* Allocate from stripes: */
@@ -779,35 +759,28 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
*/
static int bucket_alloc_from_stripe(struct btree_trans *trans,
- struct open_buckets *ptrs,
- struct write_point *wp,
- struct bch_devs_mask *devs_may_alloc,
- u16 target,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache,
- enum bch_watermark watermark,
- enum bch_write_flags flags,
- struct closure *cl)
+ struct alloc_request *req,
+ struct closure *cl)
{
struct bch_fs *c = trans->c;
int ret = 0;
- if (nr_replicas < 2)
+ if (req->nr_replicas < 2)
return 0;
- if (ec_open_bucket(c, ptrs))
+ if (ec_open_bucket(c, &req->ptrs))
return 0;
struct ec_stripe_head *h =
- bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
+ bch2_ec_stripe_head_get(trans, req, 0, cl);
if (IS_ERR(h))
return PTR_ERR(h);
if (!h)
return 0;
- struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
- darray_for_each(devs_sorted, i)
+ bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc, &req->devs_sorted);
+
+ darray_for_each(req->devs_sorted, i)
for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
if (!h->s->blocks[ec_idx])
continue;
@@ -818,9 +791,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
ob->ec = h->s;
ec_stripe_new_get(h->s, STRIPE_REF_io);
- ret = add_new_bucket(c, ptrs, devs_may_alloc,
- nr_replicas, nr_effective,
- have_cache, ob);
+ ret = add_new_bucket(c, req, ob);
goto out;
}
}
@@ -832,65 +803,49 @@ out:
/* Sector allocator */
static bool want_bucket(struct bch_fs *c,
- struct write_point *wp,
- struct bch_devs_mask *devs_may_alloc,
- bool *have_cache, bool ec,
+ struct alloc_request *req,
struct open_bucket *ob)
{
struct bch_dev *ca = ob_dev(c, ob);
- if (!test_bit(ob->dev, devs_may_alloc->d))
+ if (!test_bit(ob->dev, req->devs_may_alloc.d))
return false;
- if (ob->data_type != wp->data_type)
+ if (ob->data_type != req->wp->data_type)
return false;
if (!ca->mi.durability &&
- (wp->data_type == BCH_DATA_btree || ec || *have_cache))
+ (req->wp->data_type == BCH_DATA_btree || req->ec || req->have_cache))
return false;
- if (ec != (ob->ec != NULL))
+ if (req->ec != (ob->ec != NULL))
return false;
return true;
}
static int bucket_alloc_set_writepoint(struct bch_fs *c,
- struct open_buckets *ptrs,
- struct write_point *wp,
- struct bch_devs_mask *devs_may_alloc,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache,
- bool ec)
+ struct alloc_request *req)
{
- struct open_buckets ptrs_skip = { .nr = 0 };
struct open_bucket *ob;
unsigned i;
int ret = 0;
- open_bucket_for_each(c, &wp->ptrs, ob, i) {
- if (!ret && want_bucket(c, wp, devs_may_alloc,
- have_cache, ec, ob))
- ret = add_new_bucket(c, ptrs, devs_may_alloc,
- nr_replicas, nr_effective,
- have_cache, ob);
+ req->scratch_ptrs.nr = 0;
+
+ open_bucket_for_each(c, &req->wp->ptrs, ob, i) {
+ if (!ret && want_bucket(c, req, ob))
+ ret = add_new_bucket(c, req, ob);
else
- ob_push(c, &ptrs_skip, ob);
+ ob_push(c, &req->scratch_ptrs, ob);
}
- wp->ptrs = ptrs_skip;
+ req->wp->ptrs = req->scratch_ptrs;
return ret;
}
static int bucket_alloc_set_partial(struct bch_fs *c,
- struct open_buckets *ptrs,
- struct write_point *wp,
- struct bch_devs_mask *devs_may_alloc,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache, bool ec,
- enum bch_watermark watermark)
+ struct alloc_request *req)
{
int i, ret = 0;
@@ -905,13 +860,12 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
- if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
+ if (want_bucket(c, req, ob)) {
struct bch_dev *ca = ob_dev(c, ob);
- struct bch_dev_usage usage;
u64 avail;
- bch2_dev_usage_read_fast(ca, &usage);
- avail = dev_buckets_free(ca, usage, watermark) + ca->nr_partial_buckets;
+ bch2_dev_usage_read_fast(ca, &req->usage);
+ avail = dev_buckets_free(ca, req->usage, req->watermark) + ca->nr_partial_buckets;
if (!avail)
continue;
@@ -920,13 +874,10 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
i);
ob->on_partial_list = false;
- rcu_read_lock();
- bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
- rcu_read_unlock();
+ scoped_guard(rcu)
+ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
- ret = add_new_bucket(c, ptrs, devs_may_alloc,
- nr_replicas, nr_effective,
- have_cache, ob);
+ ret = add_new_bucket(c, req, ob);
if (ret)
break;
}
@@ -937,61 +888,41 @@ unlock:
}
static int __open_bucket_add_buckets(struct btree_trans *trans,
- struct open_buckets *ptrs,
- struct write_point *wp,
- struct bch_devs_list *devs_have,
- u16 target,
- bool erasure_code,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache,
- enum bch_watermark watermark,
- enum bch_write_flags flags,
- struct closure *_cl)
+ struct alloc_request *req,
+ struct closure *_cl)
{
struct bch_fs *c = trans->c;
- struct bch_devs_mask devs;
struct open_bucket *ob;
struct closure *cl = NULL;
unsigned i;
int ret;
- devs = target_rw_devs(c, wp->data_type, target);
+ req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target);
/* Don't allocate from devices we already have pointers to: */
- darray_for_each(*devs_have, i)
- __clear_bit(*i, devs.d);
+ darray_for_each(*req->devs_have, i)
+ __clear_bit(*i, req->devs_may_alloc.d);
- open_bucket_for_each(c, ptrs, ob, i)
- __clear_bit(ob->dev, devs.d);
+ open_bucket_for_each(c, &req->ptrs, ob, i)
+ __clear_bit(ob->dev, req->devs_may_alloc.d);
- ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
- nr_replicas, nr_effective,
- have_cache, erasure_code);
+ ret = bucket_alloc_set_writepoint(c, req);
if (ret)
return ret;
- ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
- nr_replicas, nr_effective,
- have_cache, erasure_code, watermark);
+ ret = bucket_alloc_set_partial(c, req);
if (ret)
return ret;
- if (erasure_code) {
- ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
- target,
- nr_replicas, nr_effective,
- have_cache,
- watermark, flags, _cl);
+ if (req->ec) {
+ ret = bucket_alloc_from_stripe(trans, req, _cl);
} else {
retry_blocking:
/*
* Try nonblocking first, so that if one device is full we'll try from
* other devices:
*/
- ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
- nr_replicas, nr_effective, have_cache,
- flags, wp->data_type, watermark, cl);
+ ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, cl);
if (ret &&
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
!bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
@@ -1005,38 +936,27 @@ retry_blocking:
}
static int open_bucket_add_buckets(struct btree_trans *trans,
- struct open_buckets *ptrs,
- struct write_point *wp,
- struct bch_devs_list *devs_have,
- u16 target,
- unsigned erasure_code,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache,
- enum bch_watermark watermark,
- enum bch_write_flags flags,
- struct closure *cl)
+ struct alloc_request *req,
+ struct closure *cl)
{
int ret;
- if (erasure_code && !ec_open_bucket(trans->c, ptrs)) {
- ret = __open_bucket_add_buckets(trans, ptrs, wp,
- devs_have, target, erasure_code,
- nr_replicas, nr_effective, have_cache,
- watermark, flags, cl);
+ if (req->ec && !ec_open_bucket(trans->c, &req->ptrs)) {
+ ret = __open_bucket_add_buckets(trans, req, cl);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
return ret;
- if (*nr_effective >= nr_replicas)
+ if (req->nr_effective >= req->nr_replicas)
return 0;
}
- ret = __open_bucket_add_buckets(trans, ptrs, wp,
- devs_have, target, false,
- nr_replicas, nr_effective, have_cache,
- watermark, flags, cl);
+ bool ec = false;
+ swap(ec, req->ec);
+ ret = __open_bucket_add_buckets(trans, req, cl);
+ swap(ec, req->ec);
+
return ret < 0 ? ret : 0;
}
@@ -1137,9 +1057,8 @@ void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
ob->on_partial_list = false;
- rcu_read_lock();
- bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
- rcu_read_unlock();
+ scoped_guard(rcu)
+ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
spin_unlock(&c->freelist_lock);
bch2_open_bucket_put(c, ob);
@@ -1167,14 +1086,11 @@ static struct write_point *__writepoint_find(struct hlist_head *head,
{
struct write_point *wp;
- rcu_read_lock();
+ guard(rcu)();
hlist_for_each_entry_rcu(wp, head, node)
if (wp->write_point == write_point)
- goto out;
- wp = NULL;
-out:
- rcu_read_unlock();
- return wp;
+ return wp;
+ return NULL;
}
static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
@@ -1185,7 +1101,7 @@ static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
return stranded * factor > free;
}
-static bool try_increase_writepoints(struct bch_fs *c)
+static noinline bool try_increase_writepoints(struct bch_fs *c)
{
struct write_point *wp;
@@ -1198,7 +1114,7 @@ static bool try_increase_writepoints(struct bch_fs *c)
return true;
}
-static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
+static noinline bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
{
struct bch_fs *c = trans->c;
struct write_point *wp;
@@ -1289,26 +1205,26 @@ out:
static noinline void
deallocate_extra_replicas(struct bch_fs *c,
- struct open_buckets *ptrs,
- struct open_buckets *ptrs_no_use,
- unsigned extra_replicas)
+ struct alloc_request *req)
{
- struct open_buckets ptrs2 = { 0 };
struct open_bucket *ob;
+ unsigned extra_replicas = req->nr_effective - req->nr_replicas;
unsigned i;
- open_bucket_for_each(c, ptrs, ob, i) {
+ req->scratch_ptrs.nr = 0;
+
+ open_bucket_for_each(c, &req->ptrs, ob, i) {
unsigned d = ob_dev(c, ob)->mi.durability;
if (d && d <= extra_replicas) {
extra_replicas -= d;
- ob_push(c, ptrs_no_use, ob);
+ ob_push(c, &req->wp->ptrs, ob);
} else {
- ob_push(c, &ptrs2, ob);
+ ob_push(c, &req->scratch_ptrs, ob);
}
}
- *ptrs = ptrs2;
+ req->ptrs = req->scratch_ptrs;
}
/*
@@ -1327,51 +1243,53 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
struct write_point **wp_ret)
{
struct bch_fs *c = trans->c;
- struct write_point *wp;
struct open_bucket *ob;
- struct open_buckets ptrs;
- unsigned nr_effective, write_points_nr;
- bool have_cache;
- int ret;
+ unsigned write_points_nr;
int i;
+ struct alloc_request *req = bch2_trans_kmalloc_nomemzero(trans, sizeof(*req));
+ int ret = PTR_ERR_OR_ZERO(req);
+ if (unlikely(ret))
+ return ret;
+
if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
erasure_code = false;
+ req->nr_replicas = nr_replicas;
+ req->target = target;
+ req->ec = erasure_code;
+ req->watermark = watermark;
+ req->flags = flags;
+ req->devs_have = devs_have;
+
BUG_ON(!nr_replicas || !nr_replicas_required);
retry:
- ptrs.nr = 0;
- nr_effective = 0;
- write_points_nr = c->write_points_nr;
- have_cache = false;
+ req->ptrs.nr = 0;
+ req->nr_effective = 0;
+ req->have_cache = false;
+ write_points_nr = c->write_points_nr;
+
+ *wp_ret = req->wp = writepoint_find(trans, write_point.v);
- *wp_ret = wp = writepoint_find(trans, write_point.v);
+ req->data_type = req->wp->data_type;
ret = bch2_trans_relock(trans);
if (ret)
goto err;
/* metadata may not allocate on cache devices: */
- if (wp->data_type != BCH_DATA_user)
- have_cache = true;
+ if (req->data_type != BCH_DATA_user)
+ req->have_cache = true;
if (target && !(flags & BCH_WRITE_only_specified_devs)) {
- ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
- target, erasure_code,
- nr_replicas, &nr_effective,
- &have_cache, watermark,
- flags, NULL);
+ ret = open_bucket_add_buckets(trans, req, NULL);
if (!ret ||
bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto alloc_done;
/* Don't retry from all devices if we're out of open buckets: */
if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
- int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
- target, erasure_code,
- nr_replicas, &nr_effective,
- &have_cache, watermark,
- flags, cl);
+ int ret2 = open_bucket_add_buckets(trans, req, cl);
if (!ret2 ||
bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
@@ -1384,45 +1302,38 @@ retry:
* Only try to allocate cache (durability = 0 devices) from the
* specified target:
*/
- have_cache = true;
+ req->have_cache = true;
+ req->target = 0;
- ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
- 0, erasure_code,
- nr_replicas, &nr_effective,
- &have_cache, watermark,
- flags, cl);
+ ret = open_bucket_add_buckets(trans, req, cl);
} else {
- ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
- target, erasure_code,
- nr_replicas, &nr_effective,
- &have_cache, watermark,
- flags, cl);
+ ret = open_bucket_add_buckets(trans, req, cl);
}
alloc_done:
- BUG_ON(!ret && nr_effective < nr_replicas);
+ BUG_ON(!ret && req->nr_effective < req->nr_replicas);
- if (erasure_code && !ec_open_bucket(c, &ptrs))
+ if (erasure_code && !ec_open_bucket(c, &req->ptrs))
pr_debug("failed to get ec bucket: ret %u", ret);
if (ret == -BCH_ERR_insufficient_devices &&
- nr_effective >= nr_replicas_required)
+ req->nr_effective >= nr_replicas_required)
ret = 0;
if (ret)
goto err;
- if (nr_effective > nr_replicas)
- deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
+ if (req->nr_effective > req->nr_replicas)
+ deallocate_extra_replicas(c, req);
/* Free buckets we didn't use: */
- open_bucket_for_each(c, &wp->ptrs, ob, i)
+ open_bucket_for_each(c, &req->wp->ptrs, ob, i)
open_bucket_free_unused(c, ob);
- wp->ptrs = ptrs;
+ req->wp->ptrs = req->ptrs;
- wp->sectors_free = UINT_MAX;
+ req->wp->sectors_free = UINT_MAX;
- open_bucket_for_each(c, &wp->ptrs, ob, i) {
+ open_bucket_for_each(c, &req->wp->ptrs, ob, i) {
/*
* Ensure proper write alignment - either due to misaligned
* bucket sizes (from buggy bcachefs-tools), or writes that mix
@@ -1436,58 +1347,44 @@ alloc_done:
ob->sectors_free = max_t(int, 0, ob->sectors_free - align);
- wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
+ req->wp->sectors_free = min(req->wp->sectors_free, ob->sectors_free);
}
- wp->sectors_free = rounddown(wp->sectors_free, block_sectors(c));
+ req->wp->sectors_free = rounddown(req->wp->sectors_free, block_sectors(c));
/* Did alignment use up space in an open_bucket? */
- if (unlikely(!wp->sectors_free)) {
- bch2_alloc_sectors_done(c, wp);
+ if (unlikely(!req->wp->sectors_free)) {
+ bch2_alloc_sectors_done(c, req->wp);
goto retry;
}
- BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
+ BUG_ON(!req->wp->sectors_free || req->wp->sectors_free == UINT_MAX);
return 0;
err:
- open_bucket_for_each(c, &wp->ptrs, ob, i)
- if (ptrs.nr < ARRAY_SIZE(ptrs.v))
- ob_push(c, &ptrs, ob);
+ open_bucket_for_each(c, &req->wp->ptrs, ob, i)
+ if (req->ptrs.nr < ARRAY_SIZE(req->ptrs.v))
+ ob_push(c, &req->ptrs, ob);
else
open_bucket_free_unused(c, ob);
- wp->ptrs = ptrs;
+ req->wp->ptrs = req->ptrs;
- mutex_unlock(&wp->lock);
+ mutex_unlock(&req->wp->lock);
if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
try_decrease_writepoints(trans, write_points_nr))
goto retry;
if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
- ret = -BCH_ERR_bucket_alloc_blocked;
+ ret = bch_err_throw(c, bucket_alloc_blocked);
if (cl && !(flags & BCH_WRITE_alloc_nowait) &&
bch2_err_matches(ret, BCH_ERR_freelist_empty))
- ret = -BCH_ERR_bucket_alloc_blocked;
+ ret = bch_err_throw(c, bucket_alloc_blocked);
return ret;
}
-struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
-{
- struct bch_dev *ca = ob_dev(c, ob);
-
- return (struct bch_extent_ptr) {
- .type = 1 << BCH_EXTENT_ENTRY_ptr,
- .gen = ob->gen,
- .dev = ob->dev,
- .offset = bucket_to_sector(ca, ob->bucket) +
- ca->mi.bucket_size -
- ob->sectors_free,
- };
-}
-
void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
struct bkey_i *k, unsigned sectors,
bool cached)
@@ -1617,6 +1514,8 @@ static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
struct open_bucket *ob;
unsigned i;
+ mutex_lock(&wp->lock);
+
prt_printf(out, "%lu: ", wp->write_point);
prt_human_readable_u64(out, wp->sectors_allocated << 9);
@@ -1634,6 +1533,8 @@ static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
open_bucket_for_each(c, &wp->ptrs, ob, i)
bch2_open_bucket_to_text(out, c, ob);
printbuf_indent_sub(out, 2);
+
+ mutex_unlock(&wp->lock);
}
void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
@@ -1731,13 +1632,18 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
printbuf_indent_sub(&buf, 2);
prt_newline(&buf);
- for_each_online_member(c, ca) {
- prt_printf(&buf, "Dev %u:\n", ca->dev_idx);
- printbuf_indent_add(&buf, 2);
- bch2_dev_alloc_debug_to_text(&buf, ca);
- printbuf_indent_sub(&buf, 2);
- prt_newline(&buf);
- }
+ bch2_printbuf_make_room(&buf, 4096);
+
+ buf.atomic++;
+ scoped_guard(rcu)
+ for_each_online_member_rcu(c, ca) {
+ prt_printf(&buf, "Dev %u:\n", ca->dev_idx);
+ printbuf_indent_add(&buf, 2);
+ bch2_dev_alloc_debug_to_text(&buf, ca);
+ printbuf_indent_sub(&buf, 2);
+ prt_newline(&buf);
+ }
+ --buf.atomic;
prt_printf(&buf, "Copygc debug:\n");
printbuf_indent_add(&buf, 2);
@@ -1750,7 +1656,7 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
bch2_journal_debug_to_text(&buf, &c->journal);
printbuf_indent_sub(&buf, 2);
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
}
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index 4c1e33cf57c0..1b3fc8460096 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -3,8 +3,10 @@
#define _BCACHEFS_ALLOC_FOREGROUND_H
#include "bcachefs.h"
+#include "buckets.h"
#include "alloc_types.h"
#include "extents.h"
+#include "io_write_types.h"
#include "sb-members.h"
#include <linux/hash.h>
@@ -23,9 +25,57 @@ struct dev_alloc_list {
u8 data[BCH_SB_MEMBERS_MAX];
};
-struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
- struct dev_stripe_state *,
- struct bch_devs_mask *);
+struct alloc_request {
+ unsigned nr_replicas;
+ unsigned target;
+ bool ec;
+ enum bch_watermark watermark;
+ enum bch_write_flags flags;
+ enum bch_data_type data_type;
+ struct bch_devs_list *devs_have;
+ struct write_point *wp;
+
+ /* These fields are used primarily by open_bucket_add_buckets */
+ struct open_buckets ptrs;
+ unsigned nr_effective; /* sum of @ptrs durability */
+ bool have_cache; /* have we allocated from a 0 durability dev */
+ struct bch_devs_mask devs_may_alloc;
+
+ /* bch2_bucket_alloc_set_trans(): */
+ struct dev_alloc_list devs_sorted;
+ struct bch_dev_usage usage;
+
+ /* bch2_bucket_alloc_trans(): */
+ struct bch_dev *ca;
+
+ enum {
+ BTREE_BITMAP_NO,
+ BTREE_BITMAP_YES,
+ BTREE_BITMAP_ANY,
+ } btree_bitmap;
+
+ struct {
+ u64 buckets_seen;
+ u64 skipped_open;
+ u64 skipped_need_journal_commit;
+ u64 need_journal_commit;
+ u64 skipped_nocow;
+ u64 skipped_nouse;
+ u64 skipped_mi_btree_bitmap;
+ } counters;
+
+ unsigned scratch_nr_replicas;
+ unsigned scratch_nr_effective;
+ bool scratch_have_cache;
+ enum bch_data_type scratch_data_type;
+ struct open_buckets scratch_ptrs;
+ struct bch_devs_mask scratch_devs_may_alloc;
+};
+
+void bch2_dev_alloc_list(struct bch_fs *,
+ struct dev_stripe_state *,
+ struct bch_devs_mask *,
+ struct dev_alloc_list *);
void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *);
static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob)
@@ -173,11 +223,8 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64
}
enum bch_write_flags;
-int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *,
- struct dev_stripe_state *, struct bch_devs_mask *,
- unsigned, unsigned *, bool *, enum bch_write_flags,
- enum bch_data_type, enum bch_watermark,
- struct closure *);
+int bch2_bucket_alloc_set_trans(struct btree_trans *, struct alloc_request *,
+ struct dev_stripe_state *, struct closure *);
int bch2_alloc_sectors_start_trans(struct btree_trans *,
unsigned, unsigned,
@@ -189,7 +236,19 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *,
struct closure *,
struct write_point **);
-struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *);
+static inline struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
+{
+ struct bch_dev *ca = ob_dev(c, ob);
+
+ return (struct bch_extent_ptr) {
+ .type = 1 << BCH_EXTENT_ENTRY_ptr,
+ .gen = ob->gen,
+ .dev = ob->dev,
+ .offset = bucket_to_sector(ca, ob->bucket) +
+ ca->mi.bucket_size -
+ ob->sectors_free,
+ };
+}
/*
* Append pointers to the space we just allocated to @k, and mark @sectors space
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index 8f79f46c2a78..e7becdf22cba 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -8,22 +8,6 @@
#include "clock_types.h"
#include "fifo.h"
-struct bucket_alloc_state {
- enum {
- BTREE_BITMAP_NO,
- BTREE_BITMAP_YES,
- BTREE_BITMAP_ANY,
- } btree_bitmap;
-
- u64 buckets_seen;
- u64 skipped_open;
- u64 skipped_need_journal_commit;
- u64 need_journal_commit;
- u64 skipped_nocow;
- u64 skipped_nouse;
- u64 skipped_mi_btree_bitmap;
-};
-
#define BCH_WATERMARKS() \
x(stripe) \
x(normal) \
diff --git a/fs/bcachefs/async_objs.c b/fs/bcachefs/async_objs.c
new file mode 100644
index 000000000000..a7cd1f0f0964
--- /dev/null
+++ b/fs/bcachefs/async_objs.c
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Async obj debugging: keep asynchronous objects on (very fast) lists, make
+ * them visibile in debugfs:
+ */
+
+#include "bcachefs.h"
+#include "async_objs.h"
+#include "btree_io.h"
+#include "debug.h"
+#include "io_read.h"
+#include "io_write.h"
+
+#include <linux/debugfs.h>
+
+static void promote_obj_to_text(struct printbuf *out, void *obj)
+{
+ bch2_promote_op_to_text(out, obj);
+}
+
+static void rbio_obj_to_text(struct printbuf *out, void *obj)
+{
+ bch2_read_bio_to_text(out, obj);
+}
+
+static void write_op_obj_to_text(struct printbuf *out, void *obj)
+{
+ bch2_write_op_to_text(out, obj);
+}
+
+static void btree_read_bio_obj_to_text(struct printbuf *out, void *obj)
+{
+ struct btree_read_bio *rbio = obj;
+ bch2_btree_read_bio_to_text(out, rbio);
+}
+
+static void btree_write_bio_obj_to_text(struct printbuf *out, void *obj)
+{
+ struct btree_write_bio *wbio = obj;
+ bch2_bio_to_text(out, &wbio->wbio.bio);
+}
+
+static int bch2_async_obj_list_open(struct inode *inode, struct file *file)
+{
+ struct async_obj_list *list = inode->i_private;
+ struct dump_iter *i;
+
+ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
+ if (!i)
+ return -ENOMEM;
+
+ file->private_data = i;
+ i->from = POS_MIN;
+ i->iter = 0;
+ i->c = container_of(list, struct bch_fs, async_objs[list->idx]);
+ i->list = list;
+ i->buf = PRINTBUF;
+ return 0;
+}
+
+static ssize_t bch2_async_obj_list_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct async_obj_list *list = i->list;
+ ssize_t ret = 0;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ struct genradix_iter iter;
+ void *obj;
+ fast_list_for_each_from(&list->list, iter, obj, i->iter) {
+ ret = bch2_debugfs_flush_buf(i);
+ if (ret)
+ return ret;
+
+ if (!i->size)
+ break;
+
+ list->obj_to_text(&i->buf, obj);
+ }
+
+ if (i->buf.allocation_failure)
+ ret = -ENOMEM;
+ else
+ i->iter = iter.pos;
+
+ if (!ret)
+ ret = bch2_debugfs_flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations async_obj_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_async_obj_list_open,
+ .release = bch2_dump_release,
+ .read = bch2_async_obj_list_read,
+};
+
+void bch2_fs_async_obj_debugfs_init(struct bch_fs *c)
+{
+ c->async_obj_dir = debugfs_create_dir("async_objs", c->fs_debug_dir);
+
+#define x(n) debugfs_create_file(#n, 0400, c->async_obj_dir, \
+ &c->async_objs[BCH_ASYNC_OBJ_LIST_##n], &async_obj_ops);
+ BCH_ASYNC_OBJ_LISTS()
+#undef x
+}
+
+void bch2_fs_async_obj_exit(struct bch_fs *c)
+{
+ for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++)
+ fast_list_exit(&c->async_objs[i].list);
+}
+
+int bch2_fs_async_obj_init(struct bch_fs *c)
+{
+ for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) {
+ if (fast_list_init(&c->async_objs[i].list))
+ return -BCH_ERR_ENOMEM_async_obj_init;
+ c->async_objs[i].idx = i;
+ }
+
+#define x(n) c->async_objs[BCH_ASYNC_OBJ_LIST_##n].obj_to_text = n##_obj_to_text;
+ BCH_ASYNC_OBJ_LISTS()
+#undef x
+
+ return 0;
+}
diff --git a/fs/bcachefs/async_objs.h b/fs/bcachefs/async_objs.h
new file mode 100644
index 000000000000..cd6489b8cf76
--- /dev/null
+++ b/fs/bcachefs/async_objs.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ASYNC_OBJS_H
+#define _BCACHEFS_ASYNC_OBJS_H
+
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
+static inline void __async_object_list_del(struct fast_list *head, unsigned idx)
+{
+ fast_list_remove(head, idx);
+}
+
+static inline int __async_object_list_add(struct fast_list *head, void *obj, unsigned *idx)
+{
+ int ret = fast_list_add(head, obj);
+ *idx = ret > 0 ? ret : 0;
+ return ret < 0 ? ret : 0;
+}
+
+#define async_object_list_del(_c, _list, idx) \
+ __async_object_list_del(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, idx)
+
+#define async_object_list_add(_c, _list, obj, idx) \
+ __async_object_list_add(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, obj, idx)
+
+void bch2_fs_async_obj_debugfs_init(struct bch_fs *);
+void bch2_fs_async_obj_exit(struct bch_fs *);
+int bch2_fs_async_obj_init(struct bch_fs *);
+
+#else /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */
+
+#define async_object_list_del(_c, _n, idx) do {} while (0)
+
+static inline int __async_object_list_add(void)
+{
+ return 0;
+}
+#define async_object_list_add(_c, _n, obj, idx) __async_object_list_add()
+
+static inline void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) {}
+static inline void bch2_fs_async_obj_exit(struct bch_fs *c) {}
+static inline int bch2_fs_async_obj_init(struct bch_fs *c) { return 0; }
+
+#endif /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */
+
+#endif /* _BCACHEFS_ASYNC_OBJS_H */
diff --git a/fs/bcachefs/async_objs_types.h b/fs/bcachefs/async_objs_types.h
new file mode 100644
index 000000000000..8d713c0f5841
--- /dev/null
+++ b/fs/bcachefs/async_objs_types.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ASYNC_OBJS_TYPES_H
+#define _BCACHEFS_ASYNC_OBJS_TYPES_H
+
+#define BCH_ASYNC_OBJ_LISTS() \
+ x(promote) \
+ x(rbio) \
+ x(write_op) \
+ x(btree_read_bio) \
+ x(btree_write_bio)
+
+enum bch_async_obj_lists {
+#define x(n) BCH_ASYNC_OBJ_LIST_##n,
+ BCH_ASYNC_OBJ_LISTS()
+#undef x
+ BCH_ASYNC_OBJ_NR
+};
+
+struct async_obj_list {
+ struct fast_list list;
+ void (*obj_to_text)(struct printbuf *, void *);
+ unsigned idx;
+};
+
+#endif /* _BCACHEFS_ASYNC_OBJS_TYPES_H */
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 5f195d2280a4..e76809e71858 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -12,9 +12,20 @@
#include "disk_accounting.h"
#include "error.h"
#include "progress.h"
+#include "recovery_passes.h"
#include <linux/mm.h>
+static int bch2_bucket_bitmap_set(struct bch_dev *, struct bucket_bitmap *, u64);
+
+static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
+{
+ return (struct bbpos) {
+ .btree = bp.btree_id,
+ .pos = bp.pos,
+ };
+}
+
int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k,
struct bkey_validate_context from)
{
@@ -37,17 +48,19 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke
{
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode);
- if (ca) {
- u32 bucket_offset;
- struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset);
- rcu_read_unlock();
+ struct bch_dev *ca;
+ u32 bucket_offset;
+ struct bpos bucket;
+ scoped_guard(rcu) {
+ ca = bch2_dev_rcu_noerror(c, bp.k->p.inode);
+ if (ca)
+ bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset);
+ }
+
+ if (ca)
prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset);
- } else {
- rcu_read_unlock();
+ else
prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT);
- }
bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level);
prt_str(out, " data_type=");
@@ -96,6 +109,8 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
+ bool will_check = c->recovery.passes_to_run &
+ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
int ret = 0;
if (insert) {
@@ -110,9 +125,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
prt_printf(&buf, "for ");
bch2_bkey_val_to_text(&buf, c, orig_k);
-
- bch_err(c, "%s", buf.buf);
- } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
+ } else if (!will_check) {
prt_printf(&buf, "backpointer not found when deleting\n");
printbuf_indent_add(&buf, 2);
@@ -128,9 +141,8 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
bch2_bkey_val_to_text(&buf, c, orig_k);
}
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers &&
- __bch2_inconsistent_error(c, &buf))
- ret = -BCH_ERR_erofs_unfixed_errors;
+ if (!will_check && __bch2_inconsistent_error(c, &buf))
+ ret = bch_err_throw(c, erofs_unfixed_errors);
bch_err(c, "%s", buf.buf);
printbuf_exit(&buf);
@@ -174,7 +186,7 @@ err:
static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos)
{
- return (likely(!bch2_backpointers_no_use_write_buffer)
+ return (!static_branch_unlikely(&bch2_backpointers_no_use_write_buffer)
? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos)
: bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
@@ -184,7 +196,7 @@ static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans,
struct bkey_s_c visiting_k,
struct bkey_buf *last_flushed)
{
- return likely(!bch2_backpointers_no_use_write_buffer)
+ return !static_branch_unlikely(&bch2_backpointers_no_use_write_buffer)
? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed)
: 0;
}
@@ -283,7 +295,7 @@ static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans,
return b;
if (btree_node_will_make_reachable(b)) {
- b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
+ b = ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node));
} else {
int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key),
last_flushed, commit);
@@ -341,7 +353,7 @@ static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans,
return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
} else {
struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit);
- if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node))
+ if (b == ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node)))
return bkey_s_c_null;
if (IS_ERR_OR_NULL(b))
return ((struct bkey_s_c) { .k = ERR_CAST(b) });
@@ -478,7 +490,8 @@ found:
bytes = p.crc.compressed_size << 9;
- struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ,
+ BCH_DEV_READ_REF_check_extent_checksums);
if (!ca)
return false;
@@ -515,7 +528,8 @@ err:
if (bio)
bio_put(bio);
kvfree(data_buf);
- percpu_ref_put(&ca->io_ref[READ]);
+ enumerated_ref_put(&ca->io_ref[READ],
+ BCH_DEV_READ_REF_check_extent_checksums);
printbuf_exit(&buf);
return ret;
}
@@ -579,6 +593,7 @@ check_existing_bp:
bkey_for_each_ptr(other_extent_ptrs, ptr)
if (ptr->dev == bp->k.p.inode &&
dev_ptr_stale_rcu(ca, ptr)) {
+ rcu_read_unlock();
ret = drop_dev_and_update(trans, other_bp.v->btree_id,
other_extent, bp->k.p.inode);
if (ret)
@@ -636,7 +651,7 @@ check_existing_bp:
prt_newline(&buf);
bch2_bkey_val_to_text(&buf, c, other_extent);
bch_err(c, "%s", buf.buf);
- ret = -BCH_ERR_fsck_repair_unimplemented;
+ ret = bch_err_throw(c, fsck_repair_unimplemented);
goto err;
missing:
printbuf_reset(&buf);
@@ -667,24 +682,32 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
if (p.ptr.dev == BCH_SB_MEMBER_INVALID)
continue;
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
- bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches);
- bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty);
+ bool empty;
+ {
+ /* scoped_guard() is a loop, so it breaks continue */
+ guard(rcu)();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
+ if (!ca)
+ continue;
- bool stale = p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr));
- rcu_read_unlock();
+ if (p.ptr.cached && dev_ptr_stale_rcu(ca, &p.ptr))
+ continue;
- if ((check || empty) && !stale) {
- struct bkey_i_backpointer bp;
- bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp);
+ u64 b = PTR_BUCKET_NR(ca, &p.ptr);
+ if (!bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b))
+ continue;
- int ret = check
- ? check_bp_exists(trans, s, &bp, k)
- : bch2_bucket_backpointer_mod(trans, k, &bp, true);
- if (ret)
- return ret;
+ empty = bch2_bucket_bitmap_test(&ca->bucket_backpointer_empty, b);
}
+
+ struct bkey_i_backpointer bp;
+ bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp);
+
+ int ret = !empty
+ ? check_bp_exists(trans, s, &bp, k)
+ : bch2_bucket_backpointer_mod(trans, k, &bp, true);
+ if (ret)
+ return ret;
}
return 0;
@@ -722,14 +745,6 @@ err:
return ret;
}
-static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
-{
- return (struct bbpos) {
- .btree = bp.btree_id,
- .pos = bp.pos,
- };
-}
-
static u64 mem_may_pin_bytes(struct bch_fs *c)
{
struct sysinfo i;
@@ -788,6 +803,13 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
return ret;
}
+static inline int bch2_fs_going_ro(struct bch_fs *c)
+{
+ return test_bit(BCH_FS_going_ro, &c->flags)
+ ? -EROFS
+ : 0;
+}
+
static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
struct extents_to_bp_state *s)
{
@@ -815,6 +837,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
ret = for_each_btree_key_continue(trans, iter, 0, k, ({
bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
+ bch2_fs_going_ro(c) ?:
check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}));
@@ -854,6 +877,7 @@ static int data_type_to_alloc_counter(enum bch_data_type t)
static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos);
static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k,
+ bool *had_mismatch,
struct bkey_buf *last_flushed)
{
struct bch_fs *c = trans->c;
@@ -861,6 +885,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
bool need_commit = false;
+ *had_mismatch = false;
+
if (a->data_type == BCH_DATA_sb ||
a->data_type == BCH_DATA_journal ||
a->data_type == BCH_DATA_parity)
@@ -927,16 +953,22 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
sectors[ALLOC_cached] > a->cached_sectors ||
sectors[ALLOC_stripe] > a->stripe_sectors) {
ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?:
- -BCH_ERR_transaction_restart_nested;
+ bch_err_throw(c, transaction_restart_nested);
goto err;
}
- if (!sectors[ALLOC_dirty] &&
- !sectors[ALLOC_stripe] &&
- !sectors[ALLOC_cached])
- __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty);
- else
- __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches);
+ bool empty = (sectors[ALLOC_dirty] +
+ sectors[ALLOC_stripe] +
+ sectors[ALLOC_cached]) == 0;
+
+ ret = bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_mismatch,
+ alloc_k.k->p.offset) ?:
+ (empty
+ ? bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_empty,
+ alloc_k.k->p.offset)
+ : 0);
+
+ *had_mismatch = true;
}
err:
bch2_dev_put(ca);
@@ -949,7 +981,7 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k)
case KEY_TYPE_btree_ptr_v2: {
bool ret = false;
- rcu_read_lock();
+ guard(rcu)();
struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key;
while (pos.inode <= k.k->p.inode) {
if (pos.inode >= c->sb.nr_devices)
@@ -960,8 +992,14 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k)
goto next;
struct bpos bucket = bp_pos_to_bucket(ca, pos);
- bucket.offset = find_next_bit(ca->bucket_backpointer_mismatches,
- ca->mi.nbuckets, bucket.offset);
+ u64 next = ca->mi.nbuckets;
+
+ unsigned long *bitmap = READ_ONCE(ca->bucket_backpointer_mismatch.buckets);
+ if (bitmap)
+ next = min_t(u64, next,
+ find_next_bit(bitmap, ca->mi.nbuckets, bucket.offset));
+
+ bucket.offset = next;
if (bucket.offset == ca->mi.nbuckets)
goto next;
@@ -971,7 +1009,6 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k)
next:
pos = SPOS(pos.inode + 1, 0, 0);
}
- rcu_read_unlock();
return ret;
}
@@ -1070,28 +1107,6 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
{
int ret = 0;
- /*
- * Can't allow devices to come/go/resize while we have bucket bitmaps
- * allocated
- */
- down_read(&c->state_lock);
-
- for_each_member_device(c, ca) {
- BUG_ON(ca->bucket_backpointer_mismatches);
- ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
- sizeof(unsigned long),
- GFP_KERNEL);
- ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
- sizeof(unsigned long),
- GFP_KERNEL);
- if (!ca->bucket_backpointer_mismatches ||
- !ca->bucket_backpointer_empty) {
- bch2_dev_put(ca);
- ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap;
- goto err_free_bitmaps;
- }
- }
-
struct btree_trans *trans = bch2_trans_get(c);
struct extents_to_bp_state s = { .bp_start = POS_MIN };
@@ -1100,23 +1115,24 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
ret = for_each_btree_key(trans, iter, BTREE_ID_alloc,
POS_MIN, BTREE_ITER_prefetch, k, ({
- check_bucket_backpointer_mismatch(trans, k, &s.last_flushed);
+ bool had_mismatch;
+ bch2_fs_going_ro(c) ?:
+ check_bucket_backpointer_mismatch(trans, k, &had_mismatch, &s.last_flushed);
}));
if (ret)
goto err;
- u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0;
+ u64 nr_buckets = 0, nr_mismatches = 0;
for_each_member_device(c, ca) {
nr_buckets += ca->mi.nbuckets;
- nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets);
- nr_empty += bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets);
+ nr_mismatches += ca->bucket_backpointer_mismatch.nr;
}
- if (!nr_mismatches && !nr_empty)
+ if (!nr_mismatches)
goto err;
bch_info(c, "scanning for missing backpointers in %llu/%llu buckets",
- nr_mismatches + nr_empty, nr_buckets);
+ nr_mismatches, nr_buckets);
while (1) {
ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end);
@@ -1147,23 +1163,71 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
s.bp_start = bpos_successor(s.bp_end);
}
+
+ for_each_member_device(c, ca) {
+ bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch);
+ bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty);
+ }
err:
bch2_trans_put(trans);
bch2_bkey_buf_exit(&s.last_flushed, c);
bch2_btree_cache_unpin(c);
-err_free_bitmaps:
- for_each_member_device(c, ca) {
- kvfree(ca->bucket_backpointer_empty);
- ca->bucket_backpointer_empty = NULL;
- kvfree(ca->bucket_backpointer_mismatches);
- ca->bucket_backpointer_mismatches = NULL;
- }
- up_read(&c->state_lock);
bch_err_fn(c, ret);
return ret;
}
+static int check_bucket_backpointer_pos_mismatch(struct btree_trans *trans,
+ struct bpos bucket,
+ bool *had_mismatch,
+ struct bkey_buf *last_flushed)
+{
+ struct btree_iter alloc_iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &alloc_iter,
+ BTREE_ID_alloc, bucket,
+ BTREE_ITER_cached);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ ret = check_bucket_backpointer_mismatch(trans, k, had_mismatch, last_flushed);
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ return ret;
+}
+
+int bch2_check_bucket_backpointer_mismatch(struct btree_trans *trans,
+ struct bch_dev *ca, u64 bucket,
+ bool copygc,
+ struct bkey_buf *last_flushed)
+{
+ struct bch_fs *c = trans->c;
+ bool had_mismatch;
+ int ret = lockrestart_do(trans,
+ check_bucket_backpointer_pos_mismatch(trans, POS(ca->dev_idx, bucket),
+ &had_mismatch, last_flushed));
+ if (ret || !had_mismatch)
+ return ret;
+
+ u64 nr = ca->bucket_backpointer_mismatch.nr;
+ u64 allowed = copygc ? ca->mi.nbuckets >> 7 : 0;
+
+ struct printbuf buf = PRINTBUF;
+ __bch2_log_msg_start(ca->name, &buf);
+
+ prt_printf(&buf, "Detected missing backpointers in bucket %llu, now have %llu/%llu with missing\n",
+ bucket, nr, ca->mi.nbuckets);
+
+ bch2_run_explicit_recovery_pass(c, &buf,
+ BCH_RECOVERY_PASS_check_extents_to_backpointers,
+ nr < allowed ? RUN_RECOVERY_PASS_ratelimit : 0);
+
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ return 0;
+}
+
+/* backpointers -> extents */
+
static int check_one_backpointer(struct btree_trans *trans,
struct bbpos start,
struct bbpos end,
@@ -1279,3 +1343,49 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
bch_err_fn(c, ret);
return ret;
}
+
+static int bch2_bucket_bitmap_set(struct bch_dev *ca, struct bucket_bitmap *b, u64 bit)
+{
+ scoped_guard(mutex, &b->lock) {
+ if (!b->buckets) {
+ b->buckets = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
+ sizeof(unsigned long), GFP_KERNEL);
+ if (!b->buckets)
+ return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap);
+ }
+
+ b->nr += !__test_and_set_bit(bit, b->buckets);
+ }
+
+ return 0;
+}
+
+int bch2_bucket_bitmap_resize(struct bch_dev *ca, struct bucket_bitmap *b,
+ u64 old_size, u64 new_size)
+{
+ scoped_guard(mutex, &b->lock) {
+ if (!b->buckets)
+ return 0;
+
+ unsigned long *n = kvcalloc(BITS_TO_LONGS(new_size),
+ sizeof(unsigned long), GFP_KERNEL);
+ if (!n)
+ return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap);
+
+ memcpy(n, b->buckets,
+ BITS_TO_LONGS(min(old_size, new_size)) * sizeof(unsigned long));
+ kvfree(b->buckets);
+ b->buckets = n;
+ }
+
+ return 0;
+}
+
+void bch2_bucket_bitmap_free(struct bucket_bitmap *b)
+{
+ mutex_lock(&b->lock);
+ kvfree(b->buckets);
+ b->buckets = NULL;
+ b->nr = 0;
+ mutex_unlock(&b->lock);
+}
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index 16575dbc5736..7e71afee1ac0 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -53,11 +53,10 @@ static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca,
static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket)
{
- rcu_read_lock();
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode);
if (ca)
*bucket = bp_pos_to_bucket(ca, bp_pos);
- rcu_read_unlock();
return ca != NULL;
}
@@ -102,7 +101,7 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
struct bkey_i_backpointer *bp,
bool insert)
{
- if (unlikely(bch2_backpointers_no_use_write_buffer))
+ if (static_branch_unlikely(&bch2_backpointers_no_use_write_buffer))
return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert);
if (!insert) {
@@ -182,8 +181,20 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_b
struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer,
struct btree_iter *, struct bkey_buf *);
+int bch2_check_bucket_backpointer_mismatch(struct btree_trans *, struct bch_dev *, u64,
+ bool, struct bkey_buf *);
+
int bch2_check_btree_backpointers(struct bch_fs *);
int bch2_check_extents_to_backpointers(struct bch_fs *);
int bch2_check_backpointers_to_extents(struct bch_fs *);
+static inline bool bch2_bucket_bitmap_test(struct bucket_bitmap *b, u64 i)
+{
+ unsigned long *bitmap = READ_ONCE(b->buckets);
+ return bitmap && test_bit(i, bitmap);
+}
+
+int bch2_bucket_bitmap_resize(struct bch_dev *, struct bucket_bitmap *, u64, u64);
+void bch2_bucket_bitmap_free(struct bucket_bitmap *);
+
#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 75f7408da173..3651a296d506 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -183,6 +183,16 @@
#define pr_fmt(fmt) "%s() " fmt "\n", __func__
#endif
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define ENUMERATED_REF_DEBUG
+#endif
+
+#ifndef dynamic_fault
+#define dynamic_fault(...) 0
+#endif
+
+#define race_fault(...) dynamic_fault("bcachefs:race")
+
#include <linux/backing-dev-defs.h>
#include <linux/bug.h>
#include <linux/bio.h>
@@ -209,24 +219,40 @@
#include "btree_journal_iter_types.h"
#include "disk_accounting_types.h"
#include "errcode.h"
+#include "fast_list.h"
#include "fifo.h"
#include "nocow_locking_types.h"
#include "opts.h"
-#include "recovery_passes_types.h"
#include "sb-errors_types.h"
#include "seqmutex.h"
+#include "snapshot_types.h"
#include "time_stats.h"
#include "util.h"
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define BCH_WRITE_REF_DEBUG
-#endif
-
-#ifndef dynamic_fault
-#define dynamic_fault(...) 0
-#endif
+#include "alloc_types.h"
+#include "async_objs_types.h"
+#include "btree_gc_types.h"
+#include "btree_types.h"
+#include "btree_node_scan_types.h"
+#include "btree_write_buffer_types.h"
+#include "buckets_types.h"
+#include "buckets_waiting_for_journal_types.h"
+#include "clock_types.h"
+#include "disk_groups_types.h"
+#include "ec_types.h"
+#include "enumerated_ref_types.h"
+#include "journal_types.h"
+#include "keylist_types.h"
+#include "quota_types.h"
+#include "rebalance_types.h"
+#include "recovery_passes_types.h"
+#include "replicas_types.h"
+#include "sb-members_types.h"
+#include "subvolume_types.h"
+#include "super_types.h"
+#include "thread_with_file_types.h"
-#define race_fault(...) dynamic_fault("bcachefs:race")
+#include "trace.h"
#define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name])
@@ -269,7 +295,8 @@ do { \
#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
-void bch2_print_str(struct bch_fs *, const char *);
+void bch2_print_str(struct bch_fs *, const char *, const char *);
+void bch2_print_str_nonblocking(struct bch_fs *, const char *, const char *);
__printf(2, 3)
void bch2_print_opts(struct bch_opts *, const char *, ...);
@@ -293,6 +320,16 @@ do { \
bch2_print(_c, __VA_ARGS__); \
} while (0)
+#define bch2_print_str_ratelimited(_c, ...) \
+do { \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ \
+ if (__ratelimit(&_rs)) \
+ bch2_print_str(_c, __VA_ARGS__); \
+} while (0)
+
#define bch_info(c, fmt, ...) \
bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_info_ratelimited(c, fmt, ...) \
@@ -368,6 +405,14 @@ do { \
pr_info(fmt, ##__VA_ARGS__); \
} while (0)
+static inline int __bch2_err_trace(struct bch_fs *c, int err)
+{
+ trace_error_throw(c, err, _THIS_IP_);
+ return err;
+}
+
+#define bch_err_throw(_c, _err) __bch2_err_trace(_c, -BCH_ERR_##_err)
+
/* Parameters that are useful for debugging, but should always be compiled in: */
#define BCH_DEBUG_PARAMS_ALWAYS() \
BCH_DEBUG_PARAM(key_merging_disabled, \
@@ -390,17 +435,20 @@ do { \
"compare them") \
BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \
"Don't use the write buffer for backpointers, enabling "\
- "extra runtime checks")
-
-/* Parameters that should only be compiled in debug mode: */
-#define BCH_DEBUG_PARAMS_DEBUG() \
- BCH_DEBUG_PARAM(expensive_debug_checks, \
- "Enables various runtime debugging checks that " \
- "significantly affect performance") \
+ "extra runtime checks") \
+ BCH_DEBUG_PARAM(debug_check_btree_locking, \
+ "Enable additional asserts for btree locking") \
BCH_DEBUG_PARAM(debug_check_iterators, \
"Enables extra verification for btree iterators") \
+ BCH_DEBUG_PARAM(debug_check_bset_lookups, \
+ "Enables extra verification for bset lookups") \
BCH_DEBUG_PARAM(debug_check_btree_accounting, \
"Verify btree accounting for keys within a node") \
+ BCH_DEBUG_PARAM(debug_check_bkey_unpack, \
+ "Enables extra verification for bkey unpack")
+
+/* Parameters that should only be compiled in debug mode: */
+#define BCH_DEBUG_PARAMS_DEBUG() \
BCH_DEBUG_PARAM(journal_seq_verify, \
"Store the journal sequence number in the version " \
"number of every btree key, and verify that btree " \
@@ -427,15 +475,9 @@ do { \
#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
#endif
-#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
-BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
-#ifndef CONFIG_BCACHEFS_DEBUG
-#define BCH_DEBUG_PARAM(name, description) static const __maybe_unused bool bch2_##name;
-BCH_DEBUG_PARAMS_DEBUG()
+#define BCH_DEBUG_PARAM(name, description) extern struct static_key_false bch2_##name;
+BCH_DEBUG_PARAMS_ALL()
#undef BCH_DEBUG_PARAM
-#endif
#define BCH_TIME_STATS() \
x(btree_node_mem_alloc) \
@@ -443,6 +485,7 @@ BCH_DEBUG_PARAMS_DEBUG()
x(btree_node_compact) \
x(btree_node_merge) \
x(btree_node_sort) \
+ x(btree_node_get) \
x(btree_node_read) \
x(btree_node_read_done) \
x(btree_node_write) \
@@ -450,6 +493,10 @@ BCH_DEBUG_PARAMS_DEBUG()
x(btree_interior_update_total) \
x(btree_gc) \
x(data_write) \
+ x(data_write_to_submit) \
+ x(data_write_to_queue) \
+ x(data_write_to_btree_update) \
+ x(data_write_btree_update) \
x(data_read) \
x(data_promote) \
x(journal_flush_write) \
@@ -472,26 +519,6 @@ enum bch_time_stats {
BCH_TIME_STAT_NR
};
-#include "alloc_types.h"
-#include "btree_gc_types.h"
-#include "btree_types.h"
-#include "btree_node_scan_types.h"
-#include "btree_write_buffer_types.h"
-#include "buckets_types.h"
-#include "buckets_waiting_for_journal_types.h"
-#include "clock_types.h"
-#include "disk_groups_types.h"
-#include "ec_types.h"
-#include "journal_types.h"
-#include "keylist_types.h"
-#include "quota_types.h"
-#include "rebalance_types.h"
-#include "replicas_types.h"
-#include "sb-members_types.h"
-#include "subvolume_types.h"
-#include "super_types.h"
-#include "thread_with_file_types.h"
-
/* Number of nodes btree coalesce will try to coalesce at once */
#define GC_MERGE_NODES 4U
@@ -514,6 +541,57 @@ struct discard_in_flight {
u64 bucket:63;
};
+#define BCH_DEV_READ_REFS() \
+ x(bch2_online_devs) \
+ x(trans_mark_dev_sbs) \
+ x(read_fua_test) \
+ x(sb_field_resize) \
+ x(write_super) \
+ x(journal_read) \
+ x(fs_journal_alloc) \
+ x(fs_resize_on_mount) \
+ x(btree_node_read) \
+ x(btree_node_read_all_replicas) \
+ x(btree_node_scrub) \
+ x(btree_node_write) \
+ x(btree_node_scan) \
+ x(btree_verify_replicas) \
+ x(btree_node_ondisk_to_text) \
+ x(io_read) \
+ x(check_extent_checksums) \
+ x(ec_block)
+
+enum bch_dev_read_ref {
+#define x(n) BCH_DEV_READ_REF_##n,
+ BCH_DEV_READ_REFS()
+#undef x
+ BCH_DEV_READ_REF_NR,
+};
+
+#define BCH_DEV_WRITE_REFS() \
+ x(journal_write) \
+ x(journal_do_discards) \
+ x(dev_do_discards) \
+ x(discard_one_bucket_fast) \
+ x(do_invalidates) \
+ x(nocow_flush) \
+ x(io_write) \
+ x(ec_block) \
+ x(ec_bucket_zero)
+
+enum bch_dev_write_ref {
+#define x(n) BCH_DEV_WRITE_REF_##n,
+ BCH_DEV_WRITE_REFS()
+#undef x
+ BCH_DEV_WRITE_REF_NR,
+};
+
+struct bucket_bitmap {
+ unsigned long *buckets;
+ u64 nr;
+ struct mutex lock;
+};
+
struct bch_dev {
struct kobject kobj;
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -524,8 +602,7 @@ struct bch_dev {
struct percpu_ref ref;
#endif
struct completion ref_completion;
- struct percpu_ref io_ref[2];
- struct completion io_ref_completion[2];
+ struct enumerated_ref io_ref[2];
struct bch_fs *fs;
@@ -559,8 +636,8 @@ struct bch_dev {
u8 *oldest_gen;
unsigned long *buckets_nouse;
- unsigned long *bucket_backpointer_mismatches;
- unsigned long *bucket_backpointer_empty;
+ struct bucket_bitmap bucket_backpointer_mismatch;
+ struct bucket_bitmap bucket_backpointer_empty;
struct bch_dev_usage_full __percpu
*usage;
@@ -572,10 +649,6 @@ struct bch_dev {
unsigned nr_partial_buckets;
unsigned nr_btree_reserve;
- size_t inc_gen_needs_gc;
- size_t inc_gen_really_needs_gc;
- size_t buckets_waiting_on_journal;
-
struct work_struct invalidate_work;
struct work_struct discard_work;
struct mutex discard_buckets_in_flight_lock;
@@ -614,14 +687,15 @@ struct bch_dev {
x(accounting_replay_done) \
x(may_go_rw) \
x(rw) \
+ x(rw_init_done) \
x(was_rw) \
x(stopping) \
x(emergency_ro) \
x(going_ro) \
x(write_disable_complete) \
x(clean_shutdown) \
- x(recovery_running) \
- x(fsck_running) \
+ x(in_recovery) \
+ x(in_fsck) \
x(initial_gc_unfixed) \
x(need_delete_dead_snapshots) \
x(error) \
@@ -648,8 +722,10 @@ struct btree_transaction_stats {
struct bch2_time_stats lock_hold_times;
struct mutex lock;
unsigned nr_max_paths;
- unsigned journal_entries_size;
unsigned max_mem;
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+ darray_trans_kmalloc_trace trans_kmalloc_trace;
+#endif
char *max_paths_text;
};
@@ -670,9 +746,6 @@ struct btree_trans_buf {
struct btree_trans *trans;
};
-#define BCACHEFS_ROOT_SUBVOL_INUM \
- ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
-
#define BCH_WRITE_REFS() \
x(journal) \
x(trans) \
@@ -694,7 +767,8 @@ struct btree_trans_buf {
x(snapshot_delete_pagecache) \
x(sysfs) \
x(btree_write_buffer) \
- x(btree_node_scrub)
+ x(btree_node_scrub) \
+ x(async_recovery_passes)
enum bch_write_ref {
#define x(n) BCH_WRITE_REF_##n,
@@ -728,11 +802,7 @@ struct bch_fs {
struct rw_semaphore state_lock;
/* Counts outstanding writes, for clean transition to read-only */
-#ifdef BCH_WRITE_REF_DEBUG
- atomic_long_t writes[BCH_WRITE_REF_NR];
-#else
- struct percpu_ref writes;
-#endif
+ struct enumerated_ref writes;
/*
* Certain operations are only allowed in single threaded mode, during
* recovery, and we want to assert that this is the case:
@@ -776,6 +846,7 @@ struct bch_fs {
u8 nr_devices;
u8 clean;
+ bool multi_device; /* true if we've ever had more than one device */
u8 encryption_type;
@@ -785,6 +856,7 @@ struct bch_fs {
unsigned nsec_per_time_unit;
u64 features;
u64 compat;
+ u64 recovery_passes_required;
unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)];
u64 btrees_lost_data;
} sb;
@@ -809,7 +881,7 @@ struct bch_fs {
struct mutex snapshot_table_lock;
struct rw_semaphore snapshot_create_lock;
- struct work_struct snapshot_delete_work;
+ struct snapshot_delete snapshot_delete;
struct work_struct snapshot_wait_for_pagecache_and_delete_work;
snapshot_id_list snapshots_unlinked;
struct mutex snapshots_unlinked_lock;
@@ -874,7 +946,7 @@ struct bch_fs {
struct btree_write_buffer btree_write_buffer;
struct workqueue_struct *btree_update_wq;
- struct workqueue_struct *btree_io_complete_wq;
+ struct workqueue_struct *btree_write_complete_wq;
/* copygc needs its own workqueue for index updates.. */
struct workqueue_struct *copygc_wq;
/*
@@ -885,6 +957,7 @@ struct bch_fs {
struct workqueue_struct *write_ref_wq;
/* ALLOCATION */
+ struct bch_devs_mask online_devs;
struct bch_devs_mask rw_devs[BCH_DATA_NR];
unsigned long rw_devs_change_count;
@@ -979,6 +1052,10 @@ struct bch_fs {
nocow_locks;
struct rhashtable promote_table;
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
+ struct async_obj_list async_objs[BCH_ASYNC_OBJ_NR];
+#endif
+
mempool_t compression_bounce[2];
mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR];
size_t zstd_workspace_size;
@@ -1048,25 +1125,12 @@ struct bch_fs {
/* RECOVERY */
u64 journal_replay_seq_start;
u64 journal_replay_seq_end;
- /*
- * Two different uses:
- * "Has this fsck pass?" - i.e. should this type of error be an
- * emergency read-only
- * And, in certain situations fsck will rewind to an earlier pass: used
- * for signaling to the toplevel code which pass we want to run now.
- */
- enum bch_recovery_pass curr_recovery_pass;
- enum bch_recovery_pass next_recovery_pass;
- /* bitmask of recovery passes that we actually ran */
- u64 recovery_passes_complete;
- /* never rewinds version of curr_recovery_pass */
- enum bch_recovery_pass recovery_pass_done;
- spinlock_t recovery_pass_lock;
- struct semaphore online_fsck_mutex;
+ struct bch_fs_recovery recovery;
/* DEBUG JUNK */
struct dentry *fs_debug_dir;
struct dentry *btree_debug_dir;
+ struct dentry *async_obj_dir;
struct btree_debug btree_debug[BTREE_ID_NR];
struct btree *verify_data;
struct btree_node *verify_ondisk;
@@ -1108,54 +1172,6 @@ struct bch_fs {
extern struct wait_queue_head bch2_read_only_wait;
-static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref)
-{
-#ifdef BCH_WRITE_REF_DEBUG
- atomic_long_inc(&c->writes[ref]);
-#else
- percpu_ref_get(&c->writes);
-#endif
-}
-
-static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
-{
-#ifdef BCH_WRITE_REF_DEBUG
- return !test_bit(BCH_FS_going_ro, &c->flags) &&
- atomic_long_inc_not_zero(&c->writes[ref]);
-#else
- return percpu_ref_tryget(&c->writes);
-#endif
-}
-
-static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
-{
-#ifdef BCH_WRITE_REF_DEBUG
- return !test_bit(BCH_FS_going_ro, &c->flags) &&
- atomic_long_inc_not_zero(&c->writes[ref]);
-#else
- return percpu_ref_tryget_live(&c->writes);
-#endif
-}
-
-static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref)
-{
-#ifdef BCH_WRITE_REF_DEBUG
- long v = atomic_long_dec_return(&c->writes[ref]);
-
- BUG_ON(v < 0);
- if (v)
- return;
- for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
- if (atomic_long_read(&c->writes[i]))
- return;
-
- set_bit(BCH_FS_write_disable_complete, &c->flags);
- wake_up(&bch2_read_only_wait);
-#else
- percpu_ref_put(&c->writes);
-#endif
-}
-
static inline bool bch2_ro_ref_tryget(struct bch_fs *c)
{
if (test_bit(BCH_FS_stopping, &c->flags))
@@ -1256,4 +1272,17 @@ static inline unsigned data_replicas_required(struct bch_fs *c)
#define BKEY_PADDED_ONSTACK(key, pad) \
struct { struct bkey_i key; __u64 key ## _pad[pad]; }
+/*
+ * This is needed because discard is both a filesystem option and a device
+ * option, and mount options are supposed to apply to that mount and not be
+ * persisted, i.e. if it's set as a mount option we can't propagate it to the
+ * device.
+ */
+static inline bool bch2_discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca)
+{
+ return test_bit(BCH_FS_discard_mount_opt_set, &c->flags)
+ ? c->opts.discard
+ : ca->mi.discard;
+}
+
#endif /* _BCACHEFS_H */
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index d6e4a496f02b..b4a04df5ea95 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -497,7 +497,8 @@ struct bch_sb_field {
x(members_v2, 11) \
x(errors, 12) \
x(ext, 13) \
- x(downgrade, 14)
+ x(downgrade, 14) \
+ x(recovery_passes, 15)
#include "alloc_background_format.h"
#include "dirent_format.h"
@@ -510,6 +511,7 @@ struct bch_sb_field {
#include "logged_ops_format.h"
#include "lru_format.h"
#include "quota_format.h"
+#include "recovery_passes_format.h"
#include "reflink_format.h"
#include "replicas_format.h"
#include "snapshot_format.h"
@@ -695,7 +697,10 @@ struct bch_sb_field_ext {
x(stripe_backpointers, BCH_VERSION(1, 22)) \
x(stripe_lru, BCH_VERSION(1, 23)) \
x(casefolding, BCH_VERSION(1, 24)) \
- x(extent_flags, BCH_VERSION(1, 25))
+ x(extent_flags, BCH_VERSION(1, 25)) \
+ x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \
+ x(fast_device_removal, BCH_VERSION(1, 27)) \
+ x(inode_has_case_insensitive, BCH_VERSION(1, 28))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -846,7 +851,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29);
LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
-/* one free bit */
+LE64_BITMASK(BCH_SB_MULTI_DEVICE, struct bch_sb, flags[3], 63, 64);
LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34);
@@ -867,7 +872,9 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4);
LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14);
LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20);
+LE64_BITMASK(BCH_SB_DEGRADED_ACTION, struct bch_sb, flags[6], 20, 22);
LE64_BITMASK(BCH_SB_CASEFOLD, struct bch_sb, flags[6], 22, 23);
+LE64_BITMASK(BCH_SB_REBALANCE_AC_ONLY, struct bch_sb, flags[6], 23, 24);
static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{
@@ -922,7 +929,9 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u
x(alloc_v2, 17) \
x(extents_across_btree_nodes, 18) \
x(incompat_version_field, 19) \
- x(casefolding, 20)
+ x(casefolding, 20) \
+ x(no_alloc_info, 21) \
+ x(small_image, 22)
#define BCH_SB_FEATURES_ALWAYS \
(BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \
@@ -989,6 +998,19 @@ enum bch_error_actions {
BCH_ON_ERROR_NR
};
+#define BCH_DEGRADED_ACTIONS() \
+ x(ask, 0) \
+ x(yes, 1) \
+ x(very, 2) \
+ x(no, 3)
+
+enum bch_degraded_actions {
+#define x(t, n) BCH_DEGRADED_##t = n,
+ BCH_DEGRADED_ACTIONS()
+#undef x
+ BCH_DEGRADED_ACTIONS_NR
+};
+
#define BCH_STR_HASH_TYPES() \
x(crc32c, 0) \
x(crc64, 1) \
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
index 995ba32e9b6e..ee823c640642 100644
--- a/fs/bcachefs/bkey.c
+++ b/fs/bcachefs/bkey.c
@@ -47,11 +47,9 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *out,
}
}
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-static void bch2_bkey_pack_verify(const struct bkey_packed *packed,
- const struct bkey *unpacked,
- const struct bkey_format *format)
+static void __bch2_bkey_pack_verify(const struct bkey_packed *packed,
+ const struct bkey *unpacked,
+ const struct bkey_format *format)
{
struct bkey tmp;
@@ -95,11 +93,13 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed,
}
}
-#else
static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed,
- const struct bkey *unpacked,
- const struct bkey_format *format) {}
-#endif
+ const struct bkey *unpacked,
+ const struct bkey_format *format)
+{
+ if (static_branch_unlikely(&bch2_debug_check_bkey_unpack))
+ __bch2_bkey_pack_verify(packed, unpacked, format);
+}
struct pack_state {
const struct bkey_format *format;
@@ -398,7 +398,6 @@ static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v)
return ret;
}
-#ifdef CONFIG_BCACHEFS_DEBUG
static bool bkey_packed_successor(struct bkey_packed *out,
const struct btree *b,
struct bkey_packed k)
@@ -455,7 +454,6 @@ static bool bkey_format_has_too_big_fields(const struct bkey_format *f)
return false;
}
-#endif
/*
* Returns a packed key that compares <= in
@@ -472,9 +470,7 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
const struct bkey_format *f = &b->format;
struct pack_state state = pack_state_init(f, out);
u64 *w = out->_data;
-#ifdef CONFIG_BCACHEFS_DEBUG
struct bpos orig = in;
-#endif
bool exact = true;
unsigned i;
@@ -527,18 +523,18 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
out->format = KEY_FORMAT_LOCAL_BTREE;
out->type = KEY_TYPE_deleted;
-#ifdef CONFIG_BCACHEFS_DEBUG
- if (exact) {
- BUG_ON(bkey_cmp_left_packed(b, out, &orig));
- } else {
- struct bkey_packed successor;
+ if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) {
+ if (exact) {
+ BUG_ON(bkey_cmp_left_packed(b, out, &orig));
+ } else {
+ struct bkey_packed successor;
- BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
- BUG_ON(bkey_packed_successor(&successor, b, *out) &&
- bkey_cmp_left_packed(b, &successor, &orig) < 0 &&
- !bkey_format_has_too_big_fields(f));
+ BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
+ BUG_ON(bkey_packed_successor(&successor, b, *out) &&
+ bkey_cmp_left_packed(b, &successor, &orig) < 0 &&
+ !bkey_format_has_too_big_fields(f));
+ }
}
-#endif
return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER;
}
@@ -627,14 +623,13 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s)
}
}
-#ifdef CONFIG_BCACHEFS_DEBUG
- {
+ if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) {
struct printbuf buf = PRINTBUF;
BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf));
printbuf_exit(&buf);
}
-#endif
+
return ret;
}
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index 054e2d5e8448..3ccd521c190a 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -191,6 +191,7 @@ static inline struct bpos bkey_max(struct bpos l, struct bpos r)
static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
{
return bpos_eq(l.k->p, r.k->p) &&
+ l.k->size == r.k->size &&
bkey_bytes(l.k) == bkey_bytes(r.k) &&
!memcmp(l.v, r.v, bkey_val_bytes(l.k));
}
@@ -397,8 +398,7 @@ __bkey_unpack_key_format_checked(const struct btree *b,
compiled_unpack_fn unpack_fn = b->aux_data;
unpack_fn(dst, src);
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- bch2_expensive_debug_checks) {
+ if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) {
struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 00d05ccfaf73..fcd8c82cba4f 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -356,7 +356,7 @@ bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
return ops->key_merge &&
bch2_bkey_maybe_mergable(l.k, r.k) &&
(u64) l.k->size + r.k->size <= KEY_SIZE_MAX &&
- !bch2_key_merging_disabled &&
+ !static_branch_unlikely(&bch2_key_merging_disabled) &&
ops->key_merge(c, l, r);
}
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index 9a4a83d6fd2d..32841f762eb2 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -144,8 +144,6 @@ struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b)
return nr;
}
-#ifdef CONFIG_BCACHEFS_DEBUG
-
void __bch2_verify_btree_nr_keys(struct btree *b)
{
struct btree_nr_keys nr = bch2_btree_node_count_keys(b);
@@ -153,7 +151,7 @@ void __bch2_verify_btree_nr_keys(struct btree *b)
BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
}
-static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
+static void __bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
struct btree *b)
{
struct btree_node_iter iter = *_iter;
@@ -190,8 +188,8 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
}
}
-void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
- struct btree *b)
+void __bch2_btree_node_iter_verify(struct btree_node_iter *iter,
+ struct btree *b)
{
struct btree_node_iter_set *set, *s2;
struct bkey_packed *k, *p;
@@ -237,8 +235,8 @@ found:
}
}
-void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
- struct bkey_packed *insert, unsigned clobber_u64s)
+static void __bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
+ struct bkey_packed *insert, unsigned clobber_u64s)
{
struct bset_tree *t = bch2_bkey_to_bset(b, where);
struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
@@ -285,12 +283,15 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
#endif
}
-#else
-
-static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
- struct btree *b) {}
+static inline void bch2_verify_insert_pos(struct btree *b,
+ struct bkey_packed *where,
+ struct bkey_packed *insert,
+ unsigned clobber_u64s)
+{
+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
+ __bch2_verify_insert_pos(b, where, insert, clobber_u64s);
+}
-#endif
/* Auxiliary search trees */
@@ -361,9 +362,8 @@ static struct bkey_float *bkey_float(const struct btree *b,
return ro_aux_tree_base(b, t)->f + idx;
}
-static void bset_aux_tree_verify(struct btree *b)
+static void __bset_aux_tree_verify(struct btree *b)
{
-#ifdef CONFIG_BCACHEFS_DEBUG
for_each_bset(b, t) {
if (t->aux_data_offset == U16_MAX)
continue;
@@ -375,7 +375,12 @@ static void bset_aux_tree_verify(struct btree *b)
BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
}
-#endif
+}
+
+static inline void bset_aux_tree_verify(struct btree *b)
+{
+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
+ __bset_aux_tree_verify(b);
}
void bch2_btree_keys_init(struct btree *b)
@@ -495,15 +500,11 @@ static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
};
}
-static void bch2_bset_verify_rw_aux_tree(struct btree *b,
- struct bset_tree *t)
+static void __bch2_bset_verify_rw_aux_tree(struct btree *b, struct bset_tree *t)
{
struct bkey_packed *k = btree_bkey_first(b, t);
unsigned j = 0;
- if (!bch2_expensive_debug_checks)
- return;
-
BUG_ON(bset_has_ro_aux_tree(t));
if (!bset_has_rw_aux_tree(t))
@@ -530,6 +531,13 @@ start:
}
}
+static inline void bch2_bset_verify_rw_aux_tree(struct btree *b,
+ struct bset_tree *t)
+{
+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
+ __bch2_bset_verify_rw_aux_tree(b, t);
+}
+
/* returns idx of first entry >= offset: */
static unsigned rw_aux_tree_bsearch(struct btree *b,
struct bset_tree *t,
@@ -869,7 +877,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
k = p;
}
- if (bch2_expensive_debug_checks) {
+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) {
BUG_ON(ret >= orig_k);
for (i = ret
@@ -1195,7 +1203,7 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b,
bkey_iter_pos_cmp(b, m, search) < 0)
m = bkey_p_next(m);
- if (bch2_expensive_debug_checks) {
+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) {
struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
BUG_ON(prev &&
@@ -1435,9 +1443,9 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
struct btree *b)
{
- if (bch2_expensive_debug_checks) {
- bch2_btree_node_iter_verify(iter, b);
- bch2_btree_node_iter_next_check(iter, b);
+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) {
+ __bch2_btree_node_iter_verify(iter, b);
+ __bch2_btree_node_iter_next_check(iter, b);
}
__bch2_btree_node_iter_advance(iter, b);
@@ -1453,8 +1461,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
struct btree_node_iter_set *set;
unsigned end = 0;
- if (bch2_expensive_debug_checks)
- bch2_btree_node_iter_verify(iter, b);
+ bch2_btree_node_iter_verify(iter, b);
for_each_bset(b, t) {
k = bch2_bkey_prev_all(b, t,
@@ -1489,8 +1496,7 @@ found:
iter->data[0].k = __btree_node_key_to_offset(b, prev);
iter->data[0].end = end;
- if (bch2_expensive_debug_checks)
- bch2_btree_node_iter_verify(iter, b);
+ bch2_btree_node_iter_verify(iter, b);
return prev;
}
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
index 6953d55b72cc..a15ecf9d006e 100644
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
@@ -517,27 +517,19 @@ void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned);
void bch2_dump_btree_node(struct bch_fs *, struct btree *);
void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *);
-#ifdef CONFIG_BCACHEFS_DEBUG
-
void __bch2_verify_btree_nr_keys(struct btree *);
-void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
-void bch2_verify_insert_pos(struct btree *, struct bkey_packed *,
- struct bkey_packed *, unsigned);
-
-#else
+void __bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
-static inline void __bch2_verify_btree_nr_keys(struct btree *b) {}
static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
- struct btree *b) {}
-static inline void bch2_verify_insert_pos(struct btree *b,
- struct bkey_packed *where,
- struct bkey_packed *insert,
- unsigned clobber_u64s) {}
-#endif
+ struct btree *b)
+{
+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
+ __bch2_btree_node_iter_verify(iter, b);
+}
static inline void bch2_verify_btree_nr_keys(struct btree *b)
{
- if (bch2_debug_check_btree_accounting)
+ if (static_branch_unlikely(&bch2_debug_check_btree_accounting))
__bch2_verify_btree_nr_keys(b);
}
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 899891295797..91e0aa796e6b 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -17,12 +17,6 @@
#include <linux/sched/mm.h>
#include <linux/swap.h>
-#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
-do { \
- if (shrinker_counter) \
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \
-} while (0)
-
const char * const bch2_btree_node_flags[] = {
"typebit",
"typebit",
@@ -155,7 +149,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
b->data = kvmalloc(btree_buf_bytes(b), gfp);
if (!b->data)
- return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
+ return bch_err_throw(c, ENOMEM_btree_node_mem_alloc);
#ifdef __KERNEL__
b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
#else
@@ -168,7 +162,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
if (!b->aux_data) {
kvfree(b->data);
b->data = NULL;
- return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
+ return bch_err_throw(c, ENOMEM_btree_node_mem_alloc);
}
return 0;
@@ -350,115 +344,118 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc,
return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}
-/*
- * this version is for btree nodes that have already been freed (we're not
- * reaping a real btree node)
- */
-static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
+static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b,
+ bool flush, bool locked)
{
struct btree_cache *bc = &c->btree_cache;
- int ret = 0;
lockdep_assert_held(&bc->lock);
-wait_on_io:
- if (b->flags & ((1U << BTREE_NODE_dirty)|
- (1U << BTREE_NODE_read_in_flight)|
+
+ if (btree_node_noevict(b)) {
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++;
+ return bch_err_throw(c, ENOMEM_btree_node_reclaim);
+ }
+ if (btree_node_write_blocked(b)) {
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++;
+ return bch_err_throw(c, ENOMEM_btree_node_reclaim);
+ }
+ if (btree_node_will_make_reachable(b)) {
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++;
+ return bch_err_throw(c, ENOMEM_btree_node_reclaim);
+ }
+
+ if (btree_node_dirty(b)) {
+ if (!flush) {
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++;
+ return bch_err_throw(c, ENOMEM_btree_node_reclaim);
+ }
+
+ if (locked) {
+ /*
+ * Using the underscore version because we don't want to compact
+ * bsets after the write, since this node is about to be evicted
+ * - unless btree verify mode is enabled, since it runs out of
+ * the post write cleanup:
+ */
+ if (static_branch_unlikely(&bch2_verify_btree_ondisk))
+ bch2_btree_node_write(c, b, SIX_LOCK_intent,
+ BTREE_WRITE_cache_reclaim);
+ else
+ __bch2_btree_node_write(c, b,
+ BTREE_WRITE_cache_reclaim);
+ }
+ }
+
+ if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
(1U << BTREE_NODE_write_in_flight))) {
if (!flush) {
- if (btree_node_dirty(b))
- BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
- else if (btree_node_read_in_flight(b))
- BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
+ if (btree_node_read_in_flight(b))
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++;
else if (btree_node_write_in_flight(b))
- BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
- return -BCH_ERR_ENOMEM_btree_node_reclaim;
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++;
+ return bch_err_throw(c, ENOMEM_btree_node_reclaim);
}
+ if (locked)
+ return -EINTR;
+
/* XXX: waiting on IO with btree cache lock held */
bch2_btree_node_wait_on_read(b);
bch2_btree_node_wait_on_write(b);
}
+ return 0;
+}
+
+/*
+ * this version is for btree nodes that have already been freed (we're not
+ * reaping a real btree node)
+ */
+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ int ret = 0;
+
+ lockdep_assert_held(&bc->lock);
+retry_unlocked:
+ ret = __btree_node_reclaim_checks(c, b, flush, false);
+ if (ret)
+ return ret;
+
if (!six_trylock_intent(&b->c.lock)) {
- BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
- return -BCH_ERR_ENOMEM_btree_node_reclaim;
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++;
+ return bch_err_throw(c, ENOMEM_btree_node_reclaim);
}
if (!six_trylock_write(&b->c.lock)) {
- BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
- goto out_unlock_intent;
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++;
+ six_unlock_intent(&b->c.lock);
+ return bch_err_throw(c, ENOMEM_btree_node_reclaim);
}
/* recheck under lock */
- if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
- (1U << BTREE_NODE_write_in_flight))) {
- if (!flush) {
- if (btree_node_read_in_flight(b))
- BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
- else if (btree_node_write_in_flight(b))
- BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
- goto out_unlock;
- }
+ ret = __btree_node_reclaim_checks(c, b, flush, true);
+ if (ret) {
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
- goto wait_on_io;
- }
-
- if (btree_node_noevict(b)) {
- BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
- goto out_unlock;
- }
- if (btree_node_write_blocked(b)) {
- BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
- goto out_unlock;
- }
- if (btree_node_will_make_reachable(b)) {
- BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
- goto out_unlock;
+ if (ret == -EINTR)
+ goto retry_unlocked;
+ return ret;
}
- if (btree_node_dirty(b)) {
- if (!flush) {
- BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
- goto out_unlock;
- }
- /*
- * Using the underscore version because we don't want to compact
- * bsets after the write, since this node is about to be evicted
- * - unless btree verify mode is enabled, since it runs out of
- * the post write cleanup:
- */
- if (bch2_verify_btree_ondisk)
- bch2_btree_node_write(c, b, SIX_LOCK_intent,
- BTREE_WRITE_cache_reclaim);
- else
- __bch2_btree_node_write(c, b,
- BTREE_WRITE_cache_reclaim);
-
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- goto wait_on_io;
- }
-out:
if (b->hash_val && !ret)
trace_and_count(c, btree_cache_reap, c, b);
- return ret;
-out_unlock:
- six_unlock_write(&b->c.lock);
-out_unlock_intent:
- six_unlock_intent(&b->c.lock);
- ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
- goto out;
+ return 0;
}
-static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
+static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
- return __btree_node_reclaim(c, b, false, shrinker_counter);
+ return __btree_node_reclaim(c, b, false);
}
static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
- return __btree_node_reclaim(c, b, true, false);
+ return __btree_node_reclaim(c, b, true);
}
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
@@ -476,7 +473,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
unsigned long ret = SHRINK_STOP;
bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;
- if (bch2_btree_shrinker_disabled)
+ if (static_branch_unlikely(&bch2_btree_shrinker_disabled))
return SHRINK_STOP;
mutex_lock(&bc->lock);
@@ -490,7 +487,10 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
* IO can always make forward progress:
*/
can_free = btree_cache_can_free(list);
- nr = min_t(unsigned long, nr, can_free);
+ if (nr > can_free) {
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_cache_reserve] += nr - can_free;
+ nr = can_free;
+ }
i = 0;
list_for_each_entry_safe(b, t, &bc->freeable, list) {
@@ -506,7 +506,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
if (touched >= nr)
goto out;
- if (!btree_node_reclaim(c, b, true)) {
+ if (!btree_node_reclaim(c, b)) {
btree_node_data_free(bc, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
@@ -522,7 +522,7 @@ restart:
clear_btree_node_accessed(b);
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
--touched;;
- } else if (!btree_node_reclaim(c, b, true)) {
+ } else if (!btree_node_reclaim(c, b)) {
__bch2_btree_node_hash_remove(bc, b);
__btree_node_data_free(bc, b);
@@ -569,7 +569,7 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
{
struct btree_cache_list *list = shrink->private_data;
- if (bch2_btree_shrinker_disabled)
+ if (static_branch_unlikely(&bch2_btree_shrinker_disabled))
return 0;
return btree_cache_can_free(list);
@@ -682,7 +682,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
return 0;
err:
- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+ return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
}
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
@@ -727,7 +727,7 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure
if (!cl) {
trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
- return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
+ return bch_err_throw(c, ENOMEM_btree_cache_cannibalize_lock);
}
closure_wait(&bc->alloc_wait, cl);
@@ -741,7 +741,7 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure
}
trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
- return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
+ return bch_err_throw(c, btree_cache_cannibalize_lock_blocked);
success:
trace_and_count(c, btree_cache_cannibalize_lock, trans);
@@ -755,7 +755,7 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
list_for_each_entry_reverse(b, &bc->live[i].list, list)
- if (!btree_node_reclaim(c, b, false))
+ if (!btree_node_reclaim(c, b))
return b;
while (1) {
@@ -790,7 +790,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
* disk node. Check the freed list before allocating a new one:
*/
list_for_each_entry(b, freed, list)
- if (!btree_node_reclaim(c, b, false)) {
+ if (!btree_node_reclaim(c, b)) {
list_del_init(&b->list);
goto got_node;
}
@@ -817,7 +817,7 @@ got_node:
* the list. Check if there's any freed nodes there:
*/
list_for_each_entry(b2, &bc->freeable, list)
- if (!btree_node_reclaim(c, b2, false)) {
+ if (!btree_node_reclaim(c, b2)) {
swap(b->data, b2->data);
swap(b->aux_data, b2->aux_data);
@@ -977,7 +977,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
/* Unlock before doing IO: */
six_unlock_intent(&b->c.lock);
- bch2_trans_unlock_noassert(trans);
+ bch2_trans_unlock(trans);
bch2_btree_node_read(trans, b, sync);
@@ -1003,7 +1003,7 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
struct printbuf buf = PRINTBUF;
- if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
+ if (c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations)
return;
prt_printf(&buf,
@@ -1492,9 +1492,10 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc
prt_btree_cache_line(out, c, "live:", bc->live[0].nr);
prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr);
- prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable);
+ prt_btree_cache_line(out, c, "reserve:", bc->nr_reserve);
+ prt_btree_cache_line(out, c, "freed:", bc->nr_freeable);
prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty));
- prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
+ prt_printf(out, "cannibalize lock:\t%s\n", bc->alloc_lock ? "held" : "not held");
prt_newline(out);
for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) {
@@ -1505,6 +1506,7 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc
}
prt_newline(out);
+ prt_printf(out, "counters since mount:\n");
prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
prt_printf(out, "not freed:\n");
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 37b69d89341f..9ddcbe1bda78 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -22,6 +22,7 @@
#include "debug.h"
#include "disk_accounting.h"
#include "ec.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
@@ -149,7 +150,7 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
if (!new)
- return -BCH_ERR_ENOMEM_gc_repair_key;
+ return bch_err_throw(c, ENOMEM_gc_repair_key);
btree_ptr_to_v2(b, new);
b->data->min_key = new_min;
@@ -189,7 +190,7 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
if (!new)
- return -BCH_ERR_ENOMEM_gc_repair_key;
+ return bch_err_throw(c, ENOMEM_gc_repair_key);
btree_ptr_to_v2(b, new);
b->data->max_key = new_max;
@@ -370,20 +371,13 @@ again:
prt_char(&buf, ' ');
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
- if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
- trans, btree_node_read_error,
- "Topology repair: unreadable btree node at\n%s",
- buf.buf)) {
+ if (bch2_err_matches(ret, EIO)) {
bch2_btree_node_evict(trans, cur_k.k);
cur = NULL;
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
if (ret)
break;
-
- ret = bch2_btree_lost_data(c, b->c.btree_id);
- if (ret)
- break;
continue;
}
@@ -545,9 +539,6 @@ int bch2_check_topology(struct bch_fs *c)
bch2_btree_id_to_text(&buf, i);
if (r->error) {
- ret = bch2_btree_lost_data(c, i);
- if (ret)
- break;
reconstruct_root:
bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);
@@ -628,7 +619,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
deleted.p = k.k->p;
if (initial) {
- BUG_ON(bch2_journal_seq_verify &&
+ BUG_ON(static_branch_unlikely(&bch2_journal_seq_verify) &&
k.k->bversion.lo > atomic64_read(&c->journal.seq));
if (fsck_err_on(btree_id != BTREE_ID_accounting &&
@@ -944,7 +935,7 @@ static int bch2_gc_alloc_start(struct bch_fs *c)
ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
if (ret) {
bch2_dev_put(ca);
- ret = -BCH_ERR_ENOMEM_gc_alloc_start;
+ ret = bch_err_throw(c, ENOMEM_gc_alloc_start);
break;
}
}
@@ -1088,6 +1079,10 @@ out:
* allocator thread - issue wakeup in case they blocked on gc_lock:
*/
closure_wake_up(&c->freelist_wait);
+
+ if (!ret && !test_bit(BCH_FS_errors_not_fixed, &c->flags))
+ bch2_sb_members_clean_deleted(c);
+
bch_err_fn(c, ret);
return ret;
}
@@ -1098,42 +1093,41 @@ static int gc_btree_gens_key(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bkey_i *u;
- int ret;
if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
return -EROFS;
- rcu_read_lock();
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (!ca)
- continue;
+ bool too_stale = false;
+ scoped_guard(rcu) {
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ca)
+ continue;
- if (dev_ptr_stale(ca, ptr) > 16) {
- rcu_read_unlock();
- goto update;
+ too_stale |= dev_ptr_stale(ca, ptr) > 16;
}
+
+ if (!too_stale)
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ca)
+ continue;
+
+ u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
+ if (gen_after(*gen, ptr->gen))
+ *gen = ptr->gen;
+ }
}
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (!ca)
- continue;
+ if (too_stale) {
+ struct bkey_i *u = bch2_bkey_make_mut(trans, iter, &k, 0);
+ int ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
- u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
- if (gen_after(*gen, ptr->gen))
- *gen = ptr->gen;
+ bch2_extent_normalize(c, bkey_i_to_s(u));
}
- rcu_read_unlock();
- return 0;
-update:
- u = bch2_bkey_make_mut(trans, iter, &k, 0);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- return ret;
- bch2_extent_normalize(c, bkey_i_to_s(u));
return 0;
}
@@ -1186,7 +1180,7 @@ int bch2_gc_gens(struct bch_fs *c)
ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
if (!ca->oldest_gen) {
bch2_dev_put(ca);
- ret = -BCH_ERR_ENOMEM_gc_gens;
+ ret = bch_err_throw(c, ENOMEM_gc_gens);
goto err;
}
@@ -1256,26 +1250,21 @@ static void bch2_gc_gens_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work);
bch2_gc_gens(c);
- bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens);
}
void bch2_gc_gens_async(struct bch_fs *c)
{
- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_gc_gens) &&
+ if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_gc_gens) &&
!queue_work(c->write_ref_wq, &c->gc_gens_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens);
}
-void bch2_fs_btree_gc_exit(struct bch_fs *c)
-{
-}
-
-int bch2_fs_btree_gc_init(struct bch_fs *c)
+void bch2_fs_btree_gc_init_early(struct bch_fs *c)
{
seqcount_init(&c->gc_pos_lock);
INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work);
init_rwsem(&c->gc_lock);
mutex_init(&c->gc_gens_lock);
- return 0;
}
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
index 9693a90a48a2..ec77662369a2 100644
--- a/fs/bcachefs/btree_gc.h
+++ b/fs/bcachefs/btree_gc.h
@@ -83,7 +83,6 @@ void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *);
int bch2_gc_gens(struct bch_fs *);
void bch2_gc_gens_async(struct bch_fs *);
-void bch2_fs_btree_gc_exit(struct bch_fs *);
-int bch2_fs_btree_gc_init(struct bch_fs *);
+void bch2_fs_btree_gc_init_early(struct bch_fs *);
#endif /* _BCACHEFS_BTREE_GC_H */
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 60782f3e5aec..57eff3012a7b 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "async_objs.h"
#include "bkey_buf.h"
#include "bkey_methods.h"
#include "bkey_sort.h"
@@ -13,6 +14,7 @@
#include "buckets.h"
#include "checksum.h"
#include "debug.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "extents.h"
#include "io_write.h"
@@ -514,19 +516,23 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
struct bch_dev *ca,
+ bool print_pos,
struct btree *b, struct bset *i, struct bkey_packed *k,
- unsigned offset, int write)
+ unsigned offset, int rw)
{
- prt_printf(out, bch2_log_msg(c, "%s"),
- write == READ
- ? "error validating btree node "
- : "corrupt btree node before write ");
+ if (print_pos) {
+ prt_str(out, rw == READ
+ ? "error validating btree node "
+ : "corrupt btree node before write ");
+ prt_printf(out, "at btree ");
+ bch2_btree_pos_to_text(out, c, b);
+ prt_newline(out);
+ }
+
if (ca)
- prt_printf(out, "on %s ", ca->name);
- prt_printf(out, "at btree ");
- bch2_btree_pos_to_text(out, c, b);
+ prt_printf(out, "%s ", ca->name);
- prt_printf(out, "\nnode offset %u/%u",
+ prt_printf(out, "node offset %u/%u",
b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)));
if (i)
prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
@@ -537,75 +543,110 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
prt_str(out, ": ");
}
-__printf(10, 11)
+__printf(11, 12)
static int __btree_err(int ret,
struct bch_fs *c,
struct bch_dev *ca,
struct btree *b,
struct bset *i,
struct bkey_packed *k,
- int write,
- bool have_retry,
+ int rw,
enum bch_sb_error_id err_type,
+ struct bch_io_failures *failed,
+ struct printbuf *err_msg,
const char *fmt, ...)
{
- bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes;
+ if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
+ return bch_err_throw(c, fsck_fix);
+
+ bool have_retry = false;
+ int ret2;
+
+ if (ca) {
+ bch2_mark_btree_validate_failure(failed, ca->dev_idx);
+
+ struct extent_ptr_decoded pick;
+ have_retry = !bch2_bkey_pick_read_device(c,
+ bkey_i_to_s_c(&b->key),
+ failed, &pick, -1);
+ }
if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
- ret = -BCH_ERR_btree_node_read_err_fixable;
+ ret = bch_err_throw(c, btree_node_read_err_fixable);
if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
- ret = -BCH_ERR_btree_node_read_err_bad_node;
+ ret = bch_err_throw(c, btree_node_read_err_bad_node);
+
+ bch2_sb_error_count(c, err_type);
- if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable)
- bch2_sb_error_count(c, err_type);
+ bool print_deferred = err_msg &&
+ rw == READ &&
+ !(test_bit(BCH_FS_in_fsck, &c->flags) &&
+ c->opts.fix_errors == FSCK_FIX_ask);
struct printbuf out = PRINTBUF;
- if (write != WRITE && ret != -BCH_ERR_btree_node_read_err_fixable) {
- printbuf_indent_add_nextline(&out, 2);
-#ifdef BCACHEFS_LOG_PREFIX
- prt_printf(&out, bch2_log_msg(c, ""));
-#endif
- }
+ bch2_log_msg_start(c, &out);
+
+ if (!print_deferred)
+ err_msg = &out;
- btree_err_msg(&out, c, ca, b, i, k, b->written, write);
+ btree_err_msg(err_msg, c, ca, !print_deferred, b, i, k, b->written, rw);
va_list args;
va_start(args, fmt);
- prt_vprintf(&out, fmt, args);
+ prt_vprintf(err_msg, fmt, args);
va_end(args);
- if (write == WRITE) {
+ if (print_deferred) {
+ prt_newline(err_msg);
+
+ switch (ret) {
+ case -BCH_ERR_btree_node_read_err_fixable:
+ ret2 = bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type);
+ if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) &&
+ !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) {
+ ret = ret2;
+ goto fsck_err;
+ }
+
+ if (!have_retry)
+ ret = bch_err_throw(c, fsck_fix);
+ goto out;
+ case -BCH_ERR_btree_node_read_err_bad_node:
+ prt_str(&out, ", ");
+ ret = __bch2_topology_error(c, &out);
+ break;
+ }
+
+ goto out;
+ }
+
+ if (rw == WRITE) {
prt_str(&out, ", ");
ret = __bch2_inconsistent_error(c, &out)
? -BCH_ERR_fsck_errors_not_fixed
: 0;
- silent = false;
+ goto print;
}
switch (ret) {
case -BCH_ERR_btree_node_read_err_fixable:
- ret = !silent
- ? __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf)
- : -BCH_ERR_fsck_fix;
- if (ret != -BCH_ERR_fsck_fix &&
- ret != -BCH_ERR_fsck_ignore)
+ ret2 = __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf);
+ if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) &&
+ !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) {
+ ret = ret2;
goto fsck_err;
- ret = -BCH_ERR_fsck_fix;
+ }
+
+ if (!have_retry)
+ ret = bch_err_throw(c, fsck_fix);
goto out;
case -BCH_ERR_btree_node_read_err_bad_node:
prt_str(&out, ", ");
ret = __bch2_topology_error(c, &out);
- if (ret)
- silent = false;
- break;
- case -BCH_ERR_btree_node_read_err_incompatible:
- ret = -BCH_ERR_fsck_errors_not_fixed;
- silent = false;
break;
}
-
- if (!silent)
- bch2_print_string_as_lines(KERN_ERR, out.buf);
+print:
+ bch2_print_str(c, KERN_ERR, out.buf);
out:
fsck_err:
printbuf_exit(&out);
@@ -614,16 +655,17 @@ fsck_err:
#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \
({ \
- int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry, \
+ int _ret = __btree_err(type, c, ca, b, i, k, write, \
BCH_FSCK_ERR_##_err_type, \
+ failed, err_msg, \
msg, ##__VA_ARGS__); \
\
- if (_ret != -BCH_ERR_fsck_fix) { \
+ if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix)) { \
ret = _ret; \
goto fsck_err; \
} \
\
- *saw_error = true; \
+ true; \
})
#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false)
@@ -681,8 +723,9 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
struct btree *b, struct bset *i,
- unsigned offset, unsigned sectors,
- int write, bool have_retry, bool *saw_error)
+ unsigned offset, unsigned sectors, int write,
+ struct bch_io_failures *failed,
+ struct printbuf *err_msg)
{
unsigned version = le16_to_cpu(i->version);
unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
@@ -895,7 +938,8 @@ static inline int btree_node_read_bkey_cmp(const struct btree *b,
static int validate_bset_keys(struct bch_fs *c, struct btree *b,
struct bset *i, int write,
- bool have_retry, bool *saw_error)
+ struct bch_io_failures *failed,
+ struct printbuf *err_msg)
{
unsigned version = le16_to_cpu(i->version);
struct bkey_packed *k, *prev = NULL;
@@ -1008,7 +1052,9 @@ fsck_err:
}
int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
- struct btree *b, bool have_retry, bool *saw_error)
+ struct btree *b,
+ struct bch_io_failures *failed,
+ struct printbuf *err_msg)
{
struct btree_node_entry *bne;
struct sort_iter *iter;
@@ -1018,11 +1064,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
bool used_mempool, blacklisted;
bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
- unsigned u64s;
unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
u64 max_journal_seq = 0;
struct printbuf buf = PRINTBUF;
- int ret = 0, retry_read = 0, write = READ;
+ int ret = 0, write = READ;
u64 start_time = local_clock();
b->version_ondisk = U16_MAX;
@@ -1156,15 +1201,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
b->version_ondisk = min(b->version_ondisk,
le16_to_cpu(i->version));
- ret = validate_bset(c, ca, b, i, b->written, sectors,
- READ, have_retry, saw_error);
+ ret = validate_bset(c, ca, b, i, b->written, sectors, READ, failed, err_msg);
if (ret)
goto fsck_err;
if (!b->written)
btree_node_set_format(b, b->data->format);
- ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error);
+ ret = validate_bset_keys(c, b, i, READ, failed, err_msg);
if (ret)
goto fsck_err;
@@ -1225,23 +1269,20 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
sorted->keys.u64s = 0;
- set_btree_bset(b, b->set, &b->data->keys);
-
b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0,
btree_buf_bytes(b) -
sizeof(struct btree_node) -
b->nr.live_u64s * sizeof(u64));
- u64s = le16_to_cpu(sorted->keys.u64s);
+ b->data->keys.u64s = sorted->keys.u64s;
*sorted = *b->data;
- sorted->keys.u64s = cpu_to_le16(u64s);
swap(sorted, b->data);
set_btree_bset(b, b->set, &b->data->keys);
b->nsets = 1;
b->data->keys.journal_seq = cpu_to_le64(max_journal_seq);
- BUG_ON(b->nr.live_u64s != u64s);
+ BUG_ON(b->nr.live_u64s != le16_to_cpu(b->data->keys.u64s));
btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);
@@ -1255,7 +1296,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
ret = btree_node_bkey_val_validate(c, b, u.s_c, READ);
if (ret == -BCH_ERR_fsck_delete_bkey ||
- (bch2_inject_invalid_keys &&
+ (static_branch_unlikely(&bch2_inject_invalid_keys) &&
!bversion_cmp(u.k->bversion, MAX_VERSION))) {
btree_keys_account_key_drop(&b->nr, 0, k);
@@ -1284,31 +1325,21 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_node_reset_sib_u64s(b);
- rcu_read_lock();
- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
- struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
+ scoped_guard(rcu)
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
+ struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
- if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw)
- set_btree_node_need_rewrite(b);
- }
- rcu_read_unlock();
+ if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw)
+ set_btree_node_need_rewrite(b);
+ }
if (!ptr_written)
set_btree_node_need_rewrite(b);
-out:
+fsck_err:
mempool_free(iter, &c->fill_iter);
printbuf_exit(&buf);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
- return retry_read;
-fsck_err:
- if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
- ret == -BCH_ERR_btree_node_read_err_must_retry) {
- retry_read = 1;
- } else {
- set_btree_node_read_error(b);
- bch2_btree_lost_data(c, b->c.btree_id);
- }
- goto out;
+ return ret;
}
static void btree_node_read_work(struct work_struct *work)
@@ -1320,16 +1351,26 @@ static void btree_node_read_work(struct work_struct *work)
struct btree *b = rb->b;
struct bio *bio = &rb->bio;
struct bch_io_failures failed = { .nr = 0 };
+ int ret = 0;
+
struct printbuf buf = PRINTBUF;
- bool saw_error = false;
- bool retry = false;
- bool can_retry;
+ bch2_log_msg_start(c, &buf);
+
+ prt_printf(&buf, "btree node read error at btree ");
+ bch2_btree_pos_to_text(&buf, c, b);
+ prt_newline(&buf);
goto start;
while (1) {
- retry = true;
- bch_info(c, "retrying read");
- ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ);
+ ret = bch2_bkey_pick_read_device(c,
+ bkey_i_to_s_c(&b->key),
+ &failed, &rb->pick, -1);
+ if (ret) {
+ set_btree_node_read_error(b);
+ break;
+ }
+
+ ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
rb->have_ioref = ca != NULL;
rb->start_time = local_clock();
bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
@@ -1346,59 +1387,59 @@ static void btree_node_read_work(struct work_struct *work)
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
rb->start_time, !bio->bi_status);
start:
- printbuf_reset(&buf);
- bch2_btree_pos_to_text(&buf, c, b);
-
- if (ca && bio->bi_status)
- bch_err_dev_ratelimited(ca,
- "btree read error %s for %s",
- bch2_blk_status_to_str(bio->bi_status), buf.buf);
if (rb->have_ioref)
- percpu_ref_put(&ca->io_ref[READ]);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_read);
rb->have_ioref = false;
- bch2_mark_io_failure(&failed, &rb->pick, false);
-
- can_retry = bch2_bkey_pick_read_device(c,
- bkey_i_to_s_c(&b->key),
- &failed, &rb->pick, -1) > 0;
-
- if (!bio->bi_status &&
- !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) {
- if (retry)
- bch_info(c, "retry success");
- break;
+ if (bio->bi_status) {
+ bch2_mark_io_failure(&failed, &rb->pick, false);
+ continue;
}
- saw_error = true;
+ ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf);
+ if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
+ ret == -BCH_ERR_btree_node_read_err_must_retry)
+ continue;
- if (!can_retry) {
+ if (ret)
set_btree_node_read_error(b);
- bch2_btree_lost_data(c, b->c.btree_id);
- break;
- }
+
+ break;
}
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
- rb->start_time);
- bio_put(&rb->bio);
+ bch2_io_failures_to_text(&buf, c, &failed);
+
+ if (btree_node_read_error(b))
+ bch2_btree_lost_data(c, &buf, b->c.btree_id);
+
+ /*
+ * only print retry success if we read from a replica with no errors
+ */
+ if (btree_node_read_error(b))
+ prt_printf(&buf, "ret %s", bch2_err_str(ret));
+ else if (failed.nr) {
+ if (!bch2_dev_io_failures(&failed, rb->pick.ptr.dev))
+ prt_printf(&buf, "retry success");
+ else
+ prt_printf(&buf, "repair success");
+ }
- if ((saw_error ||
+ if ((failed.nr ||
btree_node_need_rewrite(b)) &&
!btree_node_read_error(b) &&
- c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
- if (saw_error) {
- printbuf_reset(&buf);
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_str(&buf, " ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s",
- __func__, buf.buf);
- }
-
+ c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
+ prt_printf(&buf, " (rewriting node)");
bch2_btree_node_rewrite_async(c, b);
}
+ prt_newline(&buf);
+
+ if (failed.nr)
+ bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
+ async_object_list_del(c, btree_read_bio, rb->list_idx);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
+ rb->start_time);
+ bio_put(&rb->bio);
printbuf_exit(&buf);
clear_btree_node_read_in_flight(b);
smp_mb__after_atomic();
@@ -1419,6 +1460,11 @@ static void btree_node_read_endio(struct bio *bio)
queue_work(c->btree_read_complete_wq, &rb->work);
}
+void bch2_btree_read_bio_to_text(struct printbuf *out, struct btree_read_bio *rbio)
+{
+ bch2_bio_to_text(out, &rbio->bio);
+}
+
struct btree_node_read_all {
struct closure cl;
struct bch_fs *c;
@@ -1478,12 +1524,13 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
struct btree *b = ra->b;
struct printbuf buf = PRINTBUF;
bool dump_bset_maps = false;
- bool have_retry = false;
int ret = 0, best = -1, write = READ;
unsigned i, written = 0, written2 = 0;
__le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
bool _saw_error = false, *saw_error = &_saw_error;
+ struct printbuf *err_msg = NULL;
+ struct bch_io_failures *failed = NULL;
for (i = 0; i < ra->nr; i++) {
struct btree_node *bn = ra->buf[i];
@@ -1576,14 +1623,19 @@ fsck_err:
if (best >= 0) {
memcpy(b->data, ra->buf[best], btree_buf_bytes(b));
- ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
+ ret = bch2_btree_node_read_done(c, NULL, b, NULL, NULL);
} else {
ret = -1;
}
if (ret) {
set_btree_node_read_error(b);
- bch2_btree_lost_data(c, b->c.btree_id);
+
+ struct printbuf buf = PRINTBUF;
+ bch2_btree_lost_data(c, &buf, b->c.btree_id);
+ if (buf.pos)
+ bch_err(c, "%s", buf.buf);
+ printbuf_exit(&buf);
} else if (*saw_error)
bch2_btree_node_rewrite_async(c, b);
@@ -1612,7 +1664,8 @@ static void btree_node_read_all_replicas_endio(struct bio *bio)
struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
bch2_latency_acct(ca, rb->start_time, READ);
- percpu_ref_put(&ca->io_ref[READ]);
+ enumerated_ref_put(&ca->io_ref[READ],
+ BCH_DEV_READ_REF_btree_node_read_all_replicas);
}
ra->err[rb->idx] = bio->bi_status;
@@ -1634,7 +1687,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
ra = kzalloc(sizeof(*ra), GFP_NOFS);
if (!ra)
- return -BCH_ERR_ENOMEM_btree_node_read_all_replicas;
+ return bch_err_throw(c, ENOMEM_btree_node_read_all_replicas);
closure_init(&ra->cl, NULL);
ra->c = c;
@@ -1652,7 +1705,8 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
+ BCH_DEV_READ_REF_btree_node_read_all_replicas);
struct btree_read_bio *rb =
container_of(ra->bio[i], struct btree_read_bio, bio);
rb->c = c;
@@ -1703,7 +1757,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
trace_and_count(c, btree_node_read, trans, b);
- if (bch2_verify_all_btree_replicas &&
+ if (static_branch_unlikely(&bch2_verify_all_btree_replicas) &&
!btree_node_read_all_replicas(c, b, sync))
return;
@@ -1711,26 +1765,34 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
NULL, &pick, -1);
if (ret <= 0) {
+ bool ratelimit = true;
struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
prt_str(&buf, "btree node read error: no device to read from\n at ");
bch2_btree_pos_to_text(&buf, c, b);
- bch_err_ratelimited(c, "%s", buf.buf);
-
- if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
- bch2_fatal_error(c);
+ prt_newline(&buf);
+ bch2_btree_lost_data(c, &buf, b->c.btree_id);
+
+ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
+ bch2_fs_emergency_read_only2(c, &buf))
+ ratelimit = false;
+
+ static DEFINE_RATELIMIT_STATE(rs,
+ DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+ if (!ratelimit || __ratelimit(&rs))
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
set_btree_node_read_error(b);
- bch2_btree_lost_data(c, b->c.btree_id);
clear_btree_node_read_in_flight(b);
smp_mb__after_atomic();
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
- printbuf_exit(&buf);
return;
}
- ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
bio = bio_alloc_bioset(NULL,
buf_pages(b->data, btree_buf_bytes(b)),
@@ -1749,6 +1811,8 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
bio->bi_end_io = btree_node_read_endio;
bch2_bio_map(bio, b->data, btree_buf_bytes(b));
+ async_object_list_add(c, btree_read_bio, rb, &rb->list_idx);
+
if (rb->have_ioref) {
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
bio_sectors(bio));
@@ -1805,7 +1869,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_unlock(&c->btree_cache.lock);
- ret = -BCH_ERR_btree_node_read_error;
+ ret = bch_err_throw(c, btree_node_read_error);
goto err;
}
@@ -1922,7 +1986,7 @@ static void btree_node_scrub_work(struct work_struct *work)
bch_err(c, "error validating btree node during scrub on %s at btree %s",
scrub->ca->name, err.buf);
- ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0);
}
err:
bch2_trans_iter_exit(trans, &iter);
@@ -1933,9 +1997,9 @@ err:
printbuf_exit(&err);
bch2_bkey_buf_exit(&scrub->key, c);;
btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf);
- percpu_ref_put(&scrub->ca->io_ref[READ]);
+ enumerated_ref_put(&scrub->ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
kfree(scrub);
- bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
}
static void btree_node_scrub_endio(struct bio *bio)
@@ -1954,17 +2018,18 @@ int bch2_btree_node_scrub(struct btree_trans *trans,
struct bch_fs *c = trans->c;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_node_scrub))
- return -BCH_ERR_erofs_no_writes;
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_node_scrub))
+ return bch_err_throw(c, erofs_no_writes);
struct extent_ptr_decoded pick;
int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev);
if (ret <= 0)
goto err;
- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
+ BCH_DEV_READ_REF_btree_node_scrub);
if (!ca) {
- ret = -BCH_ERR_device_offline;
+ ret = bch_err_throw(c, device_offline);
goto err;
}
@@ -2002,9 +2067,9 @@ int bch2_btree_node_scrub(struct btree_trans *trans,
return 0;
err_free:
btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf);
- percpu_ref_put(&ca->io_ref[READ]);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
err:
- bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
return ret;
}
@@ -2101,7 +2166,7 @@ static void btree_node_write_work(struct work_struct *work)
bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
- ret = -BCH_ERR_btree_node_write_all_failed;
+ ret = bch_err_throw(c, btree_node_write_all_failed);
goto err;
}
@@ -2121,6 +2186,7 @@ static void btree_node_write_work(struct work_struct *work)
goto err;
}
out:
+ async_object_list_del(c, btree_write_bio, wbio->list_idx);
bio_put(&wbio->wbio.bio);
btree_node_write_done(c, b, start_time);
return;
@@ -2172,7 +2238,8 @@ static void btree_node_write_endio(struct bio *bio)
* btree writes yet (due to device removal/ro):
*/
if (wbio->have_ioref)
- percpu_ref_put(&ca->io_ref[READ]);
+ enumerated_ref_put(&ca->io_ref[READ],
+ BCH_DEV_READ_REF_btree_node_write);
if (parent) {
bio_put(bio);
@@ -2184,14 +2251,12 @@ static void btree_node_write_endio(struct bio *bio)
smp_mb__after_atomic();
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
INIT_WORK(&wb->work, btree_node_write_work);
- queue_work(c->btree_io_complete_wq, &wb->work);
+ queue_work(c->btree_write_complete_wq, &wb->work);
}
static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
struct bset *i, unsigned sectors)
{
- bool saw_error;
-
int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key),
(struct bkey_validate_context) {
.from = BKEY_VALIDATE_btree_node,
@@ -2204,8 +2269,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
return ret;
}
- ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?:
- validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error);
+ ret = validate_bset_keys(c, b, i, WRITE, NULL, NULL) ?:
+ validate_bset(c, NULL, b, i, b->written, sectors, WRITE, NULL, NULL);
if (ret) {
bch2_inconsistent_error(c);
dump_stack();
@@ -2472,6 +2537,8 @@ do_write:
atomic64_inc(&c->btree_write_stats[type].nr);
atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);
+ async_object_list_add(c, btree_write_bio, wbio, &wbio->list_idx);
+
INIT_WORK(&wbio->work, btree_write_submit);
queue_work(c->btree_write_submit_wq, &wbio->work);
return;
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index dbf76d22c660..30a5180532c8 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -41,6 +41,9 @@ struct btree_read_bio {
u64 start_time;
unsigned have_ioref:1;
unsigned idx:7;
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
+ unsigned list_idx;
+#endif
struct extent_ptr_decoded pick;
struct work_struct work;
struct bio bio;
@@ -53,6 +56,9 @@ struct btree_write_bio {
unsigned data_bytes;
unsigned sector_offset;
u64 start_time;
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
+ unsigned list_idx;
+#endif
struct bch_write_bio wbio;
};
@@ -128,11 +134,15 @@ void bch2_btree_build_aux_trees(struct btree *);
void bch2_btree_init_next(struct btree_trans *, struct btree *);
int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
- struct btree *, bool, bool *);
+ struct btree *,
+ struct bch_io_failures *,
+ struct printbuf *);
void bch2_btree_node_read(struct btree_trans *, struct btree *, bool);
int bch2_btree_root_read(struct bch_fs *, enum btree_id,
const struct bkey_i *, unsigned);
+void bch2_btree_read_bio_to_text(struct printbuf *, struct btree_read_bio *);
+
int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, unsigned);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index ac5f2046550d..b78403376c07 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -16,6 +16,7 @@
#include "journal_io.h"
#include "replicas.h"
#include "snapshot.h"
+#include "super.h"
#include "trace.h"
#include <linux/random.h>
@@ -114,11 +115,9 @@ static inline bool btree_path_pos_in_node(struct btree_path *path,
!btree_path_pos_after_node(path, b);
}
-/* Btree iterator: */
+/* Debug: */
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-static void bch2_btree_path_verify_cached(struct btree_trans *trans,
+static void __bch2_btree_path_verify_cached(struct btree_trans *trans,
struct btree_path *path)
{
struct bkey_cached *ck;
@@ -135,7 +134,7 @@ static void bch2_btree_path_verify_cached(struct btree_trans *trans,
btree_node_unlock(trans, path, 0);
}
-static void bch2_btree_path_verify_level(struct btree_trans *trans,
+static void __bch2_btree_path_verify_level(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
struct btree_path_level *l;
@@ -147,16 +146,13 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans,
struct printbuf buf3 = PRINTBUF;
const char *msg;
- if (!bch2_debug_check_iterators)
- return;
-
l = &path->l[level];
tmp = l->iter;
locked = btree_node_locked(path, level);
if (path->cached) {
if (!level)
- bch2_btree_path_verify_cached(trans, path);
+ __bch2_btree_path_verify_cached(trans, path);
return;
}
@@ -217,7 +213,7 @@ err:
msg, level, buf1.buf, buf2.buf, buf3.buf);
}
-static void bch2_btree_path_verify(struct btree_trans *trans,
+static void __bch2_btree_path_verify(struct btree_trans *trans,
struct btree_path *path)
{
struct bch_fs *c = trans->c;
@@ -229,22 +225,22 @@ static void bch2_btree_path_verify(struct btree_trans *trans,
break;
}
- bch2_btree_path_verify_level(trans, path, i);
+ __bch2_btree_path_verify_level(trans, path, i);
}
- bch2_btree_path_verify_locks(path);
+ bch2_btree_path_verify_locks(trans, path);
}
-void bch2_trans_verify_paths(struct btree_trans *trans)
+void __bch2_trans_verify_paths(struct btree_trans *trans)
{
struct btree_path *path;
unsigned iter;
trans_for_each_path(trans, path, iter)
- bch2_btree_path_verify(trans, path);
+ __bch2_btree_path_verify(trans, path);
}
-static void bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter)
+static void __bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter)
{
BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached);
@@ -256,11 +252,11 @@ static void bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter
!btree_type_has_snapshot_field(iter->btree_id));
if (iter->update_path)
- bch2_btree_path_verify(trans, &trans->paths[iter->update_path]);
- bch2_btree_path_verify(trans, btree_iter_path(trans, iter));
+ __bch2_btree_path_verify(trans, &trans->paths[iter->update_path]);
+ __bch2_btree_path_verify(trans, btree_iter_path(trans, iter));
}
-static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
+static void __bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
{
BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) &&
!iter->pos.snapshot);
@@ -274,16 +270,13 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
bkey_gt(iter->pos, iter->k.p)));
}
-static int bch2_btree_iter_verify_ret(struct btree_trans *trans,
- struct btree_iter *iter, struct bkey_s_c k)
+static int __bch2_btree_iter_verify_ret(struct btree_trans *trans,
+ struct btree_iter *iter, struct bkey_s_c k)
{
struct btree_iter copy;
struct bkey_s_c prev;
int ret = 0;
- if (!bch2_debug_check_iterators)
- return 0;
-
if (!(iter->flags & BTREE_ITER_filter_snapshots))
return 0;
@@ -324,7 +317,7 @@ out:
return ret;
}
-void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
+void __bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
struct bpos pos)
{
bch2_trans_verify_not_unlocked_or_in_restart(trans);
@@ -357,19 +350,40 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf);
}
-#else
-
static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
- struct btree_path *path, unsigned l) {}
+ struct btree_path *path, unsigned l)
+{
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
+ __bch2_btree_path_verify_level(trans, path, l);
+}
+
static inline void bch2_btree_path_verify(struct btree_trans *trans,
- struct btree_path *path) {}
+ struct btree_path *path)
+{
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
+ __bch2_btree_path_verify(trans, path);
+}
+
static inline void bch2_btree_iter_verify(struct btree_trans *trans,
- struct btree_iter *iter) {}
-static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
-static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c k) { return 0; }
+ struct btree_iter *iter)
+{
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
+ __bch2_btree_iter_verify(trans, iter);
+}
-#endif
+static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
+{
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
+ __bch2_btree_iter_verify_entry_exit(iter);
+}
+
+static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ return static_branch_unlikely(&bch2_debug_check_iterators)
+ ? __bch2_btree_iter_verify_ret(trans, iter, k)
+ : 0;
+}
/* Btree path: fixups after btree updates */
@@ -523,7 +537,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans,
__bch2_btree_node_iter_fix(path, b, node_iter, t,
where, clobber_u64s, new_u64s);
- if (bch2_debug_check_iterators)
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
bch2_btree_node_iter_verify(node_iter, b);
}
@@ -876,8 +890,7 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
struct btree_path *path,
- unsigned flags,
- struct bkey_buf *out)
+ unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_path_level *l = path_l(path);
@@ -901,7 +914,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
goto err;
}
- bch2_bkey_buf_reassemble(out, c, k);
+ bkey_reassemble(&trans->btree_path_down, k);
if ((flags & BTREE_ITER_prefetch) &&
c->opts.btree_node_prefetch)
@@ -912,6 +925,22 @@ err:
return ret;
}
+static noinline_for_stack int btree_node_missing_err(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "node not found at pos ");
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_str(&buf, " within parent node ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&path_l(path)->b->key));
+
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ return bch_err_throw(c, btree_need_topology_repair);
+}
+
static __always_inline int btree_path_down(struct btree_trans *trans,
struct btree_path *path,
unsigned flags,
@@ -922,51 +951,38 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
struct btree *b;
unsigned level = path->level - 1;
enum six_lock_type lock_type = __btree_lock_want(path, level);
- struct bkey_buf tmp;
int ret;
EBUG_ON(!btree_node_locked(path, path->level));
- bch2_bkey_buf_init(&tmp);
-
if (unlikely(trans->journal_replay_not_finished)) {
- ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
+ ret = btree_node_iter_and_journal_peek(trans, path, flags);
if (ret)
- goto err;
+ return ret;
} else {
struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
- if (!k) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "node not found at pos ");
- bch2_bpos_to_text(&buf, path->pos);
- prt_str(&buf, " within parent node ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key));
-
- bch2_fs_fatal_error(c, "%s", buf.buf);
- printbuf_exit(&buf);
- ret = -BCH_ERR_btree_need_topology_repair;
- goto err;
- }
+ if (unlikely(!k))
+ return btree_node_missing_err(trans, path);
- bch2_bkey_buf_unpack(&tmp, c, l->b, k);
+ bch2_bkey_unpack(l->b, &trans->btree_path_down, k);
- if ((flags & BTREE_ITER_prefetch) &&
+ if (unlikely((flags & BTREE_ITER_prefetch)) &&
c->opts.btree_node_prefetch) {
ret = btree_path_prefetch(trans, path);
if (ret)
- goto err;
+ return ret;
}
}
- b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
+ b = bch2_btree_node_get(trans, path, &trans->btree_path_down,
+ level, lock_type, trace_ip);
ret = PTR_ERR_OR_ZERO(b);
if (unlikely(ret))
- goto err;
+ return ret;
- if (likely(!trans->journal_replay_not_finished &&
- tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
- unlikely(b != btree_node_mem_ptr(tmp.k)))
+ if (unlikely(b != btree_node_mem_ptr(&trans->btree_path_down)) &&
+ likely(!trans->journal_replay_not_finished &&
+ trans->btree_path_down.k.type == KEY_TYPE_btree_ptr_v2))
btree_node_mem_ptr_set(trans, path, level + 1, b);
if (btree_node_read_locked(path, level + 1))
@@ -977,10 +993,8 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
path->level = level;
bch2_btree_path_level_init(trans, path, b);
- bch2_btree_path_verify_locks(path);
-err:
- bch2_bkey_buf_exit(&tmp, c);
- return ret;
+ bch2_btree_path_verify_locks(trans, path);
+ return 0;
}
static int bch2_btree_path_traverse_all(struct btree_trans *trans)
@@ -992,7 +1006,7 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans)
int ret = 0;
if (trans->in_traverse_all)
- return -BCH_ERR_transaction_restart_in_traverse_all;
+ return bch_err_throw(c, transaction_restart_in_traverse_all);
trans->in_traverse_all = true;
retry_all:
@@ -1089,7 +1103,7 @@ static void btree_path_set_level_down(struct btree_trans *trans,
if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
btree_node_unlock(trans, path, l);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
bch2_btree_path_verify(trans, path);
}
@@ -1287,7 +1301,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
if (unlikely(path->cached)) {
btree_node_unlock(trans, path, 0);
path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
goto out;
}
@@ -1316,7 +1330,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
}
if (unlikely(level != path->level)) {
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
__bch2_btree_path_unlock(trans, path);
}
out:
@@ -1385,45 +1399,45 @@ static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_p
void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent)
{
- struct btree_path *path = trans->paths + path_idx, *dup;
+ struct btree_path *path = trans->paths + path_idx, *dup = NULL;
if (!__btree_path_put(trans, path, intent))
return;
+ if (!path->preserve && !path->should_be_locked)
+ goto free;
+
dup = path->preserve
? have_path_at_pos(trans, path)
: have_node_at_pos(trans, path);
-
- trace_btree_path_free(trans, path_idx, dup);
-
- if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
+ if (!dup)
return;
- if (path->should_be_locked && !trans->restarted) {
- if (!dup)
- return;
-
+ /*
+ * If we need this path locked, the duplicate also has te be locked
+ * before we free this one:
+ */
+ if (path->should_be_locked &&
+ !dup->should_be_locked &&
+ !trans->restarted) {
if (!(trans->locked
? bch2_btree_path_relock_norestart(trans, dup)
: bch2_btree_path_can_relock(trans, dup)))
return;
- }
- if (dup) {
- dup->preserve |= path->preserve;
- dup->should_be_locked |= path->should_be_locked;
+ dup->should_be_locked = true;
}
- __bch2_path_free(trans, path_idx);
-}
-
-static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path,
- bool intent)
-{
- if (!__btree_path_put(trans, trans->paths + path, intent))
- return;
+ BUG_ON(path->should_be_locked &&
+ !trans->restarted &&
+ trans->locked &&
+ !btree_node_locked(dup, dup->level));
- __bch2_path_free(trans, path);
+ path->should_be_locked = false;
+ dup->preserve |= path->preserve;
+free:
+ trace_btree_path_free(trans, path_idx, dup);
+ __bch2_path_free(trans, path_idx);
}
void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
@@ -1485,7 +1499,7 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
prt_newline(buf);
}
- for (struct jset_entry *e = trans->journal_entries;
+ for (struct jset_entry *e = btree_trans_journal_entries_start(trans);
e != btree_trans_journal_entries_top(trans);
e = vstruct_next(e)) {
bch2_journal_entry_to_text(buf, trans->c, e);
@@ -1591,7 +1605,7 @@ void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
__bch2_trans_paths_to_text(&buf, trans, nosort);
bch2_trans_updates_to_text(&buf, trans);
- bch2_print_str(trans->c, buf.buf);
+ bch2_print_str(trans->c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
}
@@ -1735,6 +1749,10 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
btree_trans_sort_paths(trans);
+ if (intent)
+ locks_want = max(locks_want, level + 1);
+ locks_want = min(locks_want, BTREE_MAX_DEPTH);
+
trans_for_each_path_inorder(trans, path, iter) {
if (__btree_path_cmp(path,
btree_id,
@@ -1749,7 +1767,8 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
if (path_pos &&
trans->paths[path_pos].cached == cached &&
trans->paths[path_pos].btree_id == btree_id &&
- trans->paths[path_pos].level == level) {
+ trans->paths[path_pos].level == level &&
+ bch2_btree_path_upgrade_norestart(trans, trans->paths + path_pos, locks_want)) {
trace_btree_path_get(trans, trans->paths + path_pos, &pos);
__btree_path_get(trans, trans->paths + path_pos, intent);
@@ -1781,9 +1800,6 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
if (!(flags & BTREE_ITER_nopreserve))
path->preserve = true;
- if (path->intent_ref)
- locks_want = max(locks_want, level + 1);
-
/*
* If the path has locks_want greater than requested, we don't downgrade
* it here - on transaction restart because btree node split needs to
@@ -1792,10 +1808,6 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
* a successful transaction commit.
*/
- locks_want = min(locks_want, BTREE_MAX_DEPTH);
- if (locks_want > path->locks_want)
- bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);
-
return path_idx;
}
@@ -1967,6 +1979,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_
/* got to end? */
if (!btree_path_node(path, path->level + 1)) {
+ path->should_be_locked = false;
btree_path_set_level_up(trans, path);
return NULL;
}
@@ -1978,12 +1991,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_
bch2_btree_path_downgrade(trans, path);
if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
+ trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
__bch2_btree_path_unlock(trans, path);
path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
- trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
goto err;
}
@@ -2344,8 +2357,7 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree
}
if (iter->update_path) {
- bch2_path_put_nokeep(trans, iter->update_path,
- iter->flags & BTREE_ITER_intent);
+ bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent);
iter->update_path = 0;
}
@@ -2374,8 +2386,8 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree
if (iter->update_path &&
!bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
- bch2_path_put_nokeep(trans, iter->update_path,
- iter->flags & BTREE_ITER_intent);
+ bch2_path_put(trans, iter->update_path,
+ iter->flags & BTREE_ITER_intent);
iter->update_path = 0;
}
@@ -2634,7 +2646,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct
* the last possible snapshot overwrite, return
* it:
*/
- bch2_path_put_nokeep(trans, iter->path,
+ bch2_path_put(trans, iter->path,
iter->flags & BTREE_ITER_intent);
iter->path = saved_path;
saved_path = 0;
@@ -2664,8 +2676,8 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct
* our previous saved candidate:
*/
if (saved_path) {
- bch2_path_put_nokeep(trans, saved_path,
- iter->flags & BTREE_ITER_intent);
+ bch2_path_put(trans, saved_path,
+ iter->flags & BTREE_ITER_intent);
saved_path = 0;
}
@@ -2708,7 +2720,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct
iter->pos.snapshot = iter->snapshot;
out_no_locked:
if (saved_path)
- bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent);
+ bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_intent);
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(trans, iter);
@@ -2929,7 +2941,7 @@ static void btree_trans_verify_sorted(struct btree_trans *trans)
struct btree_path *path, *prev = NULL;
struct trans_for_each_path_inorder_iter iter;
- if (!bch2_debug_check_iterators)
+ if (!static_branch_unlikely(&bch2_debug_check_iterators))
return;
trans_for_each_path_inorder(trans, path, iter) {
@@ -3031,7 +3043,7 @@ static inline void btree_path_list_add(struct btree_trans *trans,
void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
{
if (iter->update_path)
- bch2_path_put_nokeep(trans, iter->update_path,
+ bch2_path_put(trans, iter->update_path,
iter->flags & BTREE_ITER_intent);
if (iter->path)
bch2_path_put(trans, iter->path,
@@ -3095,7 +3107,19 @@ void bch2_trans_copy_iter(struct btree_trans *trans,
dst->key_cache_path = 0;
}
-void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+void bch2_trans_kmalloc_trace_to_text(struct printbuf *out,
+ darray_trans_kmalloc_trace *trace)
+{
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 60);
+
+ darray_for_each(*trace, i)
+ prt_printf(out, "%pS\t%zu\n", (void *) i->ip, i->bytes);
+}
+#endif
+
+void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long ip)
{
struct bch_fs *c = trans->c;
unsigned new_top = trans->mem_top + size;
@@ -3105,14 +3129,35 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
void *new_mem;
void *p;
- WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
+ if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) {
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+ struct printbuf buf = PRINTBUF;
+ bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace);
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+#endif
+ }
ret = trans_maybe_inject_restart(trans, _RET_IP_);
if (ret)
return ERR_PTR(ret);
struct btree_transaction_stats *s = btree_trans_stats(trans);
- s->max_mem = max(s->max_mem, new_bytes);
+ if (new_bytes > s->max_mem) {
+ mutex_lock(&s->lock);
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+ darray_resize(&s->trans_kmalloc_trace, trans->trans_kmalloc_trace.nr);
+ s->trans_kmalloc_trace.nr = min(s->trans_kmalloc_trace.size,
+ trans->trans_kmalloc_trace.nr);
+
+ memcpy(s->trans_kmalloc_trace.data,
+ trans->trans_kmalloc_trace.data,
+ sizeof(s->trans_kmalloc_trace.data[0]) *
+ s->trans_kmalloc_trace.nr);
+#endif
+ s->max_mem = new_bytes;
+ mutex_unlock(&s->lock);
+ }
if (trans->used_mempool) {
if (trans->mem_bytes >= new_bytes)
@@ -3172,6 +3217,8 @@ out_new_mem:
BCH_ERR_transaction_restart_mem_realloced, _RET_IP_));
}
out_change_top:
+ bch2_trans_kmalloc_trace(trans, size, ip);
+
p = trans->mem + trans->mem_top;
trans->mem_top += size;
memset(p, 0, size);
@@ -3231,7 +3278,6 @@ u32 bch2_trans_begin(struct btree_trans *trans)
trans->restart_count++;
trans->mem_top = 0;
- trans->journal_entries = NULL;
trans_for_each_path(trans, path, i) {
path->should_be_locked = false;
@@ -3285,6 +3331,10 @@ u32 bch2_trans_begin(struct btree_trans *trans)
}
#endif
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+ trans->trans_kmalloc_trace.nr = 0;
+#endif
+
trans_set_locked(trans, false);
if (trans->restarted) {
@@ -3385,7 +3435,6 @@ got_trans:
}
trans->nr_paths_max = s->nr_max_paths;
- trans->journal_entries_size = s->journal_entries_size;
}
trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
@@ -3397,29 +3446,45 @@ got_trans:
return trans;
}
-static void check_btree_paths_leaked(struct btree_trans *trans)
-{
#ifdef CONFIG_BCACHEFS_DEBUG
- struct bch_fs *c = trans->c;
+
+static bool btree_paths_leaked(struct btree_trans *trans)
+{
struct btree_path *path;
unsigned i;
trans_for_each_path(trans, path, i)
if (path->ref)
- goto leaked;
- return;
-leaked:
- bch_err(c, "btree paths leaked from %s!", trans->fn);
- trans_for_each_path(trans, path, i)
- if (path->ref)
- printk(KERN_ERR " btree %s %pS\n",
- bch2_btree_id_str(path->btree_id),
- (void *) path->ip_allocated);
- /* Be noisy about this: */
- bch2_fatal_error(c);
-#endif
+ return true;
+ return false;
}
+static void check_btree_paths_leaked(struct btree_trans *trans)
+{
+ if (btree_paths_leaked(trans)) {
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
+ unsigned i;
+
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+
+ prt_printf(&buf, "btree paths leaked from %s!\n", trans->fn);
+ trans_for_each_path(trans, path, i)
+ if (path->ref)
+ prt_printf(&buf, "btree %s %pS\n",
+ bch2_btree_id_str(path->btree_id),
+ (void *) path->ip_allocated);
+
+ bch2_fs_emergency_read_only2(c, &buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ }
+}
+#else
+static inline void check_btree_paths_leaked(struct btree_trans *trans) {}
+#endif
+
void bch2_trans_put(struct btree_trans *trans)
__releases(&c->btree_trans_barrier)
{
@@ -3454,6 +3519,9 @@ void bch2_trans_put(struct btree_trans *trans)
#ifdef CONFIG_BCACHEFS_DEBUG
darray_exit(&trans->last_restarted_trace);
#endif
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+ darray_exit(&trans->trans_kmalloc_trace);
+#endif
unsigned long *paths_allocated = trans->paths_allocated;
trans->paths_allocated = NULL;
@@ -3500,13 +3568,12 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
struct btree_bkey_cached_common *b)
{
struct six_lock_count c = six_lock_counts(&b->lock);
- struct task_struct *owner;
pid_t pid;
- rcu_read_lock();
- owner = READ_ONCE(b->lock.owner);
- pid = owner ? owner->pid : 0;
- rcu_read_unlock();
+ scoped_guard(rcu) {
+ struct task_struct *owner = READ_ONCE(b->lock.owner);
+ pid = owner ? owner->pid : 0;
+ }
prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b');
bch2_btree_id_to_text(out, b->btree_id);
@@ -3535,7 +3602,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn);
/* trans->paths is rcu protected vs. freeing */
- rcu_read_lock();
+ guard(rcu)();
out->atomic++;
struct btree_path *paths = rcu_dereference(trans->paths);
@@ -3578,7 +3645,6 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
}
out:
--out->atomic;
- rcu_read_unlock();
}
void bch2_fs_btree_iter_exit(struct bch_fs *c)
@@ -3608,6 +3674,9 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
for (s = c->btree_transaction_stats;
s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
s++) {
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+ darray_exit(&s->trans_kmalloc_trace);
+#endif
kfree(s->max_paths_text);
bch2_time_stats_exit(&s->lock_hold_times);
}
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 9d2cccf5d21a..09dd3e52622e 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -46,9 +46,11 @@ static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path
return --path->ref == 0;
}
-static inline void btree_path_set_dirty(struct btree_path *path,
+static inline void btree_path_set_dirty(struct btree_trans *trans,
+ struct btree_path *path,
enum btree_path_uptodate u)
{
+ BUG_ON(path->should_be_locked && trans->locked && !trans->restarted);
path->uptodate = max_t(unsigned, path->uptodate, u);
}
@@ -285,14 +287,23 @@ static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex
: __bch2_trans_mutex_lock(trans, lock);
}
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_trans_verify_paths(struct btree_trans *);
-void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos);
-#else
-static inline void bch2_trans_verify_paths(struct btree_trans *trans) {}
-static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
- struct bpos pos) {}
-#endif
+/* Debug: */
+
+void __bch2_trans_verify_paths(struct btree_trans *);
+void __bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos);
+
+static inline void bch2_trans_verify_paths(struct btree_trans *trans)
+{
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
+ __bch2_trans_verify_paths(trans);
+}
+
+static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id btree,
+ struct bpos pos)
+{
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
+ __bch2_assert_pos_locked(trans, btree, pos);
+}
void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
struct btree *, struct bkey_packed *);
@@ -543,43 +554,73 @@ void bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *, struct btre
void bch2_set_btree_iter_dontneed(struct btree_trans *, struct btree_iter *);
-void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+void bch2_trans_kmalloc_trace_to_text(struct printbuf *,
+ darray_trans_kmalloc_trace *);
+#endif
-/**
- * bch2_trans_kmalloc - allocate memory for use by the current transaction
- *
- * Must be called after bch2_trans_begin, which on second and further calls
- * frees all memory allocated in this transaction
- */
-static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
+void *__bch2_trans_kmalloc(struct btree_trans *, size_t, unsigned long);
+
+static inline void bch2_trans_kmalloc_trace(struct btree_trans *trans, size_t size,
+ unsigned long ip)
+{
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+ darray_push(&trans->trans_kmalloc_trace,
+ ((struct trans_kmalloc_trace) { .ip = ip, .bytes = size }));
+#endif
+}
+
+static __always_inline void *bch2_trans_kmalloc_nomemzero_ip(struct btree_trans *trans, size_t size,
+ unsigned long ip)
{
size = roundup(size, 8);
+ bch2_trans_kmalloc_trace(trans, size, ip);
+
if (likely(trans->mem_top + size <= trans->mem_bytes)) {
void *p = trans->mem + trans->mem_top;
trans->mem_top += size;
- memset(p, 0, size);
return p;
} else {
- return __bch2_trans_kmalloc(trans, size);
+ return __bch2_trans_kmalloc(trans, size, ip);
}
}
-static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size)
+static __always_inline void *bch2_trans_kmalloc_ip(struct btree_trans *trans, size_t size,
+ unsigned long ip)
{
- size = round_up(size, 8);
+ size = roundup(size, 8);
+
+ bch2_trans_kmalloc_trace(trans, size, ip);
if (likely(trans->mem_top + size <= trans->mem_bytes)) {
void *p = trans->mem + trans->mem_top;
trans->mem_top += size;
+ memset(p, 0, size);
return p;
} else {
- return __bch2_trans_kmalloc(trans, size);
+ return __bch2_trans_kmalloc(trans, size, ip);
}
}
+/**
+ * bch2_trans_kmalloc - allocate memory for use by the current transaction
+ *
+ * Must be called after bch2_trans_begin, which on second and further calls
+ * frees all memory allocated in this transaction
+ */
+static __always_inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
+{
+ return bch2_trans_kmalloc_ip(trans, size, _THIS_IP_);
+}
+
+static __always_inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size)
+{
+ return bch2_trans_kmalloc_nomemzero_ip(trans, size, _THIS_IP_);
+}
+
static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans,
struct btree_iter *iter,
unsigned btree_id, struct bpos pos,
@@ -922,16 +963,6 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *,
_p; \
})
-#define bch2_trans_run(_c, _do) \
-({ \
- struct btree_trans *trans = bch2_trans_get(_c); \
- int _ret = (_do); \
- bch2_trans_put(trans); \
- _ret; \
-})
-
-#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do))
-
struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
void bch2_trans_put(struct btree_trans *);
@@ -949,6 +980,27 @@ unsigned bch2_trans_get_fn_idx(const char *);
__bch2_trans_get(_c, trans_fn_idx); \
})
+/*
+ * We don't use DEFINE_CLASS() because using a function for the constructor
+ * breaks bch2_trans_get()'s use of __func__
+ */
+typedef struct btree_trans * class_btree_trans_t;
+static inline void class_btree_trans_destructor(struct btree_trans **p)
+{
+ struct btree_trans *trans = *p;
+ bch2_trans_put(trans);
+}
+
+#define class_btree_trans_constructor(_c) bch2_trans_get(_c)
+
+#define bch2_trans_run(_c, _do) \
+({ \
+ CLASS(btree_trans, trans)(_c); \
+ (_do); \
+})
+
+#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do))
+
void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *);
void bch2_fs_btree_iter_exit(struct bch_fs *);
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index ade3b5addd75..cf7398751644 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -292,7 +292,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
if (!new_keys.data) {
bch_err(c, "%s: error allocating new key array (size %zu)",
__func__, new_keys.size);
- return -BCH_ERR_ENOMEM_journal_key_insert;
+ return bch_err_throw(c, ENOMEM_journal_key_insert);
}
/* Since @keys was full, there was no gap: */
@@ -331,7 +331,7 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
if (!n)
- return -BCH_ERR_ENOMEM_journal_key_insert;
+ return bch_err_throw(c, ENOMEM_journal_key_insert);
bkey_copy(n, k);
ret = bch2_journal_key_insert_take(c, id, level, n);
@@ -457,11 +457,9 @@ static void bch2_journal_iter_advance(struct journal_iter *iter)
static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
- struct bkey_s_c ret = bkey_s_c_null;
-
journal_iter_verify(iter);
- rcu_read_lock();
+ guard(rcu)();
while (iter->idx < iter->keys->size) {
struct journal_key *k = iter->keys->data + iter->idx;
@@ -470,19 +468,16 @@ static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
break;
BUG_ON(cmp);
- if (!k->overwritten) {
- ret = bkey_i_to_s_c(k->k);
- break;
- }
+ if (!k->overwritten)
+ return bkey_i_to_s_c(k->k);
if (k->overwritten_range)
iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end);
else
bch2_journal_iter_advance(iter);
}
- rcu_read_unlock();
- return ret;
+ return bkey_s_c_null;
}
static void bch2_journal_iter_exit(struct journal_iter *iter)
@@ -741,7 +736,7 @@ int bch2_journal_keys_sort(struct bch_fs *c)
if (keys->nr * 8 > keys->size * 7) {
bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
- return -BCH_ERR_ENOMEM_journal_keys_sort;
+ return bch_err_throw(c, ENOMEM_journal_keys_sort);
}
BUG_ON(darray_push(keys, n));
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 669825f89cdd..d96188b92db2 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -101,8 +101,8 @@ static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu
kmem_cache_free(bch2_key_cache, ck);
}
-static void bkey_cached_free(struct btree_key_cache *bc,
- struct bkey_cached *ck)
+static inline void bkey_cached_free_noassert(struct btree_key_cache *bc,
+ struct bkey_cached *ck)
{
kfree(ck->k);
ck->k = NULL;
@@ -116,6 +116,19 @@ static void bkey_cached_free(struct btree_key_cache *bc,
this_cpu_inc(*bc->nr_pending);
}
+static void bkey_cached_free(struct btree_trans *trans,
+ struct btree_key_cache *bc,
+ struct bkey_cached *ck)
+{
+ /*
+ * we'll hit strange issues in the SRCU code if we aren't holding an
+ * SRCU read lock...
+ */
+ EBUG_ON(!trans->srcu_held);
+
+ bkey_cached_free_noassert(bc, ck);
+}
+
static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
{
gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
@@ -174,27 +187,23 @@ lock:
static struct bkey_cached *
bkey_cached_reuse(struct btree_key_cache *c)
{
- struct bucket_table *tbl;
+
+ guard(rcu)();
+ struct bucket_table *tbl = rht_dereference_rcu(c->table.tbl, &c->table);
struct rhash_head *pos;
struct bkey_cached *ck;
- unsigned i;
- rcu_read_lock();
- tbl = rht_dereference_rcu(c->table.tbl, &c->table);
- for (i = 0; i < tbl->size; i++)
+ for (unsigned i = 0; i < tbl->size; i++)
rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
bkey_cached_lock_for_evict(ck)) {
if (bkey_cached_evict(c, ck))
- goto out;
+ return ck;
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
}
}
- ck = NULL;
-out:
- rcu_read_unlock();
- return ck;
+ return NULL;
}
static int btree_key_cache_create(struct btree_trans *trans,
@@ -229,7 +238,7 @@ static int btree_key_cache_create(struct btree_trans *trans,
if (unlikely(!ck)) {
bch_err(c, "error allocating memory for key cache item, btree %s",
bch2_btree_id_str(ck_path->btree_id));
- return -BCH_ERR_ENOMEM_btree_key_cache_create;
+ return bch_err_throw(c, ENOMEM_btree_key_cache_create);
}
}
@@ -247,7 +256,7 @@ static int btree_key_cache_create(struct btree_trans *trans,
if (unlikely(!new_k)) {
bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
bch2_btree_id_str(ck->key.btree_id), key_u64s);
- ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
+ ret = bch_err_throw(c, ENOMEM_btree_key_cache_fill);
} else if (ret) {
kfree(new_k);
goto err;
@@ -281,7 +290,7 @@ static int btree_key_cache_create(struct btree_trans *trans,
ck_path->uptodate = BTREE_ITER_UPTODATE;
return 0;
err:
- bkey_cached_free(bc, ck);
+ bkey_cached_free(trans, bc, ck);
mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED);
return ret;
@@ -511,7 +520,7 @@ evict:
mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
if (bkey_cached_evict(&c->btree_key_cache, ck)) {
- bkey_cached_free(&c->btree_key_cache, ck);
+ bkey_cached_free(trans, &c->btree_key_cache, ck);
} else {
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
@@ -625,7 +634,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
}
bkey_cached_evict(bc, ck);
- bkey_cached_free(bc, ck);
+ bkey_cached_free(trans, bc, ck);
mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
@@ -633,10 +642,17 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
unsigned i;
trans_for_each_path(trans, path2, i)
if (path2->l[0].b == (void *) ck) {
+ /*
+ * It's safe to clear should_be_locked here because
+ * we're evicting from the key cache, and we still have
+ * the underlying btree locked: filling into the key
+ * cache would require taking a write lock on the btree
+ * node
+ */
+ path2->should_be_locked = false;
__bch2_btree_path_unlock(trans, path2);
path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop);
- path2->should_be_locked = false;
- btree_path_set_dirty(path2, BTREE_ITER_NEED_TRAVERSE);
+ btree_path_set_dirty(trans, path2, BTREE_ITER_NEED_TRAVERSE);
}
bch2_trans_verify_locks(trans);
@@ -693,7 +709,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
} else if (!bkey_cached_lock_for_evict(ck)) {
bc->skipped_lock_fail++;
} else if (bkey_cached_evict(bc, ck)) {
- bkey_cached_free(bc, ck);
+ bkey_cached_free_noassert(bc, ck);
bc->freed++;
freed++;
} else {
@@ -806,20 +822,20 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
bc->nr_pending = alloc_percpu(size_t);
if (!bc->nr_pending)
- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+ return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) ||
rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free))
- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+ return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+ return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
bc->table_init_done = true;
shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
if (!shrink)
- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+ return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
bc->shrink = shrink;
shrink->count_objects = bch2_btree_key_cache_count;
shrink->scan_objects = bch2_btree_key_cache_scan;
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index 94eb2b73a843..47035aae232e 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "btree_cache.h"
#include "btree_locking.h"
#include "btree_types.h"
@@ -193,6 +194,30 @@ static int btree_trans_abort_preference(struct btree_trans *trans)
return 3;
}
+static noinline __noreturn void break_cycle_fail(struct lock_graph *g)
+{
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+
+ prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
+
+ for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++) {
+ struct btree_trans *trans = i->trans;
+
+ bch2_btree_trans_to_text(&buf, trans);
+
+ prt_printf(&buf, "backtrace:\n");
+ printbuf_indent_add(&buf, 2);
+ bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
+ printbuf_indent_sub(&buf, 2);
+ prt_newline(&buf);
+ }
+
+ bch2_print_str_nonblocking(g->g->trans->c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ BUG();
+}
+
static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle,
struct trans_waiting_for_lock *from)
{
@@ -218,28 +243,8 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle,
}
}
- if (unlikely(!best)) {
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
-
- prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
-
- for (i = g->g; i < g->g + g->nr; i++) {
- struct btree_trans *trans = i->trans;
-
- bch2_btree_trans_to_text(&buf, trans);
-
- prt_printf(&buf, "backtrace:\n");
- printbuf_indent_add(&buf, 2);
- bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
- printbuf_indent_sub(&buf, 2);
- prt_newline(&buf);
- }
-
- bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- BUG();
- }
+ if (unlikely(!best))
+ break_cycle_fail(g);
ret = abort_lock(g, abort);
out:
@@ -254,15 +259,14 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
struct printbuf *cycle)
{
struct btree_trans *orig_trans = g->g->trans;
- struct trans_waiting_for_lock *i;
- for (i = g->g; i < g->g + g->nr; i++)
+ for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++)
if (i->trans == trans) {
closure_put(&trans->ref);
return break_cycle(g, cycle, i);
}
- if (g->nr == ARRAY_SIZE(g->g)) {
+ if (unlikely(g->nr == ARRAY_SIZE(g->g))) {
closure_put(&trans->ref);
if (orig_trans->lock_may_not_fail)
@@ -307,7 +311,7 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
lock_graph_down(&g, trans);
/* trans->paths is rcu protected vs. freeing */
- rcu_read_lock();
+ guard(rcu)();
if (cycle)
cycle->atomic++;
next:
@@ -405,7 +409,6 @@ up:
out:
if (cycle)
--cycle->atomic;
- rcu_read_unlock();
return ret;
}
@@ -450,13 +453,13 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
/* relock */
-static inline bool btree_path_get_locks(struct btree_trans *trans,
- struct btree_path *path,
- bool upgrade,
- struct get_locks_fail *f)
+static int btree_path_get_locks(struct btree_trans *trans,
+ struct btree_path *path,
+ bool upgrade,
+ struct get_locks_fail *f,
+ int restart_err)
{
unsigned l = path->level;
- int fail_idx = -1;
do {
if (!btree_path_node(path, l))
@@ -464,39 +467,49 @@ static inline bool btree_path_get_locks(struct btree_trans *trans,
if (!(upgrade
? bch2_btree_node_upgrade(trans, path, l)
- : bch2_btree_node_relock(trans, path, l))) {
- fail_idx = l;
-
- if (f) {
- f->l = l;
- f->b = path->l[l].b;
- }
- }
+ : bch2_btree_node_relock(trans, path, l)))
+ goto err;
l++;
} while (l < path->locks_want);
+ if (path->uptodate == BTREE_ITER_NEED_RELOCK)
+ path->uptodate = BTREE_ITER_UPTODATE;
+
+ return path->uptodate < BTREE_ITER_NEED_RELOCK ? 0 : -1;
+err:
+ if (f) {
+ f->l = l;
+ f->b = path->l[l].b;
+ }
+
+ /*
+ * Do transaction restart before unlocking, so we don't pop
+ * should_be_locked asserts
+ */
+ if (restart_err) {
+ btree_trans_restart(trans, restart_err);
+ } else if (path->should_be_locked && !trans->restarted) {
+ if (upgrade)
+ path->locks_want = l;
+ return -1;
+ }
+
+ __bch2_btree_path_unlock(trans, path);
+ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
+
/*
* When we fail to get a lock, we have to ensure that any child nodes
* can't be relocked so bch2_btree_path_traverse has to walk back up to
* the node that we failed to relock:
*/
- if (fail_idx >= 0) {
- __bch2_btree_path_unlock(trans, path);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
-
- do {
- path->l[fail_idx].b = upgrade
- ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
- : ERR_PTR(-BCH_ERR_no_btree_node_relock);
- --fail_idx;
- } while (fail_idx >= 0);
- }
-
- if (path->uptodate == BTREE_ITER_NEED_RELOCK)
- path->uptodate = BTREE_ITER_UPTODATE;
+ do {
+ path->l[l].b = upgrade
+ ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
+ : ERR_PTR(-BCH_ERR_no_btree_node_relock);
+ } while (l--);
- return path->uptodate < BTREE_ITER_NEED_RELOCK;
+ return -restart_err ?: -1;
}
bool __bch2_btree_node_relock(struct btree_trans *trans,
@@ -583,7 +596,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans,
l++) {
if (!bch2_btree_node_relock(trans, path, l)) {
__bch2_btree_path_unlock(trans, path);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent);
}
@@ -595,9 +608,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans,
__flatten
bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path)
{
- struct get_locks_fail f;
-
- bool ret = btree_path_get_locks(trans, path, false, &f);
+ bool ret = !btree_path_get_locks(trans, path, false, NULL, 0);
bch2_trans_verify_locks(trans);
return ret;
}
@@ -613,27 +624,37 @@ int __bch2_btree_path_relock(struct btree_trans *trans,
return 0;
}
-bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want,
- struct get_locks_fail *f)
+bool __bch2_btree_path_upgrade_norestart(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
{
- EBUG_ON(path->locks_want >= new_locks_want);
-
path->locks_want = new_locks_want;
- bool ret = btree_path_get_locks(trans, path, true, f);
- bch2_trans_verify_locks(trans);
+ /*
+ * If we need it locked, we can't touch it. Otherwise, we can return
+ * success - bch2_path_get() will use this path, and it'll just be
+ * retraversed:
+ */
+ bool ret = !btree_path_get_locks(trans, path, true, NULL, 0) ||
+ !path->should_be_locked;
+
+ bch2_btree_path_verify_locks(trans, path);
return ret;
}
-bool __bch2_btree_path_upgrade(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want,
- struct get_locks_fail *f)
+int __bch2_btree_path_upgrade(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
{
- bool ret = bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f);
- if (ret)
+ unsigned old_locks = path->nodes_locked;
+ unsigned old_locks_want = path->locks_want;
+
+ path->locks_want = max_t(unsigned, path->locks_want, new_locks_want);
+
+ struct get_locks_fail f = {};
+ int ret = btree_path_get_locks(trans, path, true, &f,
+ BCH_ERR_transaction_restart_upgrade);
+ if (!ret)
goto out;
/*
@@ -665,9 +686,30 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
linked->btree_id == path->btree_id &&
linked->locks_want < new_locks_want) {
linked->locks_want = new_locks_want;
- btree_path_get_locks(trans, linked, true, NULL);
+ btree_path_get_locks(trans, linked, true, NULL, 0);
}
}
+
+ count_event(trans->c, trans_restart_upgrade);
+ if (trace_trans_restart_upgrade_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "%s %pS\n", trans->fn, (void *) _RET_IP_);
+ prt_printf(&buf, "btree %s pos\n", bch2_btree_id_str(path->btree_id));
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_printf(&buf, "locks want %u -> %u level %u\n",
+ old_locks_want, new_locks_want, f.l);
+ prt_printf(&buf, "nodes_locked %x -> %x\n",
+ old_locks, path->nodes_locked);
+ prt_printf(&buf, "node %s ", IS_ERR(f.b) ? bch2_err_str(PTR_ERR(f.b)) :
+ !f.b ? "(null)" : "(node)");
+ prt_printf(&buf, "path seq %u node seq %u\n",
+ IS_ERR_OR_NULL(f.b) ? 0 : f.b->c.lock.seq,
+ path->l[f.l].lock_seq);
+
+ trace_trans_restart_upgrade(trans->c, buf.buf);
+ printbuf_exit(&buf);
+ }
out:
bch2_trans_verify_locks(trans);
return ret;
@@ -699,7 +741,7 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
}
}
- bch2_btree_path_verify_locks(path);
+ bch2_btree_path_verify_locks(trans, path);
trace_path_downgrade(trans, _RET_IP_, path, old_locks_want);
}
@@ -728,7 +770,7 @@ static inline void __bch2_trans_unlock(struct btree_trans *trans)
__bch2_btree_path_unlock(trans, path);
}
-static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path,
+static noinline __cold void bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path,
struct get_locks_fail *f, bool trace)
{
if (!trace)
@@ -738,7 +780,9 @@ static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, str
struct printbuf buf = PRINTBUF;
bch2_bpos_to_text(&buf, path->pos);
- prt_printf(&buf, " l=%u seq=%u node seq=", f->l, path->l[f->l].lock_seq);
+ prt_printf(&buf, " %s l=%u seq=%u node seq=",
+ bch2_btree_id_str(path->btree_id),
+ f->l, path->l[f->l].lock_seq);
if (IS_ERR_OR_NULL(f->b)) {
prt_str(&buf, bch2_err_str(PTR_ERR(f->b)));
} else {
@@ -760,7 +804,6 @@ static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, str
out:
__bch2_trans_unlock(trans);
bch2_trans_verify_locks(trans);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
}
static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace)
@@ -777,10 +820,14 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace)
trans_for_each_path(trans, path, i) {
struct get_locks_fail f;
+ int ret;
if (path->should_be_locked &&
- !btree_path_get_locks(trans, path, false, &f))
- return bch2_trans_relock_fail(trans, path, &f, trace);
+ (ret = btree_path_get_locks(trans, path, false, &f,
+ BCH_ERR_transaction_restart_relock))) {
+ bch2_trans_relock_fail(trans, path, &f, trace);
+ return ret;
+ }
}
trans_set_locked(trans, true);
@@ -799,18 +846,11 @@ int bch2_trans_relock_notrace(struct btree_trans *trans)
return __bch2_trans_relock(trans, false);
}
-void bch2_trans_unlock_noassert(struct btree_trans *trans)
+void bch2_trans_unlock(struct btree_trans *trans)
{
- __bch2_trans_unlock(trans);
-
trans_set_unlocked(trans);
-}
-void bch2_trans_unlock(struct btree_trans *trans)
-{
__bch2_trans_unlock(trans);
-
- trans_set_unlocked(trans);
}
void bch2_trans_unlock_long(struct btree_trans *trans)
@@ -842,32 +882,28 @@ int __bch2_trans_mutex_lock(struct btree_trans *trans,
/* Debug */
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-void bch2_btree_path_verify_locks(struct btree_path *path)
+void __bch2_btree_path_verify_locks(struct btree_trans *trans, struct btree_path *path)
{
- /*
- * A path may be uptodate and yet have nothing locked if and only if
- * there is no node at path->level, which generally means we were
- * iterating over all nodes and got to the end of the btree
- */
- BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
- btree_path_node(path, path->level) &&
- !path->nodes_locked);
+ if (!path->nodes_locked && btree_path_node(path, path->level)) {
+ /*
+ * A path may be uptodate and yet have nothing locked if and only if
+ * there is no node at path->level, which generally means we were
+ * iterating over all nodes and got to the end of the btree
+ */
+ BUG_ON(path->uptodate == BTREE_ITER_UPTODATE);
+ BUG_ON(path->should_be_locked && trans->locked && !trans->restarted);
+ }
if (!path->nodes_locked)
return;
for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
int want = btree_lock_want(path, l);
- int have = btree_node_locked_type(path, l);
+ int have = btree_node_locked_type_nowrite(path, l);
BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED);
- BUG_ON(is_btree_node(path, l) &&
- (want == BTREE_NODE_UNLOCKED ||
- have != BTREE_NODE_WRITE_LOCKED) &&
- want != have);
+ BUG_ON(is_btree_node(path, l) && want != have);
BUG_ON(btree_node_locked(path, l) &&
path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock));
@@ -885,7 +921,7 @@ static bool bch2_trans_locked(struct btree_trans *trans)
return false;
}
-void bch2_trans_verify_locks(struct btree_trans *trans)
+void __bch2_trans_verify_locks(struct btree_trans *trans)
{
if (!trans->locked) {
BUG_ON(bch2_trans_locked(trans));
@@ -896,7 +932,5 @@ void bch2_trans_verify_locks(struct btree_trans *trans)
unsigned i;
trans_for_each_path(trans, path, i)
- bch2_btree_path_verify_locks(path);
+ __bch2_btree_path_verify_locks(trans, path);
}
-
-#endif
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index b33ab7af8440..9adca77e2580 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -15,7 +15,6 @@
void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp);
-void bch2_trans_unlock_noassert(struct btree_trans *);
void bch2_trans_unlock_write(struct btree_trans *);
static inline bool is_btree_node(struct btree_path *path, unsigned l)
@@ -44,6 +43,15 @@ static inline int btree_node_locked_type(struct btree_path *path,
return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3);
}
+static inline int btree_node_locked_type_nowrite(struct btree_path *path,
+ unsigned level)
+{
+ int have = btree_node_locked_type(path, level);
+ return have == BTREE_NODE_WRITE_LOCKED
+ ? BTREE_NODE_INTENT_LOCKED
+ : have;
+}
+
static inline bool btree_node_write_locked(struct btree_path *path, unsigned l)
{
return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED;
@@ -152,7 +160,7 @@ static inline int btree_path_highest_level_locked(struct btree_path *path)
static inline void __bch2_btree_path_unlock(struct btree_trans *trans,
struct btree_path *path)
{
- btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK);
+ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_RELOCK);
while (path->nodes_locked)
btree_node_unlock(trans, path, btree_path_lowest_level_locked(path));
@@ -367,8 +375,8 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
EBUG_ON(btree_node_locked(path, level) &&
- !btree_node_write_locked(path, level) &&
- btree_node_locked_type(path, level) != __btree_lock_want(path, level));
+ btree_node_locked_type_nowrite(path, level) !=
+ __btree_lock_want(path, level));
return likely(btree_node_locked(path, level)) ||
(!IS_ERR_OR_NULL(path->l[level].b) &&
@@ -377,31 +385,29 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
/* upgrade */
-bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
- struct btree_path *, unsigned,
- struct get_locks_fail *);
+bool __bch2_btree_path_upgrade_norestart(struct btree_trans *, struct btree_path *, unsigned);
-bool __bch2_btree_path_upgrade(struct btree_trans *,
- struct btree_path *, unsigned,
- struct get_locks_fail *);
+static inline bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
+{
+ return new_locks_want > path->locks_want
+ ? __bch2_btree_path_upgrade_norestart(trans, path, new_locks_want)
+ : true;
+}
+
+int __bch2_btree_path_upgrade(struct btree_trans *,
+ struct btree_path *, unsigned);
static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
{
- struct get_locks_fail f = {};
- unsigned old_locks_want = path->locks_want;
-
new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
- if (path->locks_want < new_locks_want
- ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f)
- : path->nodes_locked)
- return 0;
-
- trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path,
- old_locks_want, new_locks_want, &f);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
+ return likely(path->locks_want >= new_locks_want && path->nodes_locked)
+ ? 0
+ : __bch2_btree_path_upgrade(trans, path, new_locks_want);
}
/* misc: */
@@ -427,7 +433,7 @@ static inline void btree_path_set_level_up(struct btree_trans *trans,
struct btree_path *path)
{
__btree_path_set_level_up(trans, path, path->level++);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
}
/* debug */
@@ -439,12 +445,20 @@ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *,
int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *);
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_path_verify_locks(struct btree_path *);
-void bch2_trans_verify_locks(struct btree_trans *);
-#else
-static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
-static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
-#endif
+void __bch2_btree_path_verify_locks(struct btree_trans *, struct btree_path *);
+void __bch2_trans_verify_locks(struct btree_trans *);
+
+static inline void bch2_btree_path_verify_locks(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ if (static_branch_unlikely(&bch2_debug_check_btree_locking))
+ __bch2_btree_path_verify_locks(trans, path);
+}
+
+static inline void bch2_trans_verify_locks(struct btree_trans *trans)
+{
+ if (static_branch_unlikely(&bch2_debug_check_btree_locking))
+ __bch2_trans_verify_locks(trans);
+}
#endif /* _BCACHEFS_BTREE_LOCKING_H */
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 86acf037590c..a35847734a60 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -271,7 +271,7 @@ static int read_btree_nodes_worker(void *p)
err:
bio_put(bio);
free_page((unsigned long) buf);
- percpu_ref_put(&ca->io_ref[READ]);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
closure_put(w->cl);
kfree(w);
return 0;
@@ -285,13 +285,13 @@ static int read_btree_nodes(struct find_btree_nodes *f)
closure_init_stack(&cl);
- for_each_online_member(c, ca) {
+ for_each_online_member(c, ca, BCH_DEV_READ_REF_btree_node_scan) {
if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
continue;
struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
if (!w) {
- percpu_ref_put(&ca->io_ref[READ]);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
ret = -ENOMEM;
goto err;
}
@@ -303,14 +303,14 @@ static int read_btree_nodes(struct find_btree_nodes *f)
struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
ret = PTR_ERR_OR_ZERO(t);
if (ret) {
- percpu_ref_put(&ca->io_ref[READ]);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
kfree(w);
bch_err_msg(c, ret, "starting kthread");
break;
}
closure_get(&cl);
- percpu_ref_get(&ca->io_ref[READ]);
+ enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
wake_up_process(t);
}
err:
@@ -363,6 +363,8 @@ static int handle_overwrites(struct bch_fs *c,
min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
}
}
+
+ cond_resched();
}
return 0;
@@ -395,7 +397,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
printbuf_reset(&buf);
prt_printf(&buf, "%s: nodes found:\n", __func__);
found_btree_nodes_to_text(&buf, c, f->nodes);
- bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ bch2_print_str(c, KERN_INFO, buf.buf);
}
sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
@@ -424,7 +426,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
printbuf_reset(&buf);
prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
found_btree_nodes_to_text(&buf, c, f->nodes);
- bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ bch2_print_str(c, KERN_INFO, buf.buf);
}
swap(nodes_heap, f->nodes);
@@ -470,7 +472,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
printbuf_reset(&buf);
prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
found_btree_nodes_to_text(&buf, c, f->nodes);
- bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ bch2_print_str(c, KERN_INFO, buf.buf);
} else {
bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr);
}
@@ -541,7 +543,7 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
struct find_btree_nodes *f = &c->found_btree_nodes;
- int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
if (ret)
return ret;
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 7d7e52ddde02..d9710801e3ee 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -11,6 +11,7 @@
#include "btree_write_buffer.h"
#include "buckets.h"
#include "disk_accounting.h"
+#include "enumerated_ref.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
@@ -20,6 +21,7 @@
#include "snapshot.h"
#include <linux/prefetch.h>
+#include <linux/string_helpers.h>
static const char * const trans_commit_flags_strs[] = {
#define x(n, ...) #n,
@@ -366,14 +368,15 @@ static noinline void journal_transaction_name(struct btree_trans *trans)
struct jset_entry_log *l =
container_of(entry, struct jset_entry_log, entry);
- strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64));
+ memcpy_and_pad(l->d, JSET_ENTRY_LOG_U64s * sizeof(u64),
+ trans->fn, strlen(trans->fn), 0);
}
static inline int btree_key_can_insert(struct btree_trans *trans,
struct btree *b, unsigned u64s)
{
if (!bch2_btree_node_insert_fits(b, u64s))
- return -BCH_ERR_btree_insert_btree_node_full;
+ return bch_err_throw(trans->c, btree_insert_btree_node_full);
return 0;
}
@@ -391,9 +394,10 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
if (!new_k) {
- bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
+ struct bch_fs *c = trans->c;
+ bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
bch2_btree_id_str(path->btree_id), new_u64s);
- return -BCH_ERR_ENOMEM_btree_key_cache_insert;
+ return bch_err_throw(c, ENOMEM_btree_key_cache_insert);
}
ret = bch2_trans_relock(trans) ?:
@@ -429,7 +433,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
if (watermark < BCH_WATERMARK_reclaim &&
!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
bch2_btree_key_cache_must_wait(c))
- return -BCH_ERR_btree_insert_need_journal_reclaim;
+ return bch_err_throw(c, btree_insert_need_journal_reclaim);
/*
* bch2_varint_decode can read past the end of the buffer by at most 7
@@ -644,10 +648,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
!(flags & BCH_TRANS_COMMIT_no_journal_res)) {
- if (bch2_journal_seq_verify)
+ if (static_branch_unlikely(&bch2_journal_seq_verify))
trans_for_each_update(trans, i)
i->k->k.bversion.lo = trans->journal_res.seq;
- else if (bch2_inject_invalid_keys)
+ else if (static_branch_unlikely(&bch2_inject_invalid_keys))
trans_for_each_update(trans, i)
i->k->k.bversion = MAX_VERSION;
}
@@ -660,18 +664,17 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
h = h->next;
}
- struct jset_entry *entry = trans->journal_entries;
+ struct bkey_i *accounting;
percpu_down_read(&c->mark_lock);
- for (entry = trans->journal_entries;
- entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
- entry = vstruct_next(entry))
- if (entry->type == BCH_JSET_ENTRY_write_buffer_keys &&
- entry->start->k.type == KEY_TYPE_accounting) {
- ret = bch2_accounting_trans_commit_hook(trans, bkey_i_to_accounting(entry->start), flags);
- if (ret)
- goto revert_fs_usage;
- }
+ for (accounting = btree_trans_subbuf_base(trans, &trans->accounting);
+ accounting != btree_trans_subbuf_top(trans, &trans->accounting);
+ accounting = bkey_next(accounting)) {
+ ret = bch2_accounting_trans_commit_hook(trans,
+ bkey_i_to_accounting(accounting), flags);
+ if (ret)
+ goto revert_fs_usage;
+ }
percpu_up_read(&c->mark_lock);
/* XXX: we only want to run this if deltas are nonzero */
@@ -695,8 +698,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit;
- for (struct jset_entry *i = trans->journal_entries;
- i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ for (struct jset_entry *i = btree_trans_journal_entries_start(trans);
+ i != btree_trans_journal_entries_top(trans);
i = vstruct_next(i)) {
ret = bch2_journal_entry_validate(c, NULL, i,
bcachefs_metadata_version_current,
@@ -751,11 +754,18 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
}
memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
- trans->journal_entries,
- trans->journal_entries_u64s);
+ btree_trans_journal_entries_start(trans),
+ trans->journal_entries.u64s);
+
+ trans->journal_res.offset += trans->journal_entries.u64s;
+ trans->journal_res.u64s -= trans->journal_entries.u64s;
- trans->journal_res.offset += trans->journal_entries_u64s;
- trans->journal_res.u64s -= trans->journal_entries_u64s;
+ memcpy_u64s_small(bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_write_buffer_keys,
+ BTREE_ID_accounting, 0,
+ trans->accounting.u64s)->_data,
+ btree_trans_subbuf_base(trans, &trans->accounting),
+ trans->accounting.u64s);
if (trans->journal_seq)
*trans->journal_seq = trans->journal_res.seq;
@@ -777,13 +787,10 @@ fatal_err:
bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret));
percpu_down_read(&c->mark_lock);
revert_fs_usage:
- for (struct jset_entry *entry2 = trans->journal_entries;
- entry2 != entry;
- entry2 = vstruct_next(entry2))
- if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys &&
- entry2->start->k.type == KEY_TYPE_accounting)
- bch2_accounting_trans_commit_revert(trans,
- bkey_i_to_accounting(entry2->start), flags);
+ for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
+ i != accounting;
+ i = bkey_next(i))
+ bch2_accounting_trans_commit_revert(trans, bkey_i_to_accounting(i), flags);
percpu_up_read(&c->mark_lock);
return ret;
}
@@ -888,7 +895,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
*/
if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
watermark < BCH_WATERMARK_reclaim) {
- ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ ret = bch_err_throw(c, journal_reclaim_would_deadlock);
goto out;
}
@@ -958,15 +965,36 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
return ret;
}
- for (struct jset_entry *i = trans->journal_entries;
- i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
- i = vstruct_next(i))
+ for (struct jset_entry *i = btree_trans_journal_entries_start(trans);
+ i != btree_trans_journal_entries_top(trans);
+ i = vstruct_next(i)) {
if (i->type == BCH_JSET_ENTRY_btree_keys ||
i->type == BCH_JSET_ENTRY_write_buffer_keys) {
- int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->start);
- if (ret)
- return ret;
+ jset_entry_for_each_key(i, k) {
+ int ret = bch2_journal_key_insert(c, i->btree_id, i->level, k);
+ if (ret)
+ return ret;
+ }
+ }
+
+ if (i->type == BCH_JSET_ENTRY_btree_root) {
+ guard(mutex)(&c->btree_root_lock);
+
+ struct btree_root *r = bch2_btree_id_root(c, i->btree_id);
+
+ bkey_copy(&r->key, i->start);
+ r->level = i->level;
+ r->alive = true;
}
+ }
+
+ for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
+ i != btree_trans_subbuf_top(trans, &trans->accounting);
+ i = bkey_next(i)) {
+ int ret = bch2_journal_key_insert(c, BTREE_ID_accounting, 0, i);
+ if (ret)
+ return ret;
+ }
return 0;
}
@@ -984,7 +1012,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
goto out_reset;
if (!trans->nr_updates &&
- !trans->journal_entries_u64s)
+ !trans->journal_entries.u64s &&
+ !trans->accounting.u64s)
goto out_reset;
ret = bch2_trans_commit_run_triggers(trans);
@@ -992,17 +1021,17 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
goto out_reset;
if (!(flags & BCH_TRANS_COMMIT_no_check_rw) &&
- unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
+ unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_trans))) {
if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags)))
ret = do_bch2_trans_commit_to_journal_replay(trans);
else
- ret = -BCH_ERR_erofs_trans_commit;
+ ret = bch_err_throw(c, erofs_trans_commit);
goto out_reset;
}
EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
- trans->journal_u64s = trans->journal_entries_u64s;
+ trans->journal_u64s = trans->journal_entries.u64s + jset_u64s(trans->accounting.u64s);
trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
if (trans->journal_transaction_names)
trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
@@ -1058,7 +1087,7 @@ retry:
trace_and_count(c, transaction_commit, trans, _RET_IP_);
out:
if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw)))
- bch2_write_ref_put(c, BCH_WRITE_REF_trans);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_trans);
out_reset:
if (!ret)
bch2_trans_downgrade(trans);
@@ -1078,7 +1107,7 @@ err:
* restart:
*/
if (flags & BCH_TRANS_COMMIT_no_journal_res) {
- ret = -BCH_ERR_transaction_restart_nested;
+ ret = bch_err_throw(c, transaction_restart_nested);
goto out;
}
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 023c472dc9ee..c61c4171ae50 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -139,6 +139,7 @@ struct btree {
};
#define BCH_BTREE_CACHE_NOT_FREED_REASONS() \
+ x(cache_reserve) \
x(lock_intent) \
x(lock_write) \
x(dirty) \
@@ -257,9 +258,6 @@ struct btree_node_iter {
*
* BTREE_TRIGGER_insert - @new is entering the btree
* BTREE_TRIGGER_overwrite - @old is leaving the btree
- *
- * BTREE_TRIGGER_bucket_invalidate - signal from bucket invalidate path to alloc
- * trigger
*/
#define BTREE_TRIGGER_FLAGS() \
x(norun) \
@@ -269,8 +267,7 @@ struct btree_node_iter {
x(gc) \
x(insert) \
x(overwrite) \
- x(is_root) \
- x(bucket_invalidate)
+ x(is_root)
enum {
#define x(n) BTREE_ITER_FLAG_BIT_##n,
@@ -477,6 +474,18 @@ struct btree_trans_paths {
struct btree_path paths[];
};
+struct trans_kmalloc_trace {
+ unsigned long ip;
+ size_t bytes;
+};
+typedef DARRAY(struct trans_kmalloc_trace) darray_trans_kmalloc_trace;
+
+struct btree_trans_subbuf {
+ u16 base;
+ u16 u64s;
+ u16 size;;
+};
+
struct btree_trans {
struct bch_fs *c;
@@ -488,6 +497,9 @@ struct btree_trans {
void *mem;
unsigned mem_top;
unsigned mem_bytes;
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+ darray_trans_kmalloc_trace trans_kmalloc_trace;
+#endif
btree_path_idx_t nr_sorted;
btree_path_idx_t nr_paths;
@@ -528,9 +540,8 @@ struct btree_trans {
int srcu_idx;
/* update path: */
- u16 journal_entries_u64s;
- u16 journal_entries_size;
- struct jset_entry *journal_entries;
+ struct btree_trans_subbuf journal_entries;
+ struct btree_trans_subbuf accounting;
struct btree_trans_commit_hook *hooks;
struct journal_entry_pin *journal_pin;
@@ -544,6 +555,8 @@ struct btree_trans {
unsigned journal_u64s;
unsigned extra_disk_res; /* XXX kill */
+ __BKEY_PADDED(btree_path_down, BKEY_BTREE_PTR_VAL_U64s_MAX);
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
@@ -647,13 +660,13 @@ static inline struct bset_tree *bset_tree_last(struct btree *b)
static inline void *
__btree_node_offset_to_ptr(const struct btree *b, u16 offset)
{
- return (void *) ((u64 *) b->data + 1 + offset);
+ return (void *) ((u64 *) b->data + offset);
}
static inline u16
__btree_node_ptr_to_offset(const struct btree *b, const void *p)
{
- u16 ret = (u64 *) p - 1 - (u64 *) b->data;
+ u16 ret = (u64 *) p - (u64 *) b->data;
EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p);
return ret;
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index 1e6b7836cc01..e97e78c10f49 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -14,6 +14,8 @@
#include "snapshot.h"
#include "trace.h"
+#include <linux/string_helpers.h>
+
static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
const struct btree_insert_entry *r)
{
@@ -121,65 +123,44 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans,
}
int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
- enum btree_id id,
- struct bpos old_pos,
- struct bpos new_pos)
+ enum btree_id btree, struct bpos pos,
+ snapshot_id_list *s)
{
- struct bch_fs *c = trans->c;
- struct btree_iter old_iter, new_iter = {};
- struct bkey_s_c old_k, new_k;
- snapshot_id_list s;
- struct bkey_i *update;
int ret = 0;
- if (!bch2_snapshot_has_children(c, old_pos.snapshot))
- return 0;
+ darray_for_each(*s, id) {
+ pos.snapshot = *id;
- darray_init(&s);
-
- bch2_trans_iter_init(trans, &old_iter, id, old_pos,
- BTREE_ITER_not_extents|
- BTREE_ITER_all_snapshots);
- while ((old_k = bch2_btree_iter_prev(trans, &old_iter)).k &&
- !(ret = bkey_err(old_k)) &&
- bkey_eq(old_pos, old_k.k->p)) {
- struct bpos whiteout_pos =
- SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
-
- if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
- snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
- continue;
-
- new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos,
- BTREE_ITER_not_extents|
- BTREE_ITER_intent);
- ret = bkey_err(new_k);
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, pos,
+ BTREE_ITER_not_extents|
+ BTREE_ITER_intent);
+ ret = bkey_err(k);
if (ret)
break;
- if (new_k.k->type == KEY_TYPE_deleted) {
- update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+ if (k.k->type == KEY_TYPE_deleted) {
+ struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
ret = PTR_ERR_OR_ZERO(update);
- if (ret)
+ if (ret) {
+ bch2_trans_iter_exit(trans, &iter);
break;
+ }
bkey_init(&update->k);
- update->k.p = whiteout_pos;
+ update->k.p = pos;
update->k.type = KEY_TYPE_whiteout;
- ret = bch2_trans_update(trans, &new_iter, update,
+ ret = bch2_trans_update(trans, &iter, update,
BTREE_UPDATE_internal_snapshot_node);
}
- bch2_trans_iter_exit(trans, &new_iter);
+ bch2_trans_iter_exit(trans, &iter);
- ret = snapshot_list_add(c, &s, old_k.k->p.snapshot);
if (ret)
break;
}
- bch2_trans_iter_exit(trans, &new_iter);
- bch2_trans_iter_exit(trans, &old_iter);
- darray_exit(&s);
+ darray_exit(s);
return ret;
}
@@ -509,8 +490,9 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
return 0;
}
-int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
+int __must_check bch2_trans_update_ip(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_i *k, enum btree_iter_update_trigger_flags flags,
+ unsigned long ip)
{
kmsan_check_memory(k, bkey_bytes(&k->k));
@@ -546,7 +528,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
path_idx = iter->key_cache_path;
}
- return bch2_trans_update_by_path(trans, path_idx, k, flags, _RET_IP_);
+ return bch2_trans_update_by_path(trans, path_idx, k, flags, ip);
}
int bch2_btree_insert_clone_trans(struct btree_trans *trans,
@@ -562,30 +544,29 @@ int bch2_btree_insert_clone_trans(struct btree_trans *trans,
return bch2_btree_insert_trans(trans, btree, n, 0);
}
-struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
+void *__bch2_trans_subbuf_alloc(struct btree_trans *trans,
+ struct btree_trans_subbuf *buf,
+ unsigned u64s)
{
- unsigned new_top = trans->journal_entries_u64s + u64s;
- unsigned old_size = trans->journal_entries_size;
+ unsigned new_top = buf->u64s + u64s;
+ unsigned old_size = buf->size;
- if (new_top > trans->journal_entries_size) {
- trans->journal_entries_size = roundup_pow_of_two(new_top);
-
- btree_trans_stats(trans)->journal_entries_size = trans->journal_entries_size;
- }
+ if (new_top > buf->size)
+ buf->size = roundup_pow_of_two(new_top);
- struct jset_entry *n =
- bch2_trans_kmalloc_nomemzero(trans,
- trans->journal_entries_size * sizeof(u64));
+ void *n = bch2_trans_kmalloc_nomemzero(trans, buf->size * sizeof(u64));
if (IS_ERR(n))
- return ERR_CAST(n);
+ return n;
- if (trans->journal_entries)
- memcpy(n, trans->journal_entries, old_size * sizeof(u64));
- trans->journal_entries = n;
+ if (buf->u64s)
+ memcpy(n,
+ btree_trans_subbuf_base(trans, buf),
+ old_size * sizeof(u64));
+ buf->base = (u64 *) n - (u64 *) trans->mem;
- struct jset_entry *e = btree_trans_journal_entries_top(trans);
- trans->journal_entries_u64s = new_top;
- return e;
+ void *p = btree_trans_subbuf_top(trans, buf);
+ buf->u64s = new_top;
+ return p;
}
int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
@@ -606,7 +587,7 @@ int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
BUG_ON(k.k->type != KEY_TYPE_deleted);
if (bkey_gt(k.k->p, end)) {
- ret = -BCH_ERR_ENOSPC_btree_slot;
+ ret = bch_err_throw(trans->c, ENOSPC_btree_slot);
goto err;
}
@@ -826,26 +807,35 @@ int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree,
return bch2_trans_update_buffered(trans, btree, &k);
}
-int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf)
+static int __bch2_trans_log_str(struct btree_trans *trans, const char *str, unsigned len)
{
- unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64));
- prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos);
-
- int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
- if (ret)
- return ret;
+ unsigned u64s = DIV_ROUND_UP(len, sizeof(u64));
struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s));
- ret = PTR_ERR_OR_ZERO(e);
+ int ret = PTR_ERR_OR_ZERO(e);
if (ret)
return ret;
struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry);
journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s);
- memcpy(l->d, buf->buf, buf->pos);
+ memcpy_and_pad(l->d, u64s * sizeof(u64), str, len, 0);
return 0;
}
+int bch2_trans_log_str(struct btree_trans *trans, const char *str)
+{
+ return __bch2_trans_log_str(trans, str, strlen(str));
+}
+
+int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf)
+{
+ int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
+ if (ret)
+ return ret;
+
+ return __bch2_trans_log_str(trans, buf->buf, buf->pos);
+}
+
int bch2_trans_log_bkey(struct btree_trans *trans, enum btree_id btree,
unsigned level, struct bkey_i *k)
{
@@ -868,7 +858,6 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
prt_vprintf(&buf, fmt, args);
unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
- prt_chars(&buf, '\0', u64s * sizeof(u64) - buf.pos);
int ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
if (ret)
@@ -881,7 +870,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries);
journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s);
- memcpy(l->d, buf.buf, buf.pos);
+ memcpy_and_pad(l->d, u64s * sizeof(u64), buf.buf, buf.pos, 0);
c->journal.early_journal_entries.nr += jset_u64s(u64s);
} else {
ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags,
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 568e56c91190..9feef1dc4de5 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -4,6 +4,7 @@
#include "btree_iter.h"
#include "journal.h"
+#include "snapshot.h"
struct bch_fs;
struct btree;
@@ -74,7 +75,7 @@ static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans,
}
int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id,
- struct bpos, struct bpos);
+ struct bpos, snapshot_id_list *);
/*
* For use when splitting extents in existing snapshots:
@@ -88,11 +89,20 @@ static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
struct bpos old_pos,
struct bpos new_pos)
{
+ BUG_ON(old_pos.snapshot != new_pos.snapshot);
+
if (!btree_type_has_snapshots(btree) ||
bkey_eq(old_pos, new_pos))
return 0;
- return __bch2_insert_snapshot_whiteouts(trans, btree, old_pos, new_pos);
+ snapshot_id_list s;
+ int ret = bch2_get_snapshot_overwrites(trans, btree, old_pos, &s);
+ if (ret)
+ return ret;
+
+ return s.nr
+ ? __bch2_insert_snapshot_whiteouts(trans, btree, new_pos, &s)
+ : 0;
}
int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *,
@@ -102,26 +112,60 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *
int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *,
enum btree_id, struct bpos);
-int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, enum btree_iter_update_trigger_flags);
+int __must_check bch2_trans_update_ip(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, enum btree_iter_update_trigger_flags,
+ unsigned long);
+
+static inline int __must_check
+bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
+{
+ return bch2_trans_update_ip(trans, iter, k, flags, _THIS_IP_);
+}
+
+static inline void *btree_trans_subbuf_base(struct btree_trans *trans,
+ struct btree_trans_subbuf *buf)
+{
+ return (u64 *) trans->mem + buf->base;
+}
+
+static inline void *btree_trans_subbuf_top(struct btree_trans *trans,
+ struct btree_trans_subbuf *buf)
+{
+ return (u64 *) trans->mem + buf->base + buf->u64s;
+}
+
+void *__bch2_trans_subbuf_alloc(struct btree_trans *,
+ struct btree_trans_subbuf *,
+ unsigned);
-struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned);
+static inline void *
+bch2_trans_subbuf_alloc(struct btree_trans *trans,
+ struct btree_trans_subbuf *buf,
+ unsigned u64s)
+{
+ if (buf->u64s + u64s > buf->size)
+ return __bch2_trans_subbuf_alloc(trans, buf, u64s);
+
+ void *p = btree_trans_subbuf_top(trans, buf);
+ buf->u64s += u64s;
+ return p;
+}
+
+static inline struct jset_entry *btree_trans_journal_entries_start(struct btree_trans *trans)
+{
+ return btree_trans_subbuf_base(trans, &trans->journal_entries);
+}
static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans)
{
- return (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ return btree_trans_subbuf_top(trans, &trans->journal_entries);
}
static inline struct jset_entry *
bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
{
- if (!trans->journal_entries ||
- trans->journal_entries_u64s + u64s > trans->journal_entries_size)
- return __bch2_trans_jset_entry_alloc(trans, u64s);
-
- struct jset_entry *e = btree_trans_journal_entries_top(trans);
- trans->journal_entries_u64s += u64s;
- return e;
+ return bch2_trans_subbuf_alloc(trans, &trans->journal_entries, u64s);
}
int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *);
@@ -135,6 +179,8 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr
{
kmsan_check_memory(k, bkey_bytes(&k->k));
+ EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
+
if (unlikely(!btree_type_uses_write_buffer(btree))) {
int ret = bch2_btree_write_buffer_insert_err(trans, btree, k);
dump_stack();
@@ -169,6 +215,7 @@ void bch2_trans_commit_hook(struct btree_trans *,
struct btree_trans_commit_hook *);
int __bch2_trans_commit(struct btree_trans *, unsigned);
+int bch2_trans_log_str(struct btree_trans *, const char *);
int bch2_trans_log_msg(struct btree_trans *, struct printbuf *);
int bch2_trans_log_bkey(struct btree_trans *, enum btree_id, unsigned, struct bkey_i *);
@@ -217,12 +264,15 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans)
bch2_path_put(trans, i->path, true);
trans->nr_updates = 0;
- trans->journal_entries_u64s = 0;
+ trans->journal_entries.u64s = 0;
+ trans->journal_entries.size = 0;
+ trans->accounting.u64s = 0;
+ trans->accounting.size = 0;
trans->hooks = NULL;
trans->extra_disk_res = 0;
}
-static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k,
+static __always_inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k,
unsigned type, unsigned min_bytes)
{
unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k));
@@ -245,7 +295,7 @@ static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *t
return mut;
}
-static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k)
+static __always_inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k)
{
return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0);
}
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 00307356d7c8..d2ecb782919b 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -14,6 +14,7 @@
#include "btree_locking.h"
#include "buckets.h"
#include "clock.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "extents.h"
#include "io_write.h"
@@ -56,8 +57,6 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
struct bkey_buf prev;
int ret = 0;
- printbuf_indent_add_nextline(&buf, 2);
-
BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
!bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
b->data->min_key));
@@ -68,20 +67,23 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
if (b == btree_node_root(c, b)) {
if (!bpos_eq(b->data->min_key, POS_MIN)) {
- ret = __bch2_topology_error(c, &buf);
-
+ bch2_log_msg_start(c, &buf);
+ prt_printf(&buf, "btree root with incorrect min_key: ");
bch2_bpos_to_text(&buf, b->data->min_key);
- log_fsck_err(trans, btree_root_bad_min_key,
- "btree root with incorrect min_key: %s", buf.buf);
- goto out;
+ prt_newline(&buf);
+
+ bch2_count_fsck_err(c, btree_root_bad_min_key, &buf);
+ goto err;
}
if (!bpos_eq(b->data->max_key, SPOS_MAX)) {
- ret = __bch2_topology_error(c, &buf);
+ bch2_log_msg_start(c, &buf);
+ prt_printf(&buf, "btree root with incorrect max_key: ");
bch2_bpos_to_text(&buf, b->data->max_key);
- log_fsck_err(trans, btree_root_bad_max_key,
- "btree root with incorrect max_key: %s", buf.buf);
- goto out;
+ prt_newline(&buf);
+
+ bch2_count_fsck_err(c, btree_root_bad_max_key, &buf);
+ goto err;
}
}
@@ -99,19 +101,15 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
: bpos_successor(prev.k->k.p);
if (!bpos_eq(expected_min, bp.v->min_key)) {
- ret = __bch2_topology_error(c, &buf);
-
- prt_str(&buf, "end of prev node doesn't match start of next node\nin ");
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_str(&buf, " node ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_str(&buf, "end of prev node doesn't match start of next node");
prt_str(&buf, "\nprev ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
prt_str(&buf, "\nnext ");
bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
- log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf);
- goto out;
+ bch2_count_fsck_err(c, btree_node_topology_bad_min_key, &buf);
+ goto err;
}
bch2_bkey_buf_reassemble(&prev, c, k);
@@ -119,32 +117,34 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
}
if (bkey_deleted(&prev.k->k)) {
- ret = __bch2_topology_error(c, &buf);
-
- prt_str(&buf, "empty interior node\nin ");
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_str(&buf, " node ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
- log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf);
- } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
- ret = __bch2_topology_error(c, &buf);
+ prt_printf(&buf, "empty interior node\n");
+ bch2_count_fsck_err(c, btree_node_topology_empty_interior_node, &buf);
+ goto err;
+ }
- prt_str(&buf, "last child node doesn't end at end of parent node\nin ");
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_str(&buf, " node ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- prt_str(&buf, "\nlast key ");
+ if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
+ prt_str(&buf, "last child node doesn't end at end of parent node\nchild: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
+ prt_newline(&buf);
- log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf);
+ bch2_count_fsck_err(c, btree_node_topology_bad_max_key, &buf);
+ goto err;
}
out:
-fsck_err:
bch2_btree_and_journal_iter_exit(&iter);
bch2_bkey_buf_exit(&prev, c);
printbuf_exit(&buf);
return ret;
+err:
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_char(&buf, ' ');
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_newline(&buf);
+
+ ret = __bch2_topology_error(c, &buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ BUG_ON(!ret);
+ goto out;
}
/* Calculate ideal packed bkey format for new btree nodes: */
@@ -284,6 +284,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
struct disk_reservation *res,
struct closure *cl,
bool interior_node,
+ unsigned target,
unsigned flags)
{
struct bch_fs *c = trans->c;
@@ -317,6 +318,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
mutex_unlock(&c->btree_reserve_cache_lock);
retry:
ret = bch2_alloc_sectors_start_trans(trans,
+ target ?:
c->opts.metadata_target ?:
c->opts.foreground_target,
0,
@@ -325,7 +327,9 @@ retry:
res->nr_replicas,
min(res->nr_replicas,
c->opts.metadata_replicas_required),
- watermark, 0, cl, &wp);
+ watermark,
+ target ? BCH_WRITE_only_specified_devs : 0,
+ cl, &wp);
if (unlikely(ret))
goto err;
@@ -505,6 +509,7 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *
static int bch2_btree_reserve_get(struct btree_trans *trans,
struct btree_update *as,
unsigned nr_nodes[2],
+ unsigned target,
unsigned flags,
struct closure *cl)
{
@@ -527,7 +532,7 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
while (p->nr < nr_nodes[interior]) {
b = __bch2_btree_node_alloc(trans, &as->disk_res, cl,
- interior, flags);
+ interior, target, flags);
if (IS_ERR(b)) {
ret = PTR_ERR(b);
goto err;
@@ -679,12 +684,31 @@ static void btree_update_nodes_written(struct btree_update *as)
/*
* Wait for any in flight writes to finish before we free the old nodes
- * on disk:
+ * on disk. But we haven't pinned those old nodes in the btree cache,
+ * they might have already been evicted.
+ *
+ * The update we're completing deleted references to those nodes from the
+ * btree, so we know if they've been evicted they can't be pulled back in.
+ * We just have to check if the nodes we have pointers to are still those
+ * old nodes, and haven't been reused.
+ *
+ * This can't be done locklessly because the data buffer might have been
+ * vmalloc allocated, and they're not RCU freed. We also need the
+ * __no_kmsan_checks annotation because even with the btree node read
+ * lock, nothing tells us that the data buffer has been initialized (if
+ * the btree node has been reused for a different node, and the data
+ * buffer swapped for a new data buffer).
*/
for (i = 0; i < as->nr_old_nodes; i++) {
b = as->old_nodes[i];
- if (btree_node_seq_matches(b, as->old_nodes_seq[i]))
+ bch2_trans_begin(trans);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+ bool seq_matches = btree_node_seq_matches(b, as->old_nodes_seq[i]);
+ six_unlock_read(&b->c.lock);
+ bch2_trans_unlock_long(trans);
+
+ if (seq_matches)
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
TASK_UNINTERRUPTIBLE);
}
@@ -1116,7 +1140,8 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
static struct btree_update *
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
- unsigned level_start, bool split, unsigned flags)
+ unsigned level_start, bool split,
+ unsigned target, unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_update *as;
@@ -1226,7 +1251,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
if (ret)
goto err;
- ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
+ ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, NULL);
if (bch2_err_matches(ret, ENOSPC) ||
bch2_err_matches(ret, ENOMEM)) {
struct closure cl;
@@ -1238,14 +1263,14 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
if (bch2_err_matches(ret, ENOSPC) &&
(flags & BCH_TRANS_COMMIT_journal_reclaim) &&
watermark < BCH_WATERMARK_reclaim) {
- ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ ret = bch_err_throw(c, journal_reclaim_would_deadlock);
goto err;
}
closure_init_stack(&cl);
do {
- ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);
+ ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, &cl);
bch2_trans_unlock(trans);
bch2_wait_on_allocator(c, &cl);
@@ -1806,10 +1831,10 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
__func__, b->c.level);
bch2_btree_update_to_text(&buf, as);
bch2_btree_path_to_text(&buf, trans, path_idx);
+ bch2_fs_emergency_read_only2(c, &buf);
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
- bch2_fs_emergency_read_only(c);
return -EIO;
}
@@ -1878,7 +1903,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
as = bch2_btree_update_start(trans, trans->paths + path,
trans->paths[path].level,
- true, flags);
+ true, 0, flags);
if (IS_ERR(as))
return PTR_ERR(as);
@@ -1948,7 +1973,8 @@ int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path,
return bch2_btree_split_leaf(trans, path, flags);
struct btree_update *as =
- bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags);
+ bch2_btree_update_start(trans, trans->paths + path, b->c.level,
+ true, 0, flags);
if (IS_ERR(as))
return PTR_ERR(as);
@@ -2077,7 +2103,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
parent = btree_node_parent(trans->paths + path, b);
as = bch2_btree_update_start(trans, trans->paths + path, level, false,
- BCH_TRANS_COMMIT_no_enospc|flags);
+ 0, BCH_TRANS_COMMIT_no_enospc|flags);
ret = PTR_ERR_OR_ZERO(as);
if (ret)
goto err;
@@ -2170,7 +2196,7 @@ static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter,
if (btree_iter_path(trans, iter)->l[b->c.level].b != b) {
/* node has been freed: */
BUG_ON(!btree_node_dying(b));
- ret = -BCH_ERR_btree_node_dying;
+ ret = bch_err_throw(trans->c, btree_node_dying);
goto err;
}
@@ -2184,6 +2210,7 @@ err:
int bch2_btree_node_rewrite(struct btree_trans *trans,
struct btree_iter *iter,
struct btree *b,
+ unsigned target,
unsigned flags)
{
struct bch_fs *c = trans->c;
@@ -2196,7 +2223,8 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
struct btree_path *path = btree_iter_path(trans, iter);
parent = btree_node_parent(path, b);
- as = bch2_btree_update_start(trans, path, b->c.level, false, flags);
+ as = bch2_btree_update_start(trans, path, b->c.level,
+ false, target, flags);
ret = PTR_ERR_OR_ZERO(as);
if (ret)
goto out;
@@ -2261,7 +2289,7 @@ static int bch2_btree_node_rewrite_key(struct btree_trans *trans,
bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k);
ret = found
- ? bch2_btree_node_rewrite(trans, &iter, b, flags)
+ ? bch2_btree_node_rewrite(trans, &iter, b, 0, flags)
: -ENOENT;
out:
bch2_trans_iter_exit(trans, &iter);
@@ -2270,7 +2298,9 @@ out:
int bch2_btree_node_rewrite_pos(struct btree_trans *trans,
enum btree_id btree, unsigned level,
- struct bpos pos, unsigned flags)
+ struct bpos pos,
+ unsigned target,
+ unsigned flags)
{
BUG_ON(!level);
@@ -2282,7 +2312,7 @@ int bch2_btree_node_rewrite_pos(struct btree_trans *trans,
if (ret)
goto err;
- ret = bch2_btree_node_rewrite(trans, &iter, b, flags);
+ ret = bch2_btree_node_rewrite(trans, &iter, b, target, flags);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -2296,7 +2326,7 @@ int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans,
if (ret)
return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
- ret = bch2_btree_node_rewrite(trans, &iter, b, flags);
+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0, flags);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -2330,7 +2360,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work)
closure_wake_up(&c->btree_node_rewrites_wait);
bch2_bkey_buf_exit(&a->key, c);
- bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_node_rewrite);
kfree(a);
}
@@ -2351,8 +2381,8 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
bool now = false, pending = false;
spin_lock(&c->btree_node_rewrites_lock);
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay &&
- bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
+ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay) &&
+ enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_node_rewrite)) {
list_add(&a->list, &c->btree_node_rewrites);
now = true;
} else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
@@ -2391,7 +2421,7 @@ void bch2_do_pending_node_rewrites(struct bch_fs *c)
if (!a)
break;
- bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
+ enumerated_ref_get(&c->writes, BCH_WRITE_REF_node_rewrite);
queue_work(c->btree_node_rewrite_worker, &a->work);
}
}
@@ -2780,16 +2810,16 @@ int bch2_fs_btree_interior_update_init(struct bch_fs *c)
c->btree_interior_update_worker =
alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
if (!c->btree_interior_update_worker)
- return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+ return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init);
c->btree_node_rewrite_worker =
alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
if (!c->btree_node_rewrite_worker)
- return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+ return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init);
if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
sizeof(struct btree_update)))
- return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;
+ return bch_err_throw(c, ENOMEM_btree_interior_update_pool_init);
return 0;
}
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index be71cd73b864..7fe793788a79 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -144,7 +144,7 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
EBUG_ON(!btree_node_locked(path, level));
- if (bch2_btree_node_merging_disabled)
+ if (static_branch_unlikely(&bch2_btree_node_merging_disabled))
return 0;
b = path->l[level].b;
@@ -168,10 +168,10 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
}
int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
- struct btree *, unsigned);
+ struct btree *, unsigned, unsigned);
int bch2_btree_node_rewrite_pos(struct btree_trans *,
enum btree_id, unsigned,
- struct bpos, unsigned);
+ struct bpos, unsigned, unsigned);
int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *,
struct btree *, unsigned);
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index 0941fb2c026d..90b21e61d2b6 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -7,6 +7,7 @@
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "disk_accounting.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
@@ -181,6 +182,8 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
return wb_flush_one_slowpath(trans, iter, wb);
}
+ EBUG_ON(!bpos_eq(wb->k.k.p, path->pos));
+
bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
(*fast)++;
return 0;
@@ -391,7 +394,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
bool accounting_accumulated = false;
do {
if (race_fault()) {
- ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ ret = bch_err_throw(c, journal_reclaim_would_deadlock);
break;
}
@@ -629,11 +632,11 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer))
- return -BCH_ERR_erofs_no_writes;
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer))
+ return bch_err_throw(c, erofs_no_writes);
int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans);
- bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer);
return ret;
}
@@ -673,7 +676,7 @@ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans,
goto err;
bch2_bkey_buf_copy(last_flushed, c, tmp.k);
- ret = -BCH_ERR_transaction_restart_write_buffer_flush;
+ ret = bch_err_throw(c, transaction_restart_write_buffer_flush);
}
err:
bch2_bkey_buf_exit(&tmp, c);
@@ -692,7 +695,7 @@ static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
} while (!ret && bch2_btree_write_buffer_should_flush(c));
mutex_unlock(&wb->flushing.lock);
- bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer);
}
static void wb_accounting_sort(struct btree_write_buffer *wb)
@@ -821,9 +824,9 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_
bch2_journal_pin_drop(&c->journal, &dst->wb->pin);
if (bch2_btree_write_buffer_should_flush(c) &&
- __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) &&
+ __enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer) &&
!queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer);
if (dst->wb == &wb->flushing)
mutex_unlock(&wb->flushing.lock);
@@ -866,13 +869,18 @@ void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
darray_exit(&wb->inc.keys);
}
-int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
+void bch2_fs_btree_write_buffer_init_early(struct bch_fs *c)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
mutex_init(&wb->inc.lock);
mutex_init(&wb->flushing.lock);
INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work);
+}
+
+int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
/* Will be resized by journal as needed: */
unsigned initial_size = 1 << 16;
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
index d535cea28bde..05f56fd1eed0 100644
--- a/fs/bcachefs/btree_write_buffer.h
+++ b/fs/bcachefs/btree_write_buffer.h
@@ -101,6 +101,7 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_t
int bch2_btree_write_buffer_resize(struct bch_fs *, size_t);
void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
+void bch2_fs_btree_write_buffer_init_early(struct bch_fs *);
int bch2_fs_btree_write_buffer_init(struct bch_fs *);
#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 31fbc2716d8b..f25903c10e8a 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -156,10 +156,14 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
g->gen_valid = true;
g->gen = p.ptr.gen;
} else {
+ /* this pointer will be dropped */
*do_update = true;
+ goto out;
}
}
+ /* g->gen_valid == true */
+
if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
trans, ptr_gen_newer_than_bucket_gen,
"bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
@@ -172,15 +176,13 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
if (!p.ptr.cached &&
(g->data_type != BCH_DATA_btree ||
data_type == BCH_DATA_btree)) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
- g->data_type = 0;
+ g->data_type = data_type;
g->stripe_sectors = 0;
g->dirty_sectors = 0;
g->cached_sectors = 0;
- } else {
- *do_update = true;
}
+
+ *do_update = true;
}
if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
@@ -217,9 +219,22 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
bch2_data_type_str(data_type),
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- if (data_type == BCH_DATA_btree) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
+ if (!p.ptr.cached &&
+ data_type == BCH_DATA_btree) {
+ switch (g->data_type) {
+ case BCH_DATA_sb:
+ bch_err(c, "btree and superblock in the same bucket - cannot repair");
+ ret = bch_err_throw(c, fsck_repair_unimplemented);
+ goto out;
+ case BCH_DATA_journal:
+ ret = bch2_dev_journal_bucket_delete(ca, PTR_BUCKET_NR(ca, &p.ptr));
+ bch_err_msg(c, ret, "error deleting journal bucket %zu",
+ PTR_BUCKET_NR(ca, &p.ptr));
+ if (ret)
+ goto out;
+ break;
+ }
+
g->data_type = data_type;
g->stripe_sectors = 0;
g->dirty_sectors = 0;
@@ -269,6 +284,9 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
struct printbuf buf = PRINTBUF;
int ret = 0;
+ /* We don't yet do btree key updates correctly for when we're RW */
+ BUG_ON(test_bit(BCH_FS_rw, &c->flags));
+
bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update);
if (ret)
@@ -276,20 +294,13 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
}
if (do_update) {
- if (flags & BTREE_TRIGGER_is_root) {
- bch_err(c, "cannot update btree roots yet");
- ret = -EINVAL;
- goto err;
- }
-
struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
ret = PTR_ERR_OR_ZERO(new);
if (ret)
goto err;
- rcu_read_lock();
- bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev));
- rcu_read_unlock();
+ scoped_guard(rcu)
+ bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev));
if (level) {
/*
@@ -298,14 +309,11 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
* sort it out:
*/
struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- rcu_read_lock();
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- struct bucket *g = PTR_GC_BUCKET(ca, ptr);
-
- ptr->gen = g->gen;
- }
- rcu_read_unlock();
+ scoped_guard(rcu)
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ ptr->gen = PTR_GC_BUCKET(ca, ptr)->gen;
+ }
} else {
struct bkey_ptrs ptrs;
union bch_extent_entry *entry;
@@ -369,19 +377,41 @@ found:
bch_info(c, "new key %s", buf.buf);
}
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
- BTREE_ITER_intent|BTREE_ITER_all_snapshots);
- ret = bch2_btree_iter_traverse(trans, &iter) ?:
- bch2_trans_update(trans, &iter, new,
- BTREE_UPDATE_internal_snapshot_node|
- BTREE_TRIGGER_norun);
- bch2_trans_iter_exit(trans, &iter);
- if (ret)
- goto err;
+ if (!(flags & BTREE_TRIGGER_is_root)) {
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
+ BTREE_ITER_intent|BTREE_ITER_all_snapshots);
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
+ bch2_trans_update(trans, &iter, new,
+ BTREE_UPDATE_internal_snapshot_node|
+ BTREE_TRIGGER_norun);
+ bch2_trans_iter_exit(trans, &iter);
+ if (ret)
+ goto err;
+
+ if (level)
+ bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
+ } else {
+ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
+ jset_u64s(new->k.u64s));
+ ret = PTR_ERR_OR_ZERO(e);
+ if (ret)
+ goto err;
- if (level)
- bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
+ journal_entry_set(e,
+ BCH_JSET_ENTRY_btree_root,
+ btree, level - 1,
+ new, new->k.u64s);
+
+ /*
+ * no locking, we're single threaded and not rw yet, see
+ * the big assertino above that we repeat here:
+ */
+ BUG_ON(test_bit(BCH_FS_rw, &c->flags));
+
+ struct btree *b = bch2_btree_id_root(c, btree)->b;
+ bkey_copy(&b->key, new);
+ }
}
err:
printbuf_exit(&buf);
@@ -392,29 +422,32 @@ static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf
struct bkey_s_c k, bool insert, enum bch_sb_error_id id)
{
struct bch_fs *c = trans->c;
- bool repeat = false, print = true, suppress = false;
prt_printf(buf, "\nwhile marking ");
bch2_bkey_val_to_text(buf, c, k);
prt_newline(buf);
- __bch2_count_fsck_err(c, id, buf->buf, &repeat, &print, &suppress);
+ bool print = __bch2_count_fsck_err(c, id, buf);
- int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
+ int ret = bch2_run_explicit_recovery_pass(c, buf,
+ BCH_RECOVERY_PASS_check_allocations, 0);
if (insert) {
- print = true;
- suppress = false;
-
bch2_trans_updates_to_text(buf, trans);
__bch2_inconsistent_error(c, buf);
- ret = -BCH_ERR_bucket_ref_update;
+ /*
+ * If we're in recovery, run_explicit_recovery_pass might give
+ * us an error code for rewinding recovery
+ */
+ if (!ret)
+ ret = bch_err_throw(c, bucket_ref_update);
+ } else {
+ /* Always ignore overwrite errors, so that deletion works */
+ ret = 0;
}
- if (suppress)
- prt_printf(buf, "Ratelimiting new instances of previous error\n");
- if (print)
- bch2_print_string_as_lines(KERN_ERR, buf->buf);
+ if (print || insert)
+ bch2_print_str(c, KERN_ERR, buf->buf);
return ret;
}
@@ -599,7 +632,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
if (unlikely(!ca)) {
if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID)
- ret = -BCH_ERR_trigger_pointer;
+ ret = bch_err_throw(c, trigger_pointer);
goto err;
}
@@ -607,7 +640,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
if (!bucket_valid(ca, bucket.offset)) {
if (insert) {
bch2_dev_bucket_missing(ca, bucket.offset);
- ret = -BCH_ERR_trigger_pointer;
+ ret = bch_err_throw(c, trigger_pointer);
}
goto err;
}
@@ -629,7 +662,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
p.ptr.dev,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = -BCH_ERR_trigger_pointer;
+ ret = bch_err_throw(c, trigger_pointer);
goto err;
}
@@ -655,6 +688,8 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
s64 sectors,
enum btree_iter_update_trigger_flags flags)
{
+ struct bch_fs *c = trans->c;
+
if (flags & BTREE_TRIGGER_transactional) {
struct btree_iter iter;
struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter,
@@ -672,7 +707,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
bch2_trans_inconsistent(trans,
"stripe pointer doesn't match stripe %llu",
(u64) p.ec.idx);
- ret = -BCH_ERR_trigger_stripe_pointer;
+ ret = bch_err_throw(c, trigger_stripe_pointer);
goto err;
}
@@ -692,13 +727,11 @@ err:
}
if (flags & BTREE_TRIGGER_gc) {
- struct bch_fs *c = trans->c;
-
struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL);
if (!m) {
bch_err(c, "error allocating memory for gc_stripes, idx %llu",
(u64) p.ec.idx);
- return -BCH_ERR_ENOMEM_mark_stripe_ptr;
+ return bch_err_throw(c, ENOMEM_mark_stripe_ptr);
}
gc_stripe_lock(m);
@@ -711,9 +744,9 @@ err:
(u64) p.ec.idx);
bch2_bkey_val_to_text(&buf, c, k);
__bch2_inconsistent_error(c, &buf);
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
- return -BCH_ERR_trigger_stripe_pointer;
+ return bch_err_throw(c, trigger_stripe_pointer);
}
m->block_sectors[p.ec.block] += sectors;
@@ -736,8 +769,7 @@ err:
static int __trigger_extent(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k,
- enum btree_iter_update_trigger_flags flags,
- s64 *replicas_sectors)
+ enum btree_iter_update_trigger_flags flags)
{
bool gc = flags & BTREE_TRIGGER_gc;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -748,6 +780,8 @@ static int __trigger_extent(struct btree_trans *trans,
: BCH_DATA_user;
int ret = 0;
+ s64 replicas_sectors = 0;
+
struct disk_accounting_pos acc_replicas_key;
memset(&acc_replicas_key, 0, sizeof(acc_replicas_key));
acc_replicas_key.type = BCH_DISK_ACCOUNTING_replicas;
@@ -774,7 +808,7 @@ static int __trigger_extent(struct btree_trans *trans,
if (ret)
return ret;
} else if (!p.has_ec) {
- *replicas_sectors += disk_sectors;
+ replicas_sectors += disk_sectors;
replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev);
} else {
ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
@@ -812,13 +846,13 @@ static int __trigger_extent(struct btree_trans *trans,
}
if (acc_replicas_key.replicas.nr_devs) {
- ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc);
+ ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc);
if (ret)
return ret;
}
if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) {
- ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, snapshot, k.k->p.snapshot);
+ ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, snapshot, k.k->p.snapshot);
if (ret)
return ret;
}
@@ -834,7 +868,7 @@ static int __trigger_extent(struct btree_trans *trans,
}
if (level) {
- ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, btree, btree_id);
+ ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, btree, btree_id);
if (ret)
return ret;
} else {
@@ -843,7 +877,7 @@ static int __trigger_extent(struct btree_trans *trans,
s64 v[3] = {
insert ? 1 : -1,
insert ? k.k->size : -((s64) k.k->size),
- *replicas_sectors,
+ replicas_sectors,
};
ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode);
if (ret)
@@ -875,20 +909,16 @@ int bch2_trigger_extent(struct btree_trans *trans,
return 0;
if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
- s64 old_replicas_sectors = 0, new_replicas_sectors = 0;
-
if (old.k->type) {
int ret = __trigger_extent(trans, btree, level, old,
- flags & ~BTREE_TRIGGER_insert,
- &old_replicas_sectors);
+ flags & ~BTREE_TRIGGER_insert);
if (ret)
return ret;
}
if (new.k->type) {
int ret = __trigger_extent(trans, btree, level, new.s_c,
- flags & ~BTREE_TRIGGER_overwrite,
- &new_replicas_sectors);
+ flags & ~BTREE_TRIGGER_overwrite);
if (ret)
return ret;
}
@@ -966,15 +996,25 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
return PTR_ERR(a);
if (a->v.data_type && type && a->v.data_type != type) {
- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
- log_fsck_err(trans, bucket_metadata_type_mismatch,
- "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
- "while marking %s",
- iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_type_str(a->v.data_type),
- bch2_data_type_str(type),
- bch2_data_type_str(type));
- ret = -BCH_ERR_metadata_bucket_inconsistency;
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+ prt_printf(&buf, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
+ "while marking %s\n",
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_type_str(a->v.data_type),
+ bch2_data_type_str(type),
+ bch2_data_type_str(type));
+
+ bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf);
+
+ ret = bch2_run_explicit_recovery_pass(c, &buf,
+ BCH_RECOVERY_PASS_check_allocations, 0);
+
+ /* Always print, this is always fatal */
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ if (!ret)
+ ret = bch_err_throw(c, metadata_bucket_inconsistency);
goto err;
}
@@ -985,7 +1025,6 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
}
err:
-fsck_err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -1028,7 +1067,7 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *
err_unlock:
bucket_unlock(g);
err:
- return -BCH_ERR_metadata_bucket_inconsistency;
+ return bch_err_throw(c, metadata_bucket_inconsistency);
}
int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
@@ -1143,10 +1182,10 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca,
int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c,
enum btree_iter_update_trigger_flags flags)
{
- for_each_online_member(c, ca) {
+ for_each_online_member(c, ca, BCH_DEV_READ_REF_trans_mark_dev_sbs) {
int ret = bch2_trans_mark_dev_sb(c, ca, flags);
if (ret) {
- percpu_ref_put(&ca->io_ref[READ]);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_trans_mark_dev_sbs);
return ret;
}
}
@@ -1243,7 +1282,7 @@ recalculate:
ret = 0;
} else {
atomic64_set(&c->sectors_available, sectors_available);
- ret = -BCH_ERR_ENOSPC_disk_reservation;
+ ret = bch_err_throw(c, ENOSPC_disk_reservation);
}
mutex_unlock(&c->sectors_available_lock);
@@ -1272,7 +1311,7 @@ int bch2_buckets_nouse_alloc(struct bch_fs *c)
GFP_KERNEL|__GFP_ZERO);
if (!ca->buckets_nouse) {
bch2_dev_put(ca);
- return -BCH_ERR_ENOMEM_buckets_nouse;
+ return bch_err_throw(c, ENOMEM_buckets_nouse);
}
}
@@ -1297,12 +1336,12 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
lockdep_assert_held(&c->state_lock);
if (resize && ca->buckets_nouse)
- return -BCH_ERR_no_resize_with_buckets_nouse;
+ return bch_err_throw(c, no_resize_with_buckets_nouse);
bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets),
GFP_KERNEL|__GFP_ZERO);
if (!bucket_gens) {
- ret = -BCH_ERR_ENOMEM_bucket_gens;
+ ret = bch_err_throw(c, ENOMEM_bucket_gens);
goto err;
}
@@ -1321,6 +1360,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
sizeof(bucket_gens->b[0]) * copy);
}
+ ret = bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_mismatch,
+ ca->mi.nbuckets, nbuckets) ?:
+ bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_empty,
+ ca->mi.nbuckets, nbuckets);
+
rcu_assign_pointer(ca->bucket_gens, bucket_gens);
bucket_gens = old_bucket_gens;
@@ -1345,7 +1389,7 @@ int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
ca->usage = alloc_percpu(struct bch_dev_usage_full);
if (!ca->usage)
- return -BCH_ERR_ENOMEM_usage_init;
+ return bch_err_throw(c, ENOMEM_usage_init);
return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
}
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index af1532de4a37..49a3807a5eab 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -84,10 +84,8 @@ static inline int bucket_gen_get_rcu(struct bch_dev *ca, size_t b)
static inline int bucket_gen_get(struct bch_dev *ca, size_t b)
{
- rcu_read_lock();
- int ret = bucket_gen_get_rcu(ca, b);
- rcu_read_unlock();
- return ret;
+ guard(rcu)();
+ return bucket_gen_get_rcu(ca, b);
}
static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
@@ -156,10 +154,8 @@ static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_
*/
static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
{
- rcu_read_lock();
- int ret = dev_ptr_stale_rcu(ca, ptr);
- rcu_read_unlock();
- return ret;
+ guard(rcu)();
+ return dev_ptr_stale_rcu(ca, ptr);
}
/* Device usage: */
diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c
index c8a488e6b7b8..832eff93acb6 100644
--- a/fs/bcachefs/buckets_waiting_for_journal.c
+++ b/fs/bcachefs/buckets_waiting_for_journal.c
@@ -108,7 +108,8 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
realloc:
n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL);
if (!n) {
- ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set;
+ struct bch_fs *c = container_of(b, struct bch_fs, buckets_waiting_for_journal);
+ ret = bch_err_throw(c, ENOMEM_buckets_waiting_for_journal_set);
goto out;
}
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 5891b3a1e61c..2d38466eddfd 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -613,13 +613,12 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
if (!dev)
return -EINVAL;
- for_each_online_member(c, ca)
- if (ca->dev == dev) {
- percpu_ref_put(&ca->io_ref[READ]);
+ guard(rcu)();
+ for_each_online_member_rcu(c, ca)
+ if (ca->dev == dev)
return ca->dev_idx;
- }
- return -BCH_ERR_ENOENT_dev_idx_not_found;
+ return bch_err_throw(c, ENOENT_dev_idx_not_found);
}
static long bch2_ioctl_disk_resize(struct bch_fs *c,
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index d0a34a097b80..a6795e73f0b9 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -91,7 +91,7 @@ static void bch2_checksum_update(struct bch2_checksum_state *state, const void *
}
}
-static void bch2_chacha20_init(u32 state[CHACHA_STATE_WORDS],
+static void bch2_chacha20_init(struct chacha_state *state,
const struct bch_key *key, struct nonce nonce)
{
u32 key_words[CHACHA_KEY_SIZE / sizeof(u32)];
@@ -106,14 +106,14 @@ static void bch2_chacha20_init(u32 state[CHACHA_STATE_WORDS],
memzero_explicit(key_words, sizeof(key_words));
}
-static void bch2_chacha20(const struct bch_key *key, struct nonce nonce,
- void *data, size_t len)
+void bch2_chacha20(const struct bch_key *key, struct nonce nonce,
+ void *data, size_t len)
{
- u32 state[CHACHA_STATE_WORDS];
+ struct chacha_state state;
- bch2_chacha20_init(state, key, nonce);
- chacha20_crypt(state, data, data, len);
- memzero_explicit(state, sizeof(state));
+ bch2_chacha20_init(&state, key, nonce);
+ chacha20_crypt(&state, data, data, len);
+ chacha_zeroize_state(&state);
}
static void bch2_poly1305_init(struct poly1305_desc_ctx *desc,
@@ -173,7 +173,7 @@ int bch2_encrypt(struct bch_fs *c, unsigned type,
if (bch2_fs_inconsistent_on(!c->chacha20_key_set,
c, "attempting to encrypt without encryption key"))
- return -BCH_ERR_no_encryption_key;
+ return bch_err_throw(c, no_encryption_key);
bch2_chacha20(&c->chacha20_key, nonce, data, len);
return 0;
@@ -257,14 +257,14 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
{
struct bio_vec bv;
struct bvec_iter iter;
- u32 chacha_state[CHACHA_STATE_WORDS];
+ struct chacha_state chacha_state;
int ret = 0;
if (bch2_fs_inconsistent_on(!c->chacha20_key_set,
c, "attempting to encrypt without encryption key"))
- return -BCH_ERR_no_encryption_key;
+ return bch_err_throw(c, no_encryption_key);
- bch2_chacha20_init(chacha_state, &c->chacha20_key, nonce);
+ bch2_chacha20_init(&chacha_state, &c->chacha20_key, nonce);
bio_for_each_segment(bv, bio, iter) {
void *p;
@@ -280,10 +280,10 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
}
p = bvec_kmap_local(&bv);
- chacha20_crypt(chacha_state, p, p, bv.bv_len);
+ chacha20_crypt(&chacha_state, p, p, bv.bv_len);
kunmap_local(p);
}
- memzero_explicit(chacha_state, sizeof(chacha_state));
+ chacha_zeroize_state(&chacha_state);
return ret;
}
@@ -375,7 +375,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
prt_str(&buf, ")");
WARN_RATELIMIT(1, "%s", buf.buf);
printbuf_exit(&buf);
- return -BCH_ERR_recompute_checksum;
+ return bch_err_throw(c, recompute_checksum);
}
for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
@@ -659,7 +659,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
crypt = bch2_sb_field_resize(&c->disk_sb, crypt,
sizeof(*crypt) / sizeof(u64));
if (!crypt) {
- ret = -BCH_ERR_ENOSPC_sb_crypt;
+ ret = bch_err_throw(c, ENOSPC_sb_crypt);
goto err;
}
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 1310782d3ae9..7bd9cf6104ca 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -69,6 +69,8 @@ static inline void bch2_csum_err_msg(struct printbuf *out,
bch2_csum_to_text(out, type, expected);
}
+void bch2_chacha20(const struct bch_key *, struct nonce, void *, size_t);
+
int bch2_request_key(struct bch_sb *, struct bch_key *);
#ifndef __KERNEL__
int bch2_revoke_key(struct bch_sb *);
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
index d6dd12d74d4f..8e9264b5a84e 100644
--- a/fs/bcachefs/clock.c
+++ b/fs/bcachefs/clock.c
@@ -53,7 +53,6 @@ void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
struct io_clock_wait {
struct io_timer io_timer;
- struct timer_list cpu_timer;
struct task_struct *task;
int expired;
};
@@ -67,15 +66,6 @@ static void io_clock_wait_fn(struct io_timer *timer)
wake_up_process(wait->task);
}
-static void io_clock_cpu_timeout(struct timer_list *timer)
-{
- struct io_clock_wait *wait = container_of(timer,
- struct io_clock_wait, cpu_timer);
-
- wait->expired = 1;
- wake_up_process(wait->task);
-}
-
void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until)
{
struct io_clock_wait wait = {
@@ -90,8 +80,8 @@ void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until)
bch2_io_timer_del(clock, &wait.io_timer);
}
-void bch2_kthread_io_clock_wait(struct io_clock *clock,
- u64 io_until, unsigned long cpu_timeout)
+unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *clock,
+ u64 io_until, unsigned long cpu_timeout)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct io_clock_wait wait = {
@@ -103,27 +93,26 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
bch2_io_timer_add(clock, &wait.io_timer);
- timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0);
-
- if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
- mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
-
- do {
- set_current_state(TASK_INTERRUPTIBLE);
- if (kthread && kthread_should_stop())
- break;
-
- if (wait.expired)
- break;
-
- schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!(kthread && kthread_should_stop())) {
+ cpu_timeout = schedule_timeout(cpu_timeout);
try_to_freeze();
- } while (0);
+ }
__set_current_state(TASK_RUNNING);
- timer_delete_sync(&wait.cpu_timer);
- destroy_timer_on_stack(&wait.cpu_timer);
bch2_io_timer_del(clock, &wait.io_timer);
+ return cpu_timeout;
+}
+
+void bch2_kthread_io_clock_wait(struct io_clock *clock,
+ u64 io_until, unsigned long cpu_timeout)
+{
+ bool kthread = (current->flags & PF_KTHREAD) != 0;
+
+ while (!(kthread && kthread_should_stop()) &&
+ cpu_timeout &&
+ atomic64_read(&clock->now) < io_until)
+ cpu_timeout = bch2_kthread_io_clock_wait_once(clock, io_until, cpu_timeout);
}
static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now)
diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h
index 82c79c8baf92..8769be2aa21e 100644
--- a/fs/bcachefs/clock.h
+++ b/fs/bcachefs/clock.h
@@ -4,6 +4,7 @@
void bch2_io_timer_add(struct io_clock *, struct io_timer *);
void bch2_io_timer_del(struct io_clock *, struct io_timer *);
+unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *, u64, unsigned long);
void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long);
void __bch2_increment_clock(struct io_clock *, u64);
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 28ed32449913..b37b1f325f0a 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -187,7 +187,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
__bch2_compression_types[crc.compression_type]))
ret = bch2_check_set_has_compressed_data(c, opt);
else
- ret = -BCH_ERR_compression_workspace_not_initialized;
+ ret = bch_err_throw(c, compression_workspace_not_initialized);
if (ret)
goto err;
}
@@ -200,7 +200,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data,
src_len, dst_len, dst_len);
if (ret2 != dst_len)
- ret = -BCH_ERR_decompress_lz4;
+ ret = bch_err_throw(c, decompress_lz4);
break;
case BCH_COMPRESSION_TYPE_gzip: {
z_stream strm = {
@@ -219,7 +219,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
mempool_free(workspace, workspace_pool);
if (ret2 != Z_STREAM_END)
- ret = -BCH_ERR_decompress_gzip;
+ ret = bch_err_throw(c, decompress_gzip);
break;
}
case BCH_COMPRESSION_TYPE_zstd: {
@@ -227,7 +227,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
size_t real_src_len = le32_to_cpup(src_data.b);
if (real_src_len > src_len - 4) {
- ret = -BCH_ERR_decompress_zstd_src_len_bad;
+ ret = bch_err_throw(c, decompress_zstd_src_len_bad);
goto err;
}
@@ -241,7 +241,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
mempool_free(workspace, workspace_pool);
if (ret2 != dst_len)
- ret = -BCH_ERR_decompress_zstd;
+ ret = bch_err_throw(c, decompress_zstd);
break;
}
default:
@@ -270,7 +270,7 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op,
bch2_write_op_error(op, op->pos.offset,
"extent too big to decompress (%u > %u)",
crc->uncompressed_size << 9, c->opts.encoded_extent_max);
- return -BCH_ERR_decompress_exceeded_max_encoded_extent;
+ return bch_err_throw(c, decompress_exceeded_max_encoded_extent);
}
data = __bounce_alloc(c, dst_len, WRITE);
@@ -314,7 +314,7 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max ||
crc.compressed_size << 9 > c->opts.encoded_extent_max)
- return -BCH_ERR_decompress_exceeded_max_encoded_extent;
+ return bch_err_throw(c, decompress_exceeded_max_encoded_extent);
dst_data = dst_len == dst_iter.bi_size
? __bio_map_or_bounce(c, dst, dst_iter, WRITE)
@@ -656,12 +656,12 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
if (!mempool_initialized(&c->compression_bounce[READ]) &&
mempool_init_kvmalloc_pool(&c->compression_bounce[READ],
1, c->opts.encoded_extent_max))
- return -BCH_ERR_ENOMEM_compression_bounce_read_init;
+ return bch_err_throw(c, ENOMEM_compression_bounce_read_init);
if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE],
1, c->opts.encoded_extent_max))
- return -BCH_ERR_ENOMEM_compression_bounce_write_init;
+ return bch_err_throw(c, ENOMEM_compression_bounce_write_init);
for (i = compression_types;
i < compression_types + ARRAY_SIZE(compression_types);
@@ -675,7 +675,7 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
if (mempool_init_kvmalloc_pool(
&c->compress_workspace[i->type],
1, i->compress_workspace))
- return -BCH_ERR_ENOMEM_compression_workspace_init;
+ return bch_err_throw(c, ENOMEM_compression_workspace_init);
}
return 0;
@@ -714,7 +714,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res,
ret = match_string(bch2_compression_opts, -1, type_str);
if (ret < 0 && err)
- prt_str(err, "invalid compression type");
+ prt_printf(err, "invalid compression type\n");
if (ret < 0)
goto err;
@@ -729,7 +729,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res,
if (!ret && level > 15)
ret = -EINVAL;
if (ret < 0 && err)
- prt_str(err, "invalid compression level");
+ prt_printf(err, "invalid compression level\n");
if (ret < 0)
goto err;
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
index c6151495985f..4080ee99aadd 100644
--- a/fs/bcachefs/darray.h
+++ b/fs/bcachefs/darray.h
@@ -8,6 +8,7 @@
* Inspired by CCAN's darray
*/
+#include <linux/cleanup.h>
#include <linux/slab.h>
#define DARRAY_PREALLOCATED(_type, _nr) \
@@ -20,7 +21,18 @@ struct { \
#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
typedef DARRAY(char) darray_char;
-typedef DARRAY(char *) darray_str;
+typedef DARRAY(char *) darray_str;
+typedef DARRAY(const char *) darray_const_str;
+
+typedef DARRAY(u8) darray_u8;
+typedef DARRAY(u16) darray_u16;
+typedef DARRAY(u32) darray_u32;
+typedef DARRAY(u64) darray_u64;
+
+typedef DARRAY(s8) darray_s8;
+typedef DARRAY(s16) darray_s16;
+typedef DARRAY(s32) darray_s32;
+typedef DARRAY(s64) darray_s64;
int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
@@ -76,7 +88,23 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
#define darray_remove_item(_d, _pos) \
array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
-#define __darray_for_each(_d, _i) \
+#define darray_find_p(_d, _i, cond) \
+({ \
+ typeof((_d).data) _ret = NULL; \
+ \
+ darray_for_each(_d, _i) \
+ if (cond) { \
+ _ret = _i; \
+ break; \
+ } \
+ _ret; \
+})
+
+#define darray_find(_d, _item) darray_find_p(_d, _i, *_i == _item)
+
+/* Iteration: */
+
+#define __darray_for_each(_d, _i) \
for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++)
#define darray_for_each(_d, _i) \
@@ -85,6 +113,8 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
#define darray_for_each_reverse(_d, _i) \
for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i)
+/* Init/exit */
+
#define darray_init(_d) \
do { \
(_d)->nr = 0; \
@@ -100,4 +130,29 @@ do { \
darray_init(_d); \
} while (0)
+#define DEFINE_DARRAY_CLASS(_type) \
+DEFINE_CLASS(_type, _type, darray_exit(&(_T)), (_type) {}, void)
+
+#define DEFINE_DARRAY(_type) \
+typedef DARRAY(_type) darray_##_type; \
+DEFINE_DARRAY_CLASS(darray_##_type)
+
+#define DEFINE_DARRAY_NAMED(_name, _type) \
+typedef DARRAY(_type) _name; \
+DEFINE_DARRAY_CLASS(_name)
+
+DEFINE_DARRAY_CLASS(darray_char);
+DEFINE_DARRAY_CLASS(darray_str)
+DEFINE_DARRAY_CLASS(darray_const_str)
+
+DEFINE_DARRAY_CLASS(darray_u8)
+DEFINE_DARRAY_CLASS(darray_u16)
+DEFINE_DARRAY_CLASS(darray_u32)
+DEFINE_DARRAY_CLASS(darray_u64)
+
+DEFINE_DARRAY_CLASS(darray_s8)
+DEFINE_DARRAY_CLASS(darray_s16)
+DEFINE_DARRAY_CLASS(darray_s32)
+DEFINE_DARRAY_CLASS(darray_s64)
+
#endif /* _BCACHEFS_DARRAY_H */
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index b211c97238ab..5f1174348974 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -66,43 +66,53 @@ static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k)
}
}
-static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_s_c k)
+static noinline_for_stack
+bool __bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs,
+ const struct bch_extent_ptr *start)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ if (!ctxt) {
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (ptr == start)
+ break;
+
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+ bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
+ }
+ return false;
+ }
- bkey_for_each_ptr(ptrs, ptr) {
+ __bkey_for_each_ptr(start, ptrs.end, ptr) {
struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
- if (ctxt) {
- bool locked;
-
- move_ctxt_wait_event(ctxt,
- (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
- list_empty(&ctxt->ios));
+ bool locked;
+ move_ctxt_wait_event(ctxt,
+ (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
+ list_empty(&ctxt->ios));
+ if (!locked)
+ bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
+ }
+ return true;
+}
- if (!locked)
- bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
- } else {
- if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) {
- bkey_for_each_ptr(ptrs, ptr2) {
- if (ptr2 == ptr)
- break;
+static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs)
+{
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
- ca = bch2_dev_have_ref(c, ptr2->dev);
- bucket = PTR_BUCKET_POS(ca, ptr2);
- bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
- }
- return false;
- }
- }
+ if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0))
+ return __bkey_nocow_lock(c, ctxt, ptrs, ptr);
}
+
return true;
}
-static noinline void trace_io_move_finish2(struct data_update *u,
- struct bkey_i *new,
- struct bkey_i *insert)
+noinline_for_stack
+static void trace_io_move_finish2(struct data_update *u,
+ struct bkey_i *new,
+ struct bkey_i *insert)
{
struct bch_fs *c = u->op.c;
struct printbuf buf = PRINTBUF;
@@ -124,6 +134,7 @@ static noinline void trace_io_move_finish2(struct data_update *u,
printbuf_exit(&buf);
}
+noinline_for_stack
static void trace_io_move_fail2(struct data_update *m,
struct bkey_s_c new,
struct bkey_s_c wrote,
@@ -179,24 +190,84 @@ static void trace_io_move_fail2(struct data_update *m,
printbuf_exit(&buf);
}
+noinline_for_stack
+static void trace_data_update2(struct data_update *m,
+ struct bkey_s_c old, struct bkey_s_c k,
+ struct bkey_i *insert)
+{
+ struct bch_fs *c = m->op.c;
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, old);
+ prt_str(&buf, "\nk: ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+
+ trace_data_update(c, buf.buf);
+ printbuf_exit(&buf);
+}
+
+noinline_for_stack
+static void trace_io_move_created_rebalance2(struct data_update *m,
+ struct bkey_s_c old, struct bkey_s_c k,
+ struct bkey_i *insert)
+{
+ struct bch_fs *c = m->op.c;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);
+
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, old);
+ prt_str(&buf, "\nk: ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+
+ trace_io_move_created_rebalance(c, buf.buf);
+ printbuf_exit(&buf);
+
+ this_cpu_inc(c->counters[BCH_COUNTER_io_move_created_rebalance]);
+}
+
+noinline_for_stack
+static int data_update_invalid_bkey(struct data_update *m,
+ struct bkey_s_c old, struct bkey_s_c k,
+ struct bkey_i *insert)
+{
+ struct bch_fs *c = m->op.c;
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+
+ prt_str(&buf, "about to insert invalid key in data update path");
+ prt_printf(&buf, "\nop.nonce: %u", m->op.nonce);
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, old);
+ prt_str(&buf, "\nk: ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+
+ bch2_fs_emergency_read_only2(c, &buf);
+
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+
+ return bch_err_throw(c, invalid_bkey);
+}
+
static int __bch2_data_update_index_update(struct btree_trans *trans,
struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct btree_iter iter;
- struct data_update *m =
- container_of(op, struct data_update, op);
- struct keylist *keys = &op->insert_keys;
- struct bkey_buf _new, _insert;
- struct printbuf journal_msg = PRINTBUF;
+ struct data_update *m = container_of(op, struct data_update, op);
int ret = 0;
- bch2_bkey_buf_init(&_new);
- bch2_bkey_buf_init(&_insert);
- bch2_bkey_buf_realloc(&_insert, c, U8_MAX);
-
bch2_trans_iter_init(trans, &iter, m->btree_id,
- bkey_start_pos(&bch2_keylist_front(keys)->k),
+ bkey_start_pos(&bch2_keylist_front(&op->insert_keys)->k),
BTREE_ITER_slots|BTREE_ITER_intent);
while (1) {
@@ -221,19 +292,30 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
if (ret)
goto err;
- new = bkey_i_to_extent(bch2_keylist_front(keys));
+ new = bkey_i_to_extent(bch2_keylist_front(&op->insert_keys));
if (!bch2_extents_match(k, old)) {
trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i),
- NULL, "no match:");
+ NULL, "no match:");
goto nowork;
}
- bkey_reassemble(_insert.k, k);
- insert = _insert.k;
+ insert = bch2_trans_kmalloc(trans,
+ bkey_bytes(k.k) +
+ bkey_val_bytes(&new->k) +
+ sizeof(struct bch_extent_rebalance));
+ ret = PTR_ERR_OR_ZERO(insert);
+ if (ret)
+ goto err;
- bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
- new = bkey_i_to_extent(_new.k);
+ bkey_reassemble(insert, k);
+
+ new = bch2_trans_kmalloc(trans, bkey_bytes(&new->k));
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto err;
+
+ bkey_copy(&new->k_i, bch2_keylist_front(&op->insert_keys));
bch2_cut_front(iter.pos, &new->k_i);
bch2_cut_front(iter.pos, insert);
@@ -294,21 +376,21 @@ restart_drop_conflicting_replicas:
bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));
/* Now, drop excess replicas: */
- rcu_read_lock();
+ scoped_guard(rcu) {
restart_drop_extra_replicas:
- bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
- unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
+ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
+ unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
- if (!p.ptr.cached &&
- durability - ptr_durability >= m->op.opts.data_replicas) {
- durability -= ptr_durability;
+ if (!p.ptr.cached &&
+ durability - ptr_durability >= m->op.opts.data_replicas) {
+ durability -= ptr_durability;
- bch2_extent_ptr_set_cached(c, &m->op.opts,
- bkey_i_to_s(insert), &entry->ptr);
- goto restart_drop_extra_replicas;
+ bch2_extent_ptr_set_cached(c, &m->op.opts,
+ bkey_i_to_s(insert), &entry->ptr);
+ goto restart_drop_extra_replicas;
+ }
}
}
- rcu_read_unlock();
/* Finally, add the pointers we just wrote: */
extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
@@ -346,44 +428,12 @@ restart_drop_extra_replicas:
.btree = m->btree_id,
.flags = BCH_VALIDATE_commit,
});
- if (invalid) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "about to insert invalid key in data update path");
- prt_printf(&buf, "\nop.nonce: %u", m->op.nonce);
- prt_str(&buf, "\nold: ");
- bch2_bkey_val_to_text(&buf, c, old);
- prt_str(&buf, "\nk: ");
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, "\nnew: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
-
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
- printbuf_exit(&buf);
-
- bch2_fatal_error(c);
- ret = -BCH_ERR_invalid_bkey;
+ if (unlikely(invalid)) {
+ ret = data_update_invalid_bkey(m, old, k, insert);
goto out;
}
- if (trace_data_update_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "\nold: ");
- bch2_bkey_val_to_text(&buf, c, old);
- prt_str(&buf, "\nk: ");
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, "\nnew: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
-
- trace_data_update(c, buf.buf);
- printbuf_exit(&buf);
- }
-
- printbuf_reset(&journal_msg);
- prt_str(&journal_msg, bch2_data_update_type_strs[m->type]);
-
- ret = bch2_trans_log_msg(trans, &journal_msg) ?:
+ ret = bch2_trans_log_str(trans, bch2_data_update_type_strs[m->type]) ?:
bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, bkey_start_pos(&insert->k)) ?:
@@ -391,28 +441,39 @@ restart_drop_extra_replicas:
k.k->p, insert->k.p) ?:
bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?:
bch2_trans_update(trans, &iter, insert,
- BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_trans_commit(trans, &op->res,
+ BTREE_UPDATE_internal_snapshot_node);
+ if (ret)
+ goto err;
+
+ if (trace_data_update_enabled())
+ trace_data_update2(m, old, k, insert);
+
+ if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size >
+ bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size)
+ trace_io_move_created_rebalance2(m, old, k, insert);
+
+ ret = bch2_trans_commit(trans, &op->res,
NULL,
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
m->data_opts.btree_insert_flags);
- if (!ret) {
- bch2_btree_iter_set_pos(trans, &iter, next_pos);
+ if (ret)
+ goto err;
- this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size);
- if (trace_io_move_finish_enabled())
- trace_io_move_finish2(m, &new->k_i, insert);
- }
+ bch2_btree_iter_set_pos(trans, &iter, next_pos);
+
+ this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size);
+ if (trace_io_move_finish_enabled())
+ trace_io_move_finish2(m, &new->k_i, insert);
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
ret = 0;
if (ret)
break;
next:
- while (bkey_ge(iter.pos, bch2_keylist_front(keys)->k.p)) {
- bch2_keylist_pop_front(keys);
- if (bch2_keylist_empty(keys))
+ while (bkey_ge(iter.pos, bch2_keylist_front(&op->insert_keys)->k.p)) {
+ bch2_keylist_pop_front(&op->insert_keys);
+ if (bch2_keylist_empty(&op->insert_keys))
goto out;
}
continue;
@@ -430,10 +491,7 @@ nowork:
goto next;
}
out:
- printbuf_exit(&journal_msg);
bch2_trans_iter_exit(trans, &iter);
- bch2_bkey_buf_exit(&_insert, c);
- bch2_bkey_buf_exit(&_new, c);
BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
return ret;
}
@@ -474,8 +532,9 @@ void bch2_data_update_exit(struct data_update *update)
bch2_bkey_buf_exit(&update->k, c);
}
-static int bch2_update_unwritten_extent(struct btree_trans *trans,
- struct data_update *update)
+static noinline_for_stack
+int bch2_update_unwritten_extent(struct btree_trans *trans,
+ struct data_update *update)
{
struct bch_fs *c = update->op.c;
struct bkey_i_extent *e;
@@ -587,6 +646,10 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
prt_str_indented(out, "extra replicas:\t");
prt_u64(out, data_opts->extra_replicas);
+ prt_newline(out);
+
+ prt_str_indented(out, "scrub:\t");
+ prt_u64(out, data_opts->scrub);
}
void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
@@ -607,9 +670,17 @@ void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update
prt_newline(out);
printbuf_indent_add(out, 2);
bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
- prt_printf(out, "read_done:\t%u\n", m->read_done);
- bch2_write_op_to_text(out, &m->op);
- printbuf_indent_sub(out, 2);
+
+ if (!m->read_done) {
+ prt_printf(out, "read:\n");
+ printbuf_indent_add(out, 2);
+ bch2_read_bio_to_text(out, &m->rbio);
+ } else {
+ prt_printf(out, "write:\n");
+ printbuf_indent_add(out, 2);
+ bch2_write_op_to_text(out, &m->op);
+ }
+ printbuf_indent_sub(out, 4);
}
int bch2_extent_drop_ptrs(struct btree_trans *trans,
@@ -655,18 +726,10 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
-int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
- struct bch_io_opts *io_opts)
+static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
+ struct bch_io_opts *io_opts,
+ unsigned buf_bytes)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
-
- /* write path might have to decompress data: */
- unsigned buf_bytes = 0;
- bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry)
- buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
-
unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL);
@@ -690,11 +753,26 @@ int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
return 0;
}
+int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
+ struct bch_io_opts *io_opts)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ /* write path might have to decompress data: */
+ unsigned buf_bytes = 0;
+ bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry)
+ buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
+
+ return __bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
+}
+
static int can_write_extent(struct bch_fs *c, struct data_update *m)
{
if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
- return -BCH_ERR_data_update_done_would_block;
+ return bch_err_throw(c, data_update_done_would_block);
unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
? m->op.target
@@ -704,10 +782,13 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
darray_for_each(m->op.devs_have, i)
__clear_bit(*i, devs.d);
- rcu_read_lock();
+ guard(rcu)();
+
unsigned nr_replicas = 0, i;
for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
- struct bch_dev *ca = bch2_dev_rcu(c, i);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, i);
+ if (!ca)
+ continue;
struct bch_dev_usage usage;
bch2_dev_usage_read_fast(ca, &usage);
@@ -719,12 +800,11 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
if (nr_replicas >= m->op.nr_replicas)
break;
}
- rcu_read_unlock();
if (!nr_replicas)
- return -BCH_ERR_data_update_done_no_rw_devs;
+ return bch_err_throw(c, data_update_done_no_rw_devs);
if (nr_replicas < m->op.nr_replicas)
- return -BCH_ERR_insufficient_devices;
+ return bch_err_throw(c, insufficient_devices);
return 0;
}
@@ -739,19 +819,21 @@ int bch2_data_update_init(struct btree_trans *trans,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned reserve_sectors = k.k->size * data_opts.extra_replicas;
int ret = 0;
- /*
- * fs is corrupt we have a key for a snapshot node that doesn't exist,
- * and we have to check for this because we go rw before repairing the
- * snapshots table - just skip it, we can move it later.
- */
- if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot)))
- return -BCH_ERR_data_update_done_no_snapshot;
+ if (k.k->p.snapshot) {
+ ret = bch2_check_key_has_snapshot(trans, iter, k);
+ if (bch2_err_matches(ret, BCH_ERR_recovery_will_run)) {
+ /* Can't repair yet, waiting on other recovery passes */
+ return bch_err_throw(c, data_update_done_no_snapshot);
+ }
+ if (ret < 0)
+ return ret;
+ if (ret) /* key was deleted */
+ return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ bch_err_throw(c, data_update_done_no_snapshot);
+ ret = 0;
+ }
bch2_bkey_buf_init(&m->k);
bch2_bkey_buf_reassemble(&m->k, c, k);
@@ -779,10 +861,17 @@ int bch2_data_update_init(struct btree_trans *trans,
unsigned durability_have = 0, durability_removing = 0;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned reserve_sectors = k.k->size * data_opts.extra_replicas;
+ unsigned buf_bytes = 0;
+ bool unwritten = false;
+
unsigned ptr_bit = 1;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
if (!p.ptr.cached) {
- rcu_read_lock();
+ guard(rcu)();
if (ptr_bit & m->data_opts.rewrite_ptrs) {
if (crc_is_compressed(p.crc))
reserve_sectors += k.k->size;
@@ -793,7 +882,6 @@ int bch2_data_update_init(struct btree_trans *trans,
bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
durability_have += bch2_extent_ptr_durability(c, &p);
}
- rcu_read_unlock();
}
/*
@@ -809,6 +897,9 @@ int bch2_data_update_init(struct btree_trans *trans,
if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
m->op.incompressible = true;
+ buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
+ unwritten |= p.ptr.unwritten;
+
ptr_bit <<= 1;
}
@@ -847,7 +938,7 @@ int bch2_data_update_init(struct btree_trans *trans,
if (iter)
ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts);
if (!ret)
- ret = -BCH_ERR_data_update_done_no_writes_needed;
+ ret = bch_err_throw(c, data_update_done_no_writes_needed);
goto out_bkey_buf_exit;
}
@@ -878,23 +969,25 @@ int bch2_data_update_init(struct btree_trans *trans,
}
if (!bkey_get_dev_refs(c, k)) {
- ret = -BCH_ERR_data_update_done_no_dev_refs;
+ ret = bch_err_throw(c, data_update_done_no_dev_refs);
goto out_put_disk_res;
}
if (c->opts.nocow_enabled &&
- !bkey_nocow_lock(c, ctxt, k)) {
- ret = -BCH_ERR_nocow_lock_blocked;
+ !bkey_nocow_lock(c, ctxt, ptrs)) {
+ ret = bch_err_throw(c, nocow_lock_blocked);
goto out_put_dev_refs;
}
- if (bkey_extent_is_unwritten(k)) {
+ if (unwritten) {
ret = bch2_update_unwritten_extent(trans, m) ?:
- -BCH_ERR_data_update_done_unwritten;
+ bch_err_throw(c, data_update_done_unwritten);
goto out_nocow_unlock;
}
- ret = bch2_data_update_bios_init(m, c, io_opts);
+ bch2_trans_unlock(trans);
+
+ ret = __bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
if (ret)
goto out_nocow_unlock;
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
index ed05125867da..5e14d13568de 100644
--- a/fs/bcachefs/data_update.h
+++ b/fs/bcachefs/data_update.h
@@ -50,6 +50,21 @@ struct data_update {
struct bio_vec *bvecs;
};
+struct promote_op {
+ struct rcu_head rcu;
+ u64 start_time;
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
+ unsigned list_idx;
+#endif
+
+ struct rhash_head hash;
+ struct bpos pos;
+
+ struct work_struct work;
+ struct data_update write;
+ struct bio_vec bi_inline_vecs[]; /* must be last */
+};
+
void bch2_data_update_to_text(struct printbuf *, struct data_update *);
void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *);
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 5a8bc7013512..901f643ead83 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -8,6 +8,7 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
+#include "async_objs.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_io.h"
@@ -16,6 +17,7 @@
#include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h"
+#include "data_update.h"
#include "debug.h"
#include "error.h"
#include "extents.h"
@@ -40,9 +42,10 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
struct btree_node *n_sorted = c->verify_data->data;
struct bset *sorted, *inmemory = &b->data->keys;
struct bio *bio;
- bool failed = false, saw_error = false;
+ bool failed = false;
- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
+ BCH_DEV_READ_REF_btree_verify_replicas);
if (!ca)
return false;
@@ -57,12 +60,13 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
submit_bio_wait(bio);
bio_put(bio);
- percpu_ref_put(&ca->io_ref[READ]);
+ enumerated_ref_put(&ca->io_ref[READ],
+ BCH_DEV_READ_REF_btree_verify_replicas);
memcpy(n_ondisk, n_sorted, btree_buf_bytes(b));
v->written = 0;
- if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error)
+ if (bch2_btree_node_read_done(c, ca, v, NULL, NULL))
return false;
n_sorted = c->verify_data->data;
@@ -196,7 +200,8 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
return;
}
- ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
+ BCH_DEV_READ_REF_btree_node_ondisk_to_text);
if (!ca) {
prt_printf(out, "error getting device to read from: not online\n");
return;
@@ -297,28 +302,13 @@ out:
if (bio)
bio_put(bio);
kvfree(n_ondisk);
- percpu_ref_put(&ca->io_ref[READ]);
+ enumerated_ref_put(&ca->io_ref[READ],
+ BCH_DEV_READ_REF_btree_node_ondisk_to_text);
}
#ifdef CONFIG_DEBUG_FS
-/* XXX: bch_fs refcounting */
-
-struct dump_iter {
- struct bch_fs *c;
- enum btree_id id;
- struct bpos from;
- struct bpos prev_node;
- u64 iter;
-
- struct printbuf buf;
-
- char __user *ubuf; /* destination user buffer */
- size_t size; /* size of requested read */
- ssize_t ret; /* bytes read so far */
-};
-
-static ssize_t flush_buf(struct dump_iter *i)
+ssize_t bch2_debugfs_flush_buf(struct dump_iter *i)
{
if (i->buf.pos) {
size_t bytes = min_t(size_t, i->buf.pos, i->size);
@@ -330,6 +320,11 @@ static ssize_t flush_buf(struct dump_iter *i)
i->buf.pos -= copied;
memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos);
+ if (i->buf.last_newline >= copied)
+ i->buf.last_newline -= copied;
+ if (i->buf.last_field >= copied)
+ i->buf.last_field -= copied;
+
if (copied != bytes)
return -EFAULT;
}
@@ -356,7 +351,7 @@ static int bch2_dump_open(struct inode *inode, struct file *file)
return 0;
}
-static int bch2_dump_release(struct inode *inode, struct file *file)
+int bch2_dump_release(struct inode *inode, struct file *file)
{
struct dump_iter *i = file->private_data;
@@ -374,7 +369,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
i->size = size;
i->ret = 0;
- return flush_buf(i) ?:
+ return bch2_debugfs_flush_buf(i) ?:
bch2_trans_run(i->c,
for_each_btree_key(trans, iter, i->id, i->from,
BTREE_ITER_prefetch|
@@ -383,7 +378,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
prt_newline(&i->buf);
bch2_trans_unlock(trans);
i->from = bpos_successor(iter.pos);
- flush_buf(i);
+ bch2_debugfs_flush_buf(i);
}))) ?:
i->ret;
}
@@ -404,7 +399,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
i->size = size;
i->ret = 0;
- ssize_t ret = flush_buf(i);
+ ssize_t ret = bch2_debugfs_flush_buf(i);
if (ret)
return ret;
@@ -418,7 +413,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
? bpos_successor(b->key.k.p)
: b->key.k.p;
- drop_locks_do(trans, flush_buf(i));
+ drop_locks_do(trans, bch2_debugfs_flush_buf(i));
}))) ?: i->ret;
}
@@ -438,7 +433,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
i->size = size;
i->ret = 0;
- return flush_buf(i) ?:
+ return bch2_debugfs_flush_buf(i) ?:
bch2_trans_run(i->c,
for_each_btree_key(trans, iter, i->id, i->from,
BTREE_ITER_prefetch|
@@ -456,7 +451,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
bch2_bfloat_to_text(&i->buf, l->b, _k);
bch2_trans_unlock(trans);
i->from = bpos_successor(iter.pos);
- flush_buf(i);
+ bch2_debugfs_flush_buf(i);
}))) ?:
i->ret;
}
@@ -497,6 +492,8 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *
prt_printf(out, "journal pin %px:\t%llu\n",
&b->writes[1].journal, b->writes[1].journal.seq);
+ prt_printf(out, "ob:\t%u\n", b->ob.nr);
+
printbuf_indent_sub(out, 2);
}
@@ -513,34 +510,34 @@ static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf,
i->ret = 0;
do {
- struct bucket_table *tbl;
- struct rhash_head *pos;
- struct btree *b;
-
- ret = flush_buf(i);
+ ret = bch2_debugfs_flush_buf(i);
if (ret)
return ret;
- rcu_read_lock();
i->buf.atomic++;
- tbl = rht_dereference_rcu(c->btree_cache.table.tbl,
- &c->btree_cache.table);
- if (i->iter < tbl->size) {
- rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash)
- bch2_cached_btree_node_to_text(&i->buf, c, b);
- i->iter++;
- } else {
- done = true;
+ scoped_guard(rcu) {
+ struct bucket_table *tbl =
+ rht_dereference_rcu(c->btree_cache.table.tbl,
+ &c->btree_cache.table);
+ if (i->iter < tbl->size) {
+ struct rhash_head *pos;
+ struct btree *b;
+
+ rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash)
+ bch2_cached_btree_node_to_text(&i->buf, c, b);
+ i->iter++;
+ } else {
+ done = true;
+ }
}
--i->buf.atomic;
- rcu_read_unlock();
} while (!done);
if (i->buf.allocation_failure)
ret = -ENOMEM;
if (!ret)
- ret = flush_buf(i);
+ ret = bch2_debugfs_flush_buf(i);
return ret ?: i->ret;
}
@@ -614,7 +611,7 @@ restart:
closure_put(&trans->ref);
- ret = flush_buf(i);
+ ret = bch2_debugfs_flush_buf(i);
if (ret)
goto unlocked;
@@ -627,7 +624,7 @@ unlocked:
ret = -ENOMEM;
if (!ret)
- ret = flush_buf(i);
+ ret = bch2_debugfs_flush_buf(i);
return ret ?: i->ret;
}
@@ -652,7 +649,7 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
i->ret = 0;
while (1) {
- err = flush_buf(i);
+ err = bch2_debugfs_flush_buf(i);
if (err)
return err;
@@ -695,7 +692,7 @@ static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf,
i->iter++;
}
- err = flush_buf(i);
+ err = bch2_debugfs_flush_buf(i);
if (err)
return err;
@@ -753,7 +750,7 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf,
while (1) {
struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter];
- err = flush_buf(i);
+ err = bch2_debugfs_flush_buf(i);
if (err)
return err;
@@ -770,6 +767,12 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf,
mutex_lock(&s->lock);
prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem);
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+ printbuf_indent_add(&i->buf, 2);
+ bch2_trans_kmalloc_trace_to_text(&i->buf, &s->trans_kmalloc_trace);
+ printbuf_indent_sub(&i->buf, 2);
+#endif
+
prt_printf(&i->buf, "Transaction duration:\n");
printbuf_indent_add(&i->buf, 2);
@@ -868,7 +871,7 @@ static ssize_t bch2_simple_print(struct file *file, char __user *buf,
ret = -ENOMEM;
if (!ret)
- ret = flush_buf(i);
+ ret = bch2_debugfs_flush_buf(i);
return ret ?: i->ret;
}
@@ -927,7 +930,11 @@ void bch2_fs_debug_init(struct bch_fs *c)
if (IS_ERR_OR_NULL(bch_debug))
return;
- snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
+ if (c->sb.multi_device)
+ snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
+ else
+ strscpy(name, c->name, sizeof(name));
+
c->fs_debug_dir = debugfs_create_dir(name, bch_debug);
if (IS_ERR_OR_NULL(c->fs_debug_dir))
return;
@@ -953,6 +960,8 @@ void bch2_fs_debug_init(struct bch_fs *c)
debugfs_create_file("write_points", 0400, c->fs_debug_dir,
c->btree_debug, &write_points_ops);
+ bch2_fs_async_obj_debugfs_init(c);
+
c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir);
if (IS_ERR_OR_NULL(c->btree_debug_dir))
return;
diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h
index 2c37143b5fd1..d88b1194b8ac 100644
--- a/fs/bcachefs/debug.h
+++ b/fs/bcachefs/debug.h
@@ -14,11 +14,29 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *,
static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
{
- if (bch2_verify_btree_ondisk)
+ if (static_branch_unlikely(&bch2_verify_btree_ondisk))
__bch2_btree_verify(c, b);
}
#ifdef CONFIG_DEBUG_FS
+struct dump_iter {
+ struct bch_fs *c;
+ struct async_obj_list *list;
+ enum btree_id id;
+ struct bpos from;
+ struct bpos prev_node;
+ u64 iter;
+
+ struct printbuf buf;
+
+ char __user *ubuf; /* destination user buffer */
+ size_t size; /* size of requested read */
+ ssize_t ret; /* bytes read so far */
+};
+
+ssize_t bch2_debugfs_flush_buf(struct dump_iter *);
+int bch2_dump_release(struct inode *, struct file *);
+
void bch2_fs_debug_exit(struct bch_fs *);
void bch2_fs_debug_init(struct bch_fs *);
#else
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index a51195088227..300f7cc8abdf 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -212,82 +212,83 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
struct qstr d_name = bch2_dirent_get_name(d);
- prt_printf(out, "%.*s -> ", d_name.len, d_name.name);
+ prt_printf(out, "%.*s", d_name.len, d_name.name);
+
+ if (d.v->d_casefold) {
+ struct qstr d_name = bch2_dirent_get_lookup_name(d);
+ prt_printf(out, " (casefold %.*s)", d_name.len, d_name.name);
+ }
+
+ prt_str(out, " ->");
if (d.v->d_type != DT_SUBVOL)
- prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum));
+ prt_printf(out, " %llu", le64_to_cpu(d.v->d_inum));
else
- prt_printf(out, "%u -> %u",
+ prt_printf(out, " %u -> %u",
le32_to_cpu(d.v->d_parent_subvol),
le32_to_cpu(d.v->d_child_subvol));
prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type));
}
-static struct bkey_i_dirent *dirent_alloc_key(struct btree_trans *trans,
- subvol_inum dir,
- u8 type,
- int name_len, int cf_name_len,
- u64 dst)
+int bch2_dirent_init_name(struct bkey_i_dirent *dirent,
+ const struct bch_hash_info *hash_info,
+ const struct qstr *name,
+ const struct qstr *cf_name)
{
- struct bkey_i_dirent *dirent;
- unsigned u64s = BKEY_U64s + dirent_val_u64s(name_len, cf_name_len);
+ EBUG_ON(hash_info->cf_encoding == NULL && cf_name);
+ int cf_len = 0;
- BUG_ON(u64s > U8_MAX);
+ if (name->len > BCH_NAME_MAX)
+ return -ENAMETOOLONG;
- dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
- if (IS_ERR(dirent))
- return dirent;
+ dirent->v.d_casefold = hash_info->cf_encoding != NULL;
- bkey_dirent_init(&dirent->k_i);
- dirent->k.u64s = u64s;
-
- if (type != DT_SUBVOL) {
- dirent->v.d_inum = cpu_to_le64(dst);
+ if (!dirent->v.d_casefold) {
+ memcpy(&dirent->v.d_name[0], name->name, name->len);
+ memset(&dirent->v.d_name[name->len], 0,
+ bkey_val_bytes(&dirent->k) -
+ offsetof(struct bch_dirent, d_name) -
+ name->len);
} else {
- dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol);
- dirent->v.d_child_subvol = cpu_to_le32(dst);
- }
+#ifdef CONFIG_UNICODE
+ memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len);
- dirent->v.d_type = type;
- dirent->v.d_unused = 0;
- dirent->v.d_casefold = cf_name_len ? 1 : 0;
+ char *cf_out = &dirent->v.d_cf_name_block.d_names[name->len];
- return dirent;
-}
+ if (cf_name) {
+ cf_len = cf_name->len;
-static void dirent_init_regular_name(struct bkey_i_dirent *dirent,
- const struct qstr *name)
-{
- EBUG_ON(dirent->v.d_casefold);
+ memcpy(cf_out, cf_name->name, cf_name->len);
+ } else {
+ cf_len = utf8_casefold(hash_info->cf_encoding, name,
+ cf_out,
+ bkey_val_end(bkey_i_to_s(&dirent->k_i)) - (void *) cf_out);
+ if (cf_len <= 0)
+ return cf_len;
+ }
- memcpy(&dirent->v.d_name[0], name->name, name->len);
- memset(&dirent->v.d_name[name->len], 0,
- bkey_val_bytes(&dirent->k) -
- offsetof(struct bch_dirent, d_name) -
- name->len);
-}
+ memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_len], 0,
+ bkey_val_bytes(&dirent->k) -
+ offsetof(struct bch_dirent, d_cf_name_block.d_names) -
+ name->len + cf_len);
-static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent,
- const struct qstr *name,
- const struct qstr *cf_name)
-{
- EBUG_ON(!dirent->v.d_casefold);
- EBUG_ON(!cf_name->len);
-
- dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len);
- dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_name->len);
- memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len);
- memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len);
- memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0,
- bkey_val_bytes(&dirent->k) -
- offsetof(struct bch_dirent, d_cf_name_block.d_names) -
- name->len + cf_name->len);
-
- EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_name->len);
+ dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len);
+ dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_len);
+
+ EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_len);
+#else
+ return -EOPNOTSUPP;
+#endif
+ }
+
+ unsigned u64s = dirent_val_u64s(name->len, cf_len);
+ BUG_ON(u64s > bkey_val_u64s(&dirent->k));
+ set_bkey_val_u64s(&dirent->k, u64s);
+ return 0;
}
-static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
+struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *trans,
const struct bch_hash_info *hash_info,
subvol_inum dir,
u8 type,
@@ -295,31 +296,28 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
const struct qstr *cf_name,
u64 dst)
{
- struct bkey_i_dirent *dirent;
- struct qstr _cf_name;
-
- if (name->len > BCH_NAME_MAX)
- return ERR_PTR(-ENAMETOOLONG);
+ struct bkey_i_dirent *dirent = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64));
+ if (IS_ERR(dirent))
+ return dirent;
- if (hash_info->cf_encoding && !cf_name) {
- int ret = bch2_casefold(trans, hash_info, name, &_cf_name);
- if (ret)
- return ERR_PTR(ret);
+ bkey_dirent_init(&dirent->k_i);
+ dirent->k.u64s = BKEY_U64s_MAX;
- cf_name = &_cf_name;
+ if (type != DT_SUBVOL) {
+ dirent->v.d_inum = cpu_to_le64(dst);
+ } else {
+ dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol);
+ dirent->v.d_child_subvol = cpu_to_le32(dst);
}
- dirent = dirent_alloc_key(trans, dir, type, name->len, cf_name ? cf_name->len : 0, dst);
- if (IS_ERR(dirent))
- return dirent;
+ dirent->v.d_type = type;
+ dirent->v.d_unused = 0;
- if (cf_name)
- dirent_init_casefolded_name(dirent, name, cf_name);
- else
- dirent_init_regular_name(dirent, name);
+ int ret = bch2_dirent_init_name(dirent, hash_info, name, cf_name);
+ if (ret)
+ return ERR_PTR(ret);
EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len);
-
return dirent;
}
@@ -334,7 +332,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
struct bkey_i_dirent *dirent;
int ret;
- dirent = dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum);
+ dirent = bch2_dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum);
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
@@ -358,7 +356,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
struct bkey_i_dirent *dirent;
int ret;
- dirent = dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum);
+ dirent = bch2_dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum);
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
@@ -395,8 +393,8 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
}
int bch2_dirent_rename(struct btree_trans *trans,
- subvol_inum src_dir, struct bch_hash_info *src_hash, u64 *src_dir_i_size,
- subvol_inum dst_dir, struct bch_hash_info *dst_hash, u64 *dst_dir_i_size,
+ subvol_inum src_dir, struct bch_hash_info *src_hash,
+ subvol_inum dst_dir, struct bch_hash_info *dst_hash,
const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset,
const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
enum bch_rename_mode mode)
@@ -463,8 +461,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
*src_offset = dst_iter.pos.offset;
/* Create new dst key: */
- new_dst = dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name,
- dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0);
+ new_dst = bch2_dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name,
+ dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0);
ret = PTR_ERR_OR_ZERO(new_dst);
if (ret)
goto out;
@@ -474,8 +472,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
/* Create new src key: */
if (mode == BCH_RENAME_EXCHANGE) {
- new_src = dirent_create_key(trans, src_hash, src_dir, 0, src_name,
- src_hash->cf_encoding ? &src_name_lookup : NULL, 0);
+ new_src = bch2_dirent_create_key(trans, src_hash, src_dir, 0, src_name,
+ src_hash->cf_encoding ? &src_name_lookup : NULL, 0);
ret = PTR_ERR_OR_ZERO(new_src);
if (ret)
goto out;
@@ -535,14 +533,6 @@ int bch2_dirent_rename(struct btree_trans *trans,
new_src->v.d_type == DT_SUBVOL)
new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
- if (old_dst.k)
- *dst_dir_i_size -= bkey_bytes(old_dst.k);
- *src_dir_i_size -= bkey_bytes(old_src.k);
-
- if (mode == BCH_RENAME_EXCHANGE)
- *src_dir_i_size += bkey_bytes(&new_src->k);
- *dst_dir_i_size += bkey_bytes(&new_dst->k);
-
ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
if (ret)
goto out;
@@ -649,7 +639,7 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol)
continue;
- ret = -BCH_ERR_ENOTEMPTY_dir_not_empty;
+ ret = bch_err_throw(trans->c, ENOTEMPTY_dir_not_empty);
break;
}
bch2_trans_iter_exit(trans, &iter);
@@ -685,7 +675,9 @@ static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subv
return !ret;
}
-int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
+int bch2_readdir(struct bch_fs *c, subvol_inum inum,
+ struct bch_hash_info *hash_info,
+ struct dir_context *ctx)
{
struct bkey_buf sk;
bch2_bkey_buf_init(&sk);
@@ -703,7 +695,11 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k);
subvol_inum target;
- int ret2 = bch2_dirent_read_target(trans, inum, dirent, &target);
+
+ bool need_second_pass = false;
+ int ret2 = bch2_str_hash_check_key(trans, NULL, &bch2_dirent_hash_desc,
+ hash_info, &iter, k, &need_second_pass) ?:
+ bch2_dirent_read_target(trans, inum, dirent, &target);
if (ret2 > 0)
continue;
@@ -733,7 +729,7 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
ret = bch2_inode_unpack(k, inode);
goto found;
}
- ret = -BCH_ERR_ENOENT_inode;
+ ret = bch_err_throw(trans->c, ENOENT_inode);
found:
bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
bch2_trans_iter_exit(trans, &iter);
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index d3e7ae669575..70fb0b581221 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -38,7 +38,7 @@ static inline int bch2_maybe_casefold(struct btree_trans *trans,
}
}
-struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d);
+struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent);
static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len)
{
@@ -59,6 +59,14 @@ static inline void dirent_copy_target(struct bkey_i_dirent *dst,
dst->v.d_type = src.v->d_type;
}
+int bch2_dirent_init_name(struct bkey_i_dirent *,
+ const struct bch_hash_info *,
+ const struct qstr *,
+ const struct qstr *);
+struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *,
+ const struct bch_hash_info *, subvol_inum, u8,
+ const struct qstr *, const struct qstr *, u64);
+
int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *,
@@ -80,8 +88,8 @@ enum bch_rename_mode {
};
int bch2_dirent_rename(struct btree_trans *,
- subvol_inum, struct bch_hash_info *, u64 *,
- subvol_inum, struct bch_hash_info *, u64 *,
+ subvol_inum, struct bch_hash_info *,
+ subvol_inum, struct bch_hash_info *,
const struct qstr *, subvol_inum *, u64 *,
const struct qstr *, subvol_inum *, u64 *,
enum bch_rename_mode);
@@ -95,7 +103,7 @@ u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32);
int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
-int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
+int bch2_readdir(struct bch_fs *, subvol_inum, struct bch_hash_info *, struct dir_context *);
int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos);
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index 1f0422bfae35..3d59a57a5256 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -68,23 +68,31 @@ static const char * const disk_accounting_type_strs[] = {
NULL
};
-static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos,
- s64 *d, unsigned nr)
+static inline void __accounting_key_init(struct bkey_i *k, struct bpos pos,
+ s64 *d, unsigned nr)
{
struct bkey_i_accounting *acc = bkey_accounting_init(k);
- acc->k.p = disk_accounting_pos_to_bpos(pos);
+ acc->k.p = pos;
set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr);
memcpy_u64s_small(acc->v.d, d, nr);
}
+static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos,
+ s64 *d, unsigned nr)
+{
+ return __accounting_key_init(k, disk_accounting_pos_to_bpos(pos), d, nr);
+}
+
static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos);
int bch2_disk_accounting_mod(struct btree_trans *trans,
struct disk_accounting_pos *k,
s64 *d, unsigned nr, bool gc)
{
+ BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS);
+
/* Normalize: */
switch (k->type) {
case BCH_DISK_ACCOUNTING_replicas:
@@ -92,21 +100,49 @@ int bch2_disk_accounting_mod(struct btree_trans *trans,
break;
}
- BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS);
+ struct bpos pos = disk_accounting_pos_to_bpos(k);
+
+ if (likely(!gc)) {
+ struct bkey_i_accounting *a;
+#if 0
+ for (a = btree_trans_subbuf_base(trans, &trans->accounting);
+ a != btree_trans_subbuf_top(trans, &trans->accounting);
+ a = (void *) bkey_next(&a->k_i))
+ if (bpos_eq(a->k.p, pos)) {
+ BUG_ON(nr != bch2_accounting_counters(&a->k));
+ acc_u64s(a->v.d, d, nr);
+
+ if (bch2_accounting_key_is_zero(accounting_i_to_s_c(a))) {
+ unsigned offset = (u64 *) a -
+ (u64 *) btree_trans_subbuf_base(trans, &trans->accounting);
+
+ trans->accounting.u64s -= a->k.u64s;
+ memmove_u64s_down(a,
+ bkey_next(&a->k_i),
+ trans->accounting.u64s - offset);
+ }
+ return 0;
+ }
+#endif
+ unsigned u64s = sizeof(*a) / sizeof(u64) + nr;
+ a = bch2_trans_subbuf_alloc(trans, &trans->accounting, u64s);
+ int ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ return ret;
- struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
+ __accounting_key_init(&a->k_i, pos, d, nr);
+ return 0;
+ } else {
+ struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
- accounting_key_init(&k_i.k, k, d, nr);
+ __accounting_key_init(&k_i.k, pos, d, nr);
- if (unlikely(gc)) {
int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
if (ret == -BCH_ERR_btree_insert_need_mark_replicas)
ret = drop_locks_do(trans,
bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?:
bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
return ret;
- } else {
- return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k);
}
}
@@ -287,7 +323,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc
static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p)
{
- struct bch_replicas_padded r;
+ union bch_replicas_padded r;
return accounting_to_replicas(&r.e, p)
? bch2_mark_replicas(c, &r.e)
: 0;
@@ -299,14 +335,13 @@ static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p)
*/
int bch2_accounting_update_sb(struct btree_trans *trans)
{
- for (struct jset_entry *i = trans->journal_entries;
- i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
- i = vstruct_next(i))
- if (jset_entry_is_key(i) && i->start->k.type == KEY_TYPE_accounting) {
- int ret = bch2_accounting_update_sb_one(trans->c, i->start->k.p);
- if (ret)
- return ret;
- }
+ for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
+ i != btree_trans_subbuf_top(trans, &trans->accounting);
+ i = bkey_next(i)) {
+ int ret = bch2_accounting_update_sb_one(trans->c, i->k.p);
+ if (ret)
+ return ret;
+ }
return 0;
}
@@ -355,18 +390,18 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun
err:
free_percpu(n.v[1]);
free_percpu(n.v[0]);
- return -BCH_ERR_ENOMEM_disk_accounting;
+ return bch_err_throw(c, ENOMEM_disk_accounting);
}
int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
enum bch_accounting_mode mode)
{
- struct bch_replicas_padded r;
+ union bch_replicas_padded r;
if (mode != BCH_ACCOUNTING_read &&
accounting_to_replicas(&r.e, a.k->p) &&
!bch2_replicas_marked_locked(c, &r.e))
- return -BCH_ERR_btree_insert_need_mark_replicas;
+ return bch_err_throw(c, btree_insert_need_mark_replicas);
percpu_up_read(&c->mark_lock);
percpu_down_write(&c->mark_lock);
@@ -379,12 +414,12 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a,
enum bch_accounting_mode mode)
{
- struct bch_replicas_padded r;
+ union bch_replicas_padded r;
if (mode != BCH_ACCOUNTING_read &&
accounting_to_replicas(&r.e, a.k->p) &&
!bch2_replicas_marked_locked(c, &r.e))
- return -BCH_ERR_btree_insert_need_mark_replicas;
+ return bch_err_throw(c, btree_insert_need_mark_replicas);
return __bch2_accounting_mem_insert(c, a);
}
@@ -438,10 +473,12 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage)
percpu_down_read(&c->mark_lock);
darray_for_each(acc->k, i) {
- struct {
+ union {
+ u8 bytes[struct_size_t(struct bch_replicas_usage, r.devs,
+ BCH_BKEY_PTRS_MAX)];
struct bch_replicas_usage r;
- u8 pad[BCH_BKEY_PTRS_MAX];
} u;
+ u.r.r.nr_devs = BCH_BKEY_PTRS_MAX;
if (!accounting_to_replicas(&u.r.r, i->pos))
continue;
@@ -522,7 +559,7 @@ int bch2_gc_accounting_start(struct bch_fs *c)
sizeof(u64), GFP_KERNEL);
if (!e->v[1]) {
bch2_accounting_free_counters(acc, true);
- ret = -BCH_ERR_ENOMEM_disk_accounting;
+ ret = bch_err_throw(c, ENOMEM_disk_accounting);
break;
}
}
@@ -570,11 +607,11 @@ int bch2_gc_accounting_done(struct bch_fs *c)
prt_str(&buf, "accounting mismatch for ");
bch2_accounting_key_to_text(&buf, &acc_k);
- prt_str(&buf, ": got");
+ prt_str(&buf, ":\n got");
for (unsigned j = 0; j < nr; j++)
prt_printf(&buf, " %llu", dst_v[j]);
- prt_str(&buf, " should be");
+ prt_str(&buf, "\nshould be");
for (unsigned j = 0; j < nr; j++)
prt_printf(&buf, " %llu", src_v[j]);
@@ -631,17 +668,17 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
}
static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
- struct disk_accounting_pos acc,
+ struct disk_accounting_pos *acc,
u64 *v, unsigned nr)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
int ret = 0, invalid_dev = -1;
- switch (acc.type) {
+ switch (acc->type) {
case BCH_DISK_ACCOUNTING_replicas: {
- struct bch_replicas_padded r;
- __accounting_to_replicas(&r.e, &acc);
+ union bch_replicas_padded r;
+ __accounting_to_replicas(&r.e, acc);
for (unsigned i = 0; i < r.e.nr_devs; i++)
if (r.e.devs[i] != BCH_SB_MEMBER_INVALID &&
@@ -660,7 +697,7 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
trans, accounting_replicas_not_marked,
"accounting not marked in superblock replicas\n%s",
(printbuf_reset(&buf),
- bch2_accounting_key_to_text(&buf, &acc),
+ bch2_accounting_key_to_text(&buf, acc),
buf.buf))) {
/*
* We're not RW yet and still single threaded, dropping
@@ -676,8 +713,8 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
}
case BCH_DISK_ACCOUNTING_dev_data_type:
- if (!bch2_dev_exists(c, acc.dev_data_type.dev)) {
- invalid_dev = acc.dev_data_type.dev;
+ if (!bch2_dev_exists(c, acc->dev_data_type.dev)) {
+ invalid_dev = acc->dev_data_type.dev;
goto invalid_device;
}
break;
@@ -691,16 +728,16 @@ invalid_device:
"accounting entry points to invalid device %i\n%s",
invalid_dev,
(printbuf_reset(&buf),
- bch2_accounting_key_to_text(&buf, &acc),
+ bch2_accounting_key_to_text(&buf, acc),
buf.buf))) {
for (unsigned i = 0; i < nr; i++)
v[i] = -v[i];
ret = commit_do(trans, NULL, NULL, 0,
- bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?:
+ bch2_disk_accounting_mod(trans, acc, v, nr, false)) ?:
-BCH_ERR_remove_disk_accounting_entry;
} else {
- ret = -BCH_ERR_remove_disk_accounting_entry;
+ ret = bch_err_throw(c, remove_disk_accounting_entry);
}
goto fsck_err;
}
@@ -748,7 +785,7 @@ int bch2_accounting_read(struct bch_fs *c)
if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
break;
- if (!bch2_accounting_is_mem(acc_k)) {
+ if (!bch2_accounting_is_mem(&acc_k)) {
struct disk_accounting_pos next;
memset(&next, 0, sizeof(next));
next.type = acc_k.type + 1;
@@ -770,7 +807,7 @@ int bch2_accounting_read(struct bch_fs *c)
struct disk_accounting_pos acc_k;
bpos_to_disk_accounting_pos(&acc_k, i->k->k.p);
- if (!bch2_accounting_is_mem(acc_k))
+ if (!bch2_accounting_is_mem(&acc_k))
continue;
struct bkey_s_c k = bkey_i_to_s_c(i->k);
@@ -826,7 +863,7 @@ int bch2_accounting_read(struct bch_fs *c)
*/
ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
? -BCH_ERR_remove_disk_accounting_entry
- : bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters);
+ : bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters);
if (ret == -BCH_ERR_remove_disk_accounting_entry) {
free_percpu(i->v[0]);
@@ -860,8 +897,8 @@ int bch2_accounting_read(struct bch_fs *c)
case BCH_DISK_ACCOUNTING_replicas:
fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]);
break;
- case BCH_DISK_ACCOUNTING_dev_data_type:
- rcu_read_lock();
+ case BCH_DISK_ACCOUNTING_dev_data_type: {
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev);
if (ca) {
struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type];
@@ -873,9 +910,9 @@ int bch2_accounting_read(struct bch_fs *c)
k.dev_data_type.data_type == BCH_DATA_journal)
usage->hidden += v[0] * ca->mi.bucket_size;
}
- rcu_read_unlock();
break;
}
+ }
}
preempt_enable();
fsck_err:
@@ -939,7 +976,7 @@ void bch2_verify_accounting_clean(struct bch_fs *c)
if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
break;
- if (!bch2_accounting_is_mem(acc_k)) {
+ if (!bch2_accounting_is_mem(&acc_k)) {
struct disk_accounting_pos next;
memset(&next, 0, sizeof(next));
next.type = acc_k.type + 1;
@@ -969,19 +1006,18 @@ void bch2_verify_accounting_clean(struct bch_fs *c)
case BCH_DISK_ACCOUNTING_replicas:
fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]);
break;
- case BCH_DISK_ACCOUNTING_dev_data_type: {
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
- if (!ca) {
- rcu_read_unlock();
- continue;
+ case BCH_DISK_ACCOUNTING_dev_data_type:
+ {
+ guard(rcu)(); /* scoped guard is a loop, and doesn't play nicely with continue */
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
+ if (!ca)
+ continue;
+
+ v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets);
+ v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors);
+ v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented);
}
- v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets);
- v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors);
- v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented);
- rcu_read_unlock();
-
if (memcmp(a.v->d, v, 3 * sizeof(u64))) {
struct printbuf buf = PRINTBUF;
@@ -995,7 +1031,6 @@ void bch2_verify_accounting_clean(struct bch_fs *c)
mismatch = true;
}
}
- }
0;
})));
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
index d557b99b3c0a..d61abebf3e0b 100644
--- a/fs/bcachefs/disk_accounting.h
+++ b/fs/bcachefs/disk_accounting.h
@@ -139,10 +139,10 @@ int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum
int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
void bch2_accounting_mem_gc(struct bch_fs *);
-static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc)
+static inline bool bch2_accounting_is_mem(struct disk_accounting_pos *acc)
{
- return acc.type < BCH_DISK_ACCOUNTING_TYPE_NR &&
- acc.type != BCH_DISK_ACCOUNTING_inum;
+ return acc->type < BCH_DISK_ACCOUNTING_TYPE_NR &&
+ acc->type != BCH_DISK_ACCOUNTING_inum;
}
/*
@@ -163,7 +163,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
if (gc && !acc->gc_running)
return 0;
- if (!bch2_accounting_is_mem(acc_k))
+ if (!bch2_accounting_is_mem(&acc_k))
return 0;
if (mode == BCH_ACCOUNTING_normal) {
@@ -174,17 +174,17 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
case BCH_DISK_ACCOUNTING_replicas:
fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]);
break;
- case BCH_DISK_ACCOUNTING_dev_data_type:
- rcu_read_lock();
+ case BCH_DISK_ACCOUNTING_dev_data_type: {
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
if (ca) {
this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]);
this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]);
this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]);
}
- rcu_read_unlock();
break;
}
+ }
}
unsigned idx;
@@ -259,8 +259,8 @@ static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans,
struct bkey_i_accounting *a,
unsigned commit_flags)
{
- a->k.bversion = journal_pos_to_bversion(&trans->journal_res,
- (u64 *) a - (u64 *) trans->journal_entries);
+ u64 *base = (u64 *) btree_trans_subbuf_base(trans, &trans->accounting);
+ a->k.bversion = journal_pos_to_bversion(&trans->journal_res, (u64 *) a - base);
EBUG_ON(bversion_zero(a->k.bversion));
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
index 2ca3cbf12b71..cde842ac1886 100644
--- a/fs/bcachefs/disk_groups.c
+++ b/fs/bcachefs/disk_groups.c
@@ -86,35 +86,6 @@ err:
return ret;
}
-void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
-{
- out->atomic++;
- rcu_read_lock();
-
- struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
- if (!g)
- goto out;
-
- for (unsigned i = 0; i < g->nr; i++) {
- if (i)
- prt_printf(out, " ");
-
- if (g->entries[i].deleted) {
- prt_printf(out, "[deleted]");
- continue;
- }
-
- prt_printf(out, "[parent %d devs", g->entries[i].parent);
- for_each_member_device_rcu(c, ca, &g->entries[i].devs)
- prt_printf(out, " %s", ca->name);
- prt_printf(out, "]");
- }
-
-out:
- rcu_read_unlock();
- out->atomic--;
-}
-
static void bch2_sb_disk_groups_to_text(struct printbuf *out,
struct bch_sb *sb,
struct bch_sb_field *f)
@@ -159,7 +130,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL);
if (!cpu_g)
- return -BCH_ERR_ENOMEM_disk_groups_to_cpu;
+ return bch_err_throw(c, ENOMEM_disk_groups_to_cpu);
cpu_g->nr = nr_groups;
@@ -199,36 +170,28 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
{
struct target t = target_decode(target);
- struct bch_devs_mask *devs;
- rcu_read_lock();
+ guard(rcu)();
switch (t.type) {
case TARGET_NULL:
- devs = NULL;
- break;
+ return NULL;
case TARGET_DEV: {
struct bch_dev *ca = t.dev < c->sb.nr_devices
? rcu_dereference(c->devs[t.dev])
: NULL;
- devs = ca ? &ca->self : NULL;
- break;
+ return ca ? &ca->self : NULL;
}
case TARGET_GROUP: {
struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
- devs = g && t.group < g->nr && !g->entries[t.group].deleted
+ return g && t.group < g->nr && !g->entries[t.group].deleted
? &g->entries[t.group].devs
: NULL;
- break;
}
default:
BUG();
}
-
- rcu_read_unlock();
-
- return devs;
}
bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
@@ -241,20 +204,13 @@ bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
case TARGET_DEV:
return dev == t.dev;
case TARGET_GROUP: {
- struct bch_disk_groups_cpu *g;
- const struct bch_devs_mask *m;
- bool ret;
-
- rcu_read_lock();
- g = rcu_dereference(c->disk_groups);
- m = g && t.group < g->nr && !g->entries[t.group].deleted
+ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
+ const struct bch_devs_mask *m =
+ g && t.group < g->nr && !g->entries[t.group].deleted
? &g->entries[t.group].devs
: NULL;
- ret = m ? test_bit(dev, m->d) : false;
- rcu_read_unlock();
-
- return ret;
+ return m ? test_bit(dev, m->d) : false;
}
default:
BUG();
@@ -377,54 +333,79 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
return v;
}
-void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
+static void __bch2_disk_path_to_text(struct printbuf *out, struct bch_disk_groups_cpu *g,
+ unsigned v)
{
- struct bch_disk_groups_cpu *groups;
- struct bch_disk_group_cpu *g;
- unsigned nr = 0;
u16 path[32];
-
- out->atomic++;
- rcu_read_lock();
- groups = rcu_dereference(c->disk_groups);
- if (!groups)
- goto invalid;
+ unsigned nr = 0;
while (1) {
if (nr == ARRAY_SIZE(path))
goto invalid;
- if (v >= groups->nr)
+ if (v >= (g ? g->nr : 0))
goto invalid;
- g = groups->entries + v;
+ struct bch_disk_group_cpu *e = g->entries + v;
- if (g->deleted)
+ if (e->deleted)
goto invalid;
path[nr++] = v;
- if (!g->parent)
+ if (!e->parent)
break;
- v = g->parent - 1;
+ v = e->parent - 1;
}
while (nr) {
- v = path[--nr];
- g = groups->entries + v;
+ struct bch_disk_group_cpu *e = g->entries + path[--nr];
- prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
+ prt_printf(out, "%.*s", (int) sizeof(e->label), e->label);
if (nr)
prt_printf(out, ".");
}
-out:
- rcu_read_unlock();
- out->atomic--;
return;
invalid:
prt_printf(out, "invalid label %u", v);
- goto out;
+}
+
+void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ bch2_printbuf_make_room(out, 4096);
+
+ out->atomic++;
+ guard(rcu)();
+ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
+
+ for (unsigned i = 0; i < (g ? g->nr : 0); i++) {
+ prt_printf(out, "%2u: ", i);
+
+ if (g->entries[i].deleted) {
+ prt_printf(out, "[deleted]");
+ goto next;
+ }
+
+ __bch2_disk_path_to_text(out, g, i);
+
+ prt_printf(out, " devs");
+
+ for_each_member_device_rcu(c, ca, &g->entries[i].devs)
+ prt_printf(out, " %s", ca->name);
+next:
+ prt_newline(out);
+ }
+
+ out->atomic--;
+}
+
+void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
+{
+ out->atomic++;
+ guard(rcu)();
+ __bch2_disk_path_to_text(out, rcu_dereference(c->disk_groups), v),
+ --out->atomic;
}
void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
@@ -544,32 +525,27 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
switch (t.type) {
case TARGET_NULL:
prt_printf(out, "none");
- break;
+ return;
case TARGET_DEV: {
- struct bch_dev *ca;
-
out->atomic++;
- rcu_read_lock();
- ca = t.dev < c->sb.nr_devices
+ guard(rcu)();
+ struct bch_dev *ca = t.dev < c->sb.nr_devices
? rcu_dereference(c->devs[t.dev])
: NULL;
- if (ca && percpu_ref_tryget(&ca->io_ref[READ])) {
+ if (ca && ca->disk_sb.bdev)
prt_printf(out, "/dev/%s", ca->name);
- percpu_ref_put(&ca->io_ref[READ]);
- } else if (ca) {
+ else if (ca)
prt_printf(out, "offline device %u", t.dev);
- } else {
+ else
prt_printf(out, "invalid device %u", t.dev);
- }
- rcu_read_unlock();
out->atomic--;
- break;
+ return;
}
case TARGET_GROUP:
bch2_disk_path_to_text(out, c, t.group);
- break;
+ return;
default:
BUG();
}
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index c6cb26981923..543dbba9b14f 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -16,6 +16,7 @@
#include "disk_accounting.h"
#include "disk_groups.h"
#include "ec.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "io_read.h"
#include "io_write.h"
@@ -212,7 +213,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
a->dirty_sectors,
a->stripe, s.k->p.offset,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
+ ret = bch_err_throw(c, mark_stripe);
goto err;
}
@@ -223,7 +224,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
a->dirty_sectors,
a->cached_sectors,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
+ ret = bch_err_throw(c, mark_stripe);
goto err;
}
} else {
@@ -233,7 +234,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
bucket.inode, bucket.offset, a->gen,
a->stripe,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
+ ret = bch_err_throw(c, mark_stripe);
goto err;
}
@@ -243,7 +244,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
bch2_data_type_str(a->data_type),
bch2_data_type_str(data_type),
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
+ ret = bch_err_throw(c, mark_stripe);
goto err;
}
@@ -255,7 +256,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
a->dirty_sectors,
a->cached_sectors,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
+ ret = bch_err_throw(c, mark_stripe);
goto err;
}
}
@@ -294,7 +295,7 @@ static int mark_stripe_bucket(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
if (unlikely(!ca)) {
if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite))
- ret = -BCH_ERR_mark_stripe;
+ ret = bch_err_throw(c, mark_stripe);
goto err;
}
@@ -324,7 +325,7 @@ static int mark_stripe_bucket(struct btree_trans *trans,
if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n%s",
ptr->dev,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
+ ret = bch_err_throw(c, mark_stripe);
goto err;
}
@@ -427,7 +428,7 @@ int bch2_trigger_stripe(struct btree_trans *trans,
gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
if (!gc) {
bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx);
- return -BCH_ERR_ENOMEM_mark_stripe;
+ return bch_err_throw(c, ENOMEM_mark_stripe);
}
/*
@@ -535,7 +536,8 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
}
/* XXX: this is a non-mempoolified memory allocation: */
-static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
+static int ec_stripe_buf_init(struct bch_fs *c,
+ struct ec_stripe_buf *buf,
unsigned offset, unsigned size)
{
struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
@@ -563,7 +565,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
return 0;
err:
ec_stripe_buf_exit(buf);
- return -BCH_ERR_ENOMEM_stripe_buf;
+ return bch_err_throw(c, ENOMEM_stripe_buf);
}
/* Checksumming: */
@@ -700,6 +702,9 @@ static void ec_block_endio(struct bio *bio)
struct bch_dev *ca = ec_bio->ca;
struct closure *cl = bio->bi_private;
int rw = ec_bio->rw;
+ unsigned ref = rw == READ
+ ? BCH_DEV_READ_REF_ec_block
+ : BCH_DEV_WRITE_REF_ec_block;
bch2_account_io_completion(ca, bio_data_dir(bio),
ec_bio->submit_time, !bio->bi_status);
@@ -721,7 +726,7 @@ static void ec_block_endio(struct bio *bio)
}
bio_put(&ec_bio->bio);
- percpu_ref_put(&ca->io_ref[rw]);
+ enumerated_ref_put(&ca->io_ref[rw], ref);
closure_put(cl);
}
@@ -735,8 +740,11 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
? BCH_DATA_user
: BCH_DATA_parity;
int rw = op_is_write(opf);
+ unsigned ref = rw == READ
+ ? BCH_DEV_READ_REF_ec_block
+ : BCH_DEV_WRITE_REF_ec_block;
- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw, ref);
if (!ca) {
clear_bit(idx, buf->valid);
return;
@@ -782,14 +790,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);
closure_get(cl);
- percpu_ref_get(&ca->io_ref[rw]);
+ enumerated_ref_get(&ca->io_ref[rw], ref);
submit_bio(&ec_bio->bio);
offset += b;
}
- percpu_ref_put(&ca->io_ref[rw]);
+ enumerated_ref_put(&ca->io_ref[rw], ref);
}
static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
@@ -833,7 +841,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
buf = kzalloc(sizeof(*buf), GFP_NOFS);
if (!buf)
- return -BCH_ERR_ENOMEM_ec_read_extent;
+ return bch_err_throw(c, ENOMEM_ec_read_extent);
ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
if (ret) {
@@ -854,7 +862,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
goto err;
}
- ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
+ ret = ec_stripe_buf_init(c, buf, offset, bio_sectors(&rbio->bio));
if (ret) {
msg = "-ENOMEM";
goto err;
@@ -887,7 +895,7 @@ err:
bch_err_ratelimited(c,
"error doing reconstruct read: %s\n %s", msg, msgbuf.buf);
printbuf_exit(&msgbuf);
- ret = -BCH_ERR_stripe_reconstruct;
+ ret = bch_err_throw(c, stripe_reconstruct);
goto out;
}
@@ -897,7 +905,7 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
{
if (c->gc_pos.phase != GC_PHASE_not_running &&
!genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
- return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
+ return bch_err_throw(c, ENOMEM_ec_stripe_mem_alloc);
return 0;
}
@@ -1011,14 +1019,14 @@ static void ec_stripe_delete_work(struct work_struct *work)
BCH_TRANS_COMMIT_no_enospc, ({
ec_stripe_delete(trans, lru_k.k->p.offset);
})));
- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete);
}
void bch2_do_stripe_deletes(struct bch_fs *c)
{
- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
+ if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_stripe_delete) &&
!queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete);
}
/* stripe creation: */
@@ -1122,7 +1130,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
bch2_fs_inconsistent(c, "%s", buf.buf);
printbuf_exit(&buf);
- return -BCH_ERR_erasure_coding_found_btree_node;
+ return bch_err_throw(c, erasure_coding_found_btree_node);
}
k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed);
@@ -1188,7 +1196,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
if (!ca)
- return -BCH_ERR_ENOENT_dev_not_found;
+ return bch_err_throw(c, ENOENT_dev_not_found);
struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
@@ -1246,9 +1254,10 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
unsigned block,
struct open_bucket *ob)
{
- struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE,
+ BCH_DEV_WRITE_REF_ec_bucket_zero);
if (!ca) {
- s->err = -BCH_ERR_erofs_no_writes;
+ s->err = bch_err_throw(c, erofs_no_writes);
return;
}
@@ -1262,7 +1271,7 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
ob->sectors_free,
GFP_KERNEL, 0);
- percpu_ref_put(&ca->io_ref[WRITE]);
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_ec_bucket_zero);
if (ret)
s->err = ret;
@@ -1312,7 +1321,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (ec_do_recov(c, &s->existing_stripe)) {
bch_err(c, "error creating stripe: error reading existing stripe");
- ret = -BCH_ERR_ec_block_read;
+ ret = bch_err_throw(c, ec_block_read);
goto err;
}
@@ -1338,7 +1347,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (ec_nr_failed(&s->new_stripe)) {
bch_err(c, "error creating stripe: error writing redundancy buckets");
- ret = -BCH_ERR_ec_block_write;
+ ret = bch_err_throw(c, ec_block_write);
goto err;
}
@@ -1412,15 +1421,15 @@ static void ec_stripe_create_work(struct work_struct *work)
while ((s = get_pending_stripe(c)))
ec_stripe_create(s);
- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create);
}
void bch2_ec_do_stripe_creates(struct bch_fs *c)
{
- bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create);
+ enumerated_ref_get(&c->writes, BCH_WRITE_REF_stripe_create);
if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create);
}
static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
@@ -1570,26 +1579,26 @@ static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_str
static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h)
{
struct bch_devs_mask devs = h->devs;
+ unsigned nr_devs, nr_devs_with_durability;
- rcu_read_lock();
- h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label
- ? group_to_target(h->disk_label - 1)
- : 0);
- unsigned nr_devs = dev_mask_nr(&h->devs);
+ scoped_guard(rcu) {
+ h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label
+ ? group_to_target(h->disk_label - 1)
+ : 0);
+ nr_devs = dev_mask_nr(&h->devs);
- for_each_member_device_rcu(c, ca, &h->devs)
- if (!ca->mi.durability)
- __clear_bit(ca->dev_idx, h->devs.d);
- unsigned nr_devs_with_durability = dev_mask_nr(&h->devs);
+ for_each_member_device_rcu(c, ca, &h->devs)
+ if (!ca->mi.durability)
+ __clear_bit(ca->dev_idx, h->devs.d);
+ nr_devs_with_durability = dev_mask_nr(&h->devs);
- h->blocksize = pick_blocksize(c, &h->devs);
+ h->blocksize = pick_blocksize(c, &h->devs);
- h->nr_active_devs = 0;
- for_each_member_device_rcu(c, ca, &h->devs)
- if (ca->mi.bucket_size == h->blocksize)
- h->nr_active_devs++;
-
- rcu_read_unlock();
+ h->nr_active_devs = 0;
+ for_each_member_device_rcu(c, ca, &h->devs)
+ if (ca->mi.bucket_size == h->blocksize)
+ h->nr_active_devs++;
+ }
/*
* If we only have redundancy + 1 devices, we're better off with just
@@ -1710,23 +1719,32 @@ err:
}
static int new_stripe_alloc_buckets(struct btree_trans *trans,
+ struct alloc_request *req,
struct ec_stripe_head *h, struct ec_stripe_new *s,
- enum bch_watermark watermark, struct closure *cl)
+ struct closure *cl)
{
struct bch_fs *c = trans->c;
- struct bch_devs_mask devs = h->devs;
struct open_bucket *ob;
- struct open_buckets buckets;
struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
- bool have_cache = true;
int ret = 0;
+ req->scratch_data_type = req->data_type;
+ req->scratch_ptrs = req->ptrs;
+ req->scratch_nr_replicas = req->nr_replicas;
+ req->scratch_nr_effective = req->nr_effective;
+ req->scratch_have_cache = req->have_cache;
+ req->scratch_devs_may_alloc = req->devs_may_alloc;
+
+ req->devs_may_alloc = h->devs;
+ req->have_cache = true;
+
BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity);
BUG_ON(v->nr_redundant != s->nr_parity);
/* * We bypass the sector allocator which normally does this: */
- bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
+ bitmap_and(req->devs_may_alloc.d, req->devs_may_alloc.d,
+ c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) {
/*
@@ -1736,7 +1754,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans,
* block when updating the stripe
*/
if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID)
- __clear_bit(v->ptrs[i].dev, devs.d);
+ __clear_bit(v->ptrs[i].dev, req->devs_may_alloc.d);
if (i < s->nr_data)
nr_have_data++;
@@ -1747,60 +1765,58 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans,
BUG_ON(nr_have_data > s->nr_data);
BUG_ON(nr_have_parity > s->nr_parity);
- buckets.nr = 0;
+ req->ptrs.nr = 0;
if (nr_have_parity < s->nr_parity) {
- ret = bch2_bucket_alloc_set_trans(trans, &buckets,
- &h->parity_stripe,
- &devs,
- s->nr_parity,
- &nr_have_parity,
- &have_cache, 0,
- BCH_DATA_parity,
- watermark,
- cl);
-
- open_bucket_for_each(c, &buckets, ob, i) {
+ req->nr_replicas = s->nr_parity;
+ req->nr_effective = nr_have_parity;
+ req->data_type = BCH_DATA_parity;
+
+ ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe, cl);
+
+ open_bucket_for_each(c, &req->ptrs, ob, i) {
j = find_next_zero_bit(s->blocks_gotten,
s->nr_data + s->nr_parity,
s->nr_data);
BUG_ON(j >= s->nr_data + s->nr_parity);
- s->blocks[j] = buckets.v[i];
+ s->blocks[j] = req->ptrs.v[i];
v->ptrs[j] = bch2_ob_ptr(c, ob);
__set_bit(j, s->blocks_gotten);
}
if (ret)
- return ret;
+ goto err;
}
- buckets.nr = 0;
+ req->ptrs.nr = 0;
if (nr_have_data < s->nr_data) {
- ret = bch2_bucket_alloc_set_trans(trans, &buckets,
- &h->block_stripe,
- &devs,
- s->nr_data,
- &nr_have_data,
- &have_cache, 0,
- BCH_DATA_user,
- watermark,
- cl);
-
- open_bucket_for_each(c, &buckets, ob, i) {
+ req->nr_replicas = s->nr_data;
+ req->nr_effective = nr_have_data;
+ req->data_type = BCH_DATA_user;
+
+ ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe, cl);
+
+ open_bucket_for_each(c, &req->ptrs, ob, i) {
j = find_next_zero_bit(s->blocks_gotten,
s->nr_data, 0);
BUG_ON(j >= s->nr_data);
- s->blocks[j] = buckets.v[i];
+ s->blocks[j] = req->ptrs.v[i];
v->ptrs[j] = bch2_ob_ptr(c, ob);
__set_bit(j, s->blocks_gotten);
}
if (ret)
- return ret;
+ goto err;
}
-
- return 0;
+err:
+ req->data_type = req->scratch_data_type;
+ req->ptrs = req->scratch_ptrs;
+ req->nr_replicas = req->scratch_nr_replicas;
+ req->nr_effective = req->scratch_nr_effective;
+ req->have_cache = req->scratch_have_cache;
+ req->devs_may_alloc = req->scratch_devs_may_alloc;
+ return ret;
}
static int __get_existing_stripe(struct btree_trans *trans,
@@ -1850,7 +1866,7 @@ static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new
s->nr_data = existing_v->nr_blocks -
existing_v->nr_redundant;
- int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors));
+ int ret = ec_stripe_buf_init(c, &s->existing_stripe, 0, le16_to_cpu(existing_v->sectors));
if (ret) {
bch2_stripe_close(c, s);
return ret;
@@ -1910,7 +1926,7 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri
}
bch2_trans_iter_exit(trans, &lru_iter);
if (!ret)
- ret = -BCH_ERR_stripe_alloc_blocked;
+ ret = bch_err_throw(c, stripe_alloc_blocked);
if (ret == 1)
ret = 0;
if (ret)
@@ -1951,7 +1967,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st
continue;
}
- ret = -BCH_ERR_ENOSPC_stripe_create;
+ ret = bch_err_throw(c, ENOSPC_stripe_create);
break;
}
@@ -1981,17 +1997,15 @@ err:
}
struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
- unsigned target,
+ struct alloc_request *req,
unsigned algo,
- unsigned redundancy,
- enum bch_watermark watermark,
struct closure *cl)
{
struct bch_fs *c = trans->c;
- struct ec_stripe_head *h;
- bool waiting = false;
+ unsigned redundancy = req->nr_replicas - 1;
unsigned disk_label = 0;
- struct target t = target_decode(target);
+ struct target t = target_decode(req->target);
+ bool waiting = false;
int ret;
if (t.type == TARGET_GROUP) {
@@ -2002,14 +2016,16 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
disk_label = t.group + 1; /* 0 == no label */
}
- h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark);
+ struct ec_stripe_head *h =
+ __bch2_ec_stripe_head_get(trans, disk_label, algo,
+ redundancy, req->watermark);
if (IS_ERR_OR_NULL(h))
return h;
if (!h->s) {
h->s = ec_new_stripe_alloc(c, h);
if (!h->s) {
- ret = -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
+ ret = bch_err_throw(c, ENOMEM_ec_new_stripe_alloc);
bch_err(c, "failed to allocate new stripe");
goto err;
}
@@ -2026,8 +2042,12 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
goto alloc_existing;
/* First, try to allocate a full stripe: */
- ret = new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?:
+ enum bch_watermark saved_watermark = BCH_WATERMARK_stripe;
+ swap(req->watermark, saved_watermark);
+ ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?:
__bch2_ec_stripe_head_reserve(trans, h, s);
+ swap(req->watermark, saved_watermark);
+
if (!ret)
goto allocate_buf;
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
@@ -2045,8 +2065,8 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
goto err;
- if (watermark == BCH_WATERMARK_copygc) {
- ret = new_stripe_alloc_buckets(trans, h, s, watermark, NULL) ?:
+ if (req->watermark == BCH_WATERMARK_copygc) {
+ ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?:
__bch2_ec_stripe_head_reserve(trans, h, s);
if (ret)
goto err;
@@ -2065,12 +2085,12 @@ alloc_existing:
* Retry allocating buckets, with the watermark for this
* particular write:
*/
- ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl);
+ ret = new_stripe_alloc_buckets(trans, req, h, s, cl);
if (ret)
goto err;
allocate_buf:
- ret = ec_stripe_buf_init(&s->new_stripe, 0, h->blocksize);
+ ret = ec_stripe_buf_init(c, &s->new_stripe, 0, h->blocksize);
if (ret)
goto err;
@@ -2087,23 +2107,18 @@ err:
/* device removal */
-static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a)
+int bch2_invalidate_stripe_to_dev(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ unsigned dev_idx,
+ unsigned flags)
{
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert);
-
- if (!a->stripe)
+ if (k.k->type != KEY_TYPE_stripe)
return 0;
- if (a->stripe_sectors) {
- bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data");
- return -BCH_ERR_invalidate_stripe_to_dev;
- }
-
- struct btree_iter iter;
+ struct bch_fs *c = trans->c;
struct bkey_i_stripe *s =
- bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
- BTREE_ITER_slots, stripe);
+ bch2_bkey_make_mut_typed(trans, iter, &k, 0, stripe);
int ret = PTR_ERR_OR_ZERO(s);
if (ret)
return ret;
@@ -2120,12 +2135,30 @@ static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_
acc.replicas.data_type = BCH_DATA_user;
ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
if (ret)
- goto err;
+ return ret;
struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i));
- bkey_for_each_ptr(ptrs, ptr)
- if (ptr->dev == k_a.k->p.inode)
- ptr->dev = BCH_SB_MEMBER_INVALID;
+
+ /* XXX: how much redundancy do we still have? check degraded flags */
+
+ unsigned nr_good = 0;
+
+ scoped_guard(rcu)
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (ptr->dev == dev_idx)
+ ptr->dev = BCH_SB_MEMBER_INVALID;
+
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed;
+ }
+
+ if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED))
+ return bch_err_throw(c, remove_would_lose_data);
+
+ unsigned nr_data = s->v.nr_blocks - s->v.nr_redundant;
+
+ if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST))
+ return bch_err_throw(c, remove_would_lose_data);
sectors = -sectors;
@@ -2133,23 +2166,48 @@ static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_
acc.type = BCH_DISK_ACCOUNTING_replicas;
bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
acc.replicas.data_type = BCH_DATA_user;
- ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
+ return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
+}
+
+static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, struct bkey_s_c k_a,
+ unsigned flags)
+{
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert);
+
+ if (!a->stripe)
+ return 0;
+
+ if (a->stripe_sectors) {
+ struct bch_fs *c = trans->c;
+ bch_err(c, "trying to invalidate device in stripe when bucket has stripe data");
+ return bch_err_throw(c, invalidate_stripe_to_dev);
+ }
+
+ struct btree_iter iter;
+ struct bkey_s_c_stripe s =
+ bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
+ BTREE_ITER_slots, stripe);
+ int ret = bkey_err(s);
if (ret)
- goto err;
-err:
+ return ret;
+
+ ret = bch2_invalidate_stripe_to_dev(trans, &iter, s.s_c, k_a.k->p.inode, flags);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
-int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx)
+int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx, unsigned flags)
{
- return bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_max_commit(trans, iter,
BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX),
BTREE_ITER_intent, k,
NULL, NULL, 0, ({
- bch2_invalidate_stripe_to_dev(trans, k);
+ bch2_invalidate_stripe_to_dev_from_alloc(trans, k, flags);
})));
+ bch_err_fn(c, ret);
+ return ret;
}
/* startup/shutdown */
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index 51893e1ee874..548048adf0d5 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -255,9 +255,10 @@ void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int);
int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *);
+
+struct alloc_request;
struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
- unsigned, unsigned, unsigned,
- enum bch_watermark, struct closure *);
+ struct alloc_request *, unsigned, struct closure *);
void bch2_do_stripe_deletes(struct bch_fs *);
void bch2_ec_do_stripe_creates(struct bch_fs *);
@@ -287,7 +288,9 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
}
}
-int bch2_dev_remove_stripes(struct bch_fs *, unsigned);
+int bch2_invalidate_stripe_to_dev(struct btree_trans *, struct btree_iter *,
+ struct bkey_s_c, unsigned, unsigned);
+int bch2_dev_remove_stripes(struct bch_fs *, unsigned, unsigned);
void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
void bch2_fs_ec_stop(struct bch_fs *);
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
index 06144bfd9c19..809446c78951 100644
--- a/fs/bcachefs/ec_types.h
+++ b/fs/bcachefs/ec_types.h
@@ -4,9 +4,10 @@
#include "bcachefs_format.h"
-struct bch_replicas_padded {
+union bch_replicas_padded {
+ u8 bytes[struct_size_t(struct bch_replicas_entry_v1,
+ devs, BCH_BKEY_PTRS_MAX)];
struct bch_replicas_entry_v1 e;
- u8 pad[BCH_BKEY_PTRS_MAX];
};
struct stripe {
@@ -28,7 +29,7 @@ struct gc_stripe {
u16 block_sectors[BCH_BKEY_PTRS_MAX];
struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX];
- struct bch_replicas_padded r;
+ union bch_replicas_padded r;
};
#endif /* _BCACHEFS_EC_TYPES_H */
diff --git a/fs/bcachefs/enumerated_ref.c b/fs/bcachefs/enumerated_ref.c
new file mode 100644
index 000000000000..56ab430f209f
--- /dev/null
+++ b/fs/bcachefs/enumerated_ref.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "enumerated_ref.h"
+#include "util.h"
+
+#include <linux/completion.h>
+
+#ifdef ENUMERATED_REF_DEBUG
+void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx)
+{
+ BUG_ON(idx >= ref->nr);
+ atomic_long_inc(&ref->refs[idx]);
+}
+
+bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
+{
+ BUG_ON(idx >= ref->nr);
+ return atomic_long_inc_not_zero(&ref->refs[idx]);
+}
+
+bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
+{
+ BUG_ON(idx >= ref->nr);
+ return !ref->dying &&
+ atomic_long_inc_not_zero(&ref->refs[idx]);
+}
+
+void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx)
+{
+ BUG_ON(idx >= ref->nr);
+ long v = atomic_long_dec_return(&ref->refs[idx]);
+
+ BUG_ON(v < 0);
+ if (v)
+ return;
+
+ for (unsigned i = 0; i < ref->nr; i++)
+ if (atomic_long_read(&ref->refs[i]))
+ return;
+
+ if (ref->stop_fn)
+ ref->stop_fn(ref);
+ complete(&ref->stop_complete);
+}
+#endif
+
+#ifndef ENUMERATED_REF_DEBUG
+static void enumerated_ref_kill_cb(struct percpu_ref *percpu_ref)
+{
+ struct enumerated_ref *ref =
+ container_of(percpu_ref, struct enumerated_ref, ref);
+
+ if (ref->stop_fn)
+ ref->stop_fn(ref);
+ complete(&ref->stop_complete);
+}
+#endif
+
+void enumerated_ref_stop_async(struct enumerated_ref *ref)
+{
+ reinit_completion(&ref->stop_complete);
+
+#ifndef ENUMERATED_REF_DEBUG
+ percpu_ref_kill(&ref->ref);
+#else
+ ref->dying = true;
+ for (unsigned i = 0; i < ref->nr; i++)
+ enumerated_ref_put(ref, i);
+#endif
+}
+
+void enumerated_ref_stop(struct enumerated_ref *ref,
+ const char * const names[])
+{
+ enumerated_ref_stop_async(ref);
+ while (!wait_for_completion_timeout(&ref->stop_complete, HZ * 10)) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "Waited for 10 seconds to shutdown enumerated ref\n");
+ prt_str(&buf, "Outstanding refs:\n");
+ enumerated_ref_to_text(&buf, ref, names);
+ printk(KERN_ERR "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+}
+
+void enumerated_ref_start(struct enumerated_ref *ref)
+{
+#ifndef ENUMERATED_REF_DEBUG
+ percpu_ref_reinit(&ref->ref);
+#else
+ ref->dying = false;
+ for (unsigned i = 0; i < ref->nr; i++) {
+ BUG_ON(atomic_long_read(&ref->refs[i]));
+ atomic_long_inc(&ref->refs[i]);
+ }
+#endif
+}
+
+void enumerated_ref_exit(struct enumerated_ref *ref)
+{
+#ifndef ENUMERATED_REF_DEBUG
+ percpu_ref_exit(&ref->ref);
+#else
+ kfree(ref->refs);
+ ref->refs = NULL;
+ ref->nr = 0;
+#endif
+}
+
+int enumerated_ref_init(struct enumerated_ref *ref, unsigned nr,
+ void (*stop_fn)(struct enumerated_ref *))
+{
+ init_completion(&ref->stop_complete);
+ ref->stop_fn = stop_fn;
+
+#ifndef ENUMERATED_REF_DEBUG
+ return percpu_ref_init(&ref->ref, enumerated_ref_kill_cb,
+ PERCPU_REF_INIT_DEAD, GFP_KERNEL);
+#else
+ ref->refs = kzalloc(sizeof(ref->refs[0]) * nr, GFP_KERNEL);
+ if (!ref->refs)
+ return -ENOMEM;
+
+ ref->nr = nr;
+ return 0;
+#endif
+}
+
+void enumerated_ref_to_text(struct printbuf *out,
+ struct enumerated_ref *ref,
+ const char * const names[])
+{
+#ifdef ENUMERATED_REF_DEBUG
+ bch2_printbuf_tabstop_push(out, 32);
+
+ for (unsigned i = 0; i < ref->nr; i++)
+ prt_printf(out, "%s\t%li\n", names[i],
+ atomic_long_read(&ref->refs[i]));
+#else
+ prt_str(out, "(not in debug mode)\n");
+#endif
+}
diff --git a/fs/bcachefs/enumerated_ref.h b/fs/bcachefs/enumerated_ref.h
new file mode 100644
index 000000000000..ec01cf59ef80
--- /dev/null
+++ b/fs/bcachefs/enumerated_ref.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ENUMERATED_REF_H
+#define _BCACHEFS_ENUMERATED_REF_H
+
+#include "enumerated_ref_types.h"
+
+/*
+ * A refcount where the users are enumerated: in debug mode, we create sepate
+ * refcounts for each user, to make leaks and refcount errors easy to track
+ * down:
+ */
+
+#ifdef ENUMERATED_REF_DEBUG
+void enumerated_ref_get(struct enumerated_ref *, unsigned);
+bool __enumerated_ref_tryget(struct enumerated_ref *, unsigned);
+bool enumerated_ref_tryget(struct enumerated_ref *, unsigned);
+void enumerated_ref_put(struct enumerated_ref *, unsigned);
+#else
+
+static inline void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx)
+{
+ percpu_ref_get(&ref->ref);
+}
+
+static inline bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
+{
+ return percpu_ref_tryget(&ref->ref);
+}
+
+static inline bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
+{
+ return percpu_ref_tryget_live(&ref->ref);
+}
+
+static inline void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx)
+{
+ percpu_ref_put(&ref->ref);
+}
+#endif
+
+static inline bool enumerated_ref_is_zero(struct enumerated_ref *ref)
+{
+#ifndef ENUMERATED_REF_DEBUG
+ return percpu_ref_is_zero(&ref->ref);
+#else
+ for (unsigned i = 0; i < ref->nr; i++)
+ if (atomic_long_read(&ref->refs[i]))
+ return false;
+ return true;
+#endif
+}
+
+void enumerated_ref_stop_async(struct enumerated_ref *);
+void enumerated_ref_stop(struct enumerated_ref *, const char * const[]);
+void enumerated_ref_start(struct enumerated_ref *);
+
+void enumerated_ref_exit(struct enumerated_ref *);
+int enumerated_ref_init(struct enumerated_ref *, unsigned,
+ void (*stop_fn)(struct enumerated_ref *));
+
+struct printbuf;
+void enumerated_ref_to_text(struct printbuf *,
+ struct enumerated_ref *,
+ const char * const[]);
+
+#endif /* _BCACHEFS_ENUMERATED_REF_H */
diff --git a/fs/bcachefs/enumerated_ref_types.h b/fs/bcachefs/enumerated_ref_types.h
new file mode 100644
index 000000000000..0e6076f466d3
--- /dev/null
+++ b/fs/bcachefs/enumerated_ref_types.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ENUMERATED_REF_TYPES_H
+#define _BCACHEFS_ENUMERATED_REF_TYPES_H
+
+#include <linux/percpu-refcount.h>
+
+struct enumerated_ref {
+#ifdef ENUMERATED_REF_DEBUG
+ unsigned nr;
+ bool dying;
+ atomic_long_t *refs;
+#else
+ struct percpu_ref ref;
+#endif
+ void (*stop_fn)(struct enumerated_ref *);
+ struct completion stop_complete;
+};
+
+#endif /* _BCACHEFS_ENUMERATED_REF_TYPES_H */
diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c
index 43557bebd0f8..c39cf304c681 100644
--- a/fs/bcachefs/errcode.c
+++ b/fs/bcachefs/errcode.c
@@ -13,12 +13,13 @@ static const char * const bch2_errcode_strs[] = {
NULL
};
-static unsigned bch2_errcode_parents[] = {
+static const unsigned bch2_errcode_parents[] = {
#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
BCH_ERRCODES()
#undef x
};
+__attribute__((const))
const char *bch2_err_str(int err)
{
const char *errstr;
@@ -36,6 +37,7 @@ const char *bch2_err_str(int err)
return errstr ?: "(Invalid error)";
}
+__attribute__((const))
bool __bch2_err_matches(int err, int class)
{
err = abs(err);
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index d9ebffa5b3a2..ac3264134a15 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -53,6 +53,7 @@
x(ENOMEM, ENOMEM_dio_write_bioset_init) \
x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \
x(ENOMEM, ENOMEM_promote_table_init) \
+ x(ENOMEM, ENOMEM_async_obj_init) \
x(ENOMEM, ENOMEM_compression_bounce_read_init) \
x(ENOMEM, ENOMEM_compression_bounce_write_init) \
x(ENOMEM, ENOMEM_compression_workspace_init) \
@@ -174,16 +175,19 @@
x(0, backpointer_to_overwritten_btree_node) \
x(0, journal_reclaim_would_deadlock) \
x(EINVAL, fsck) \
+ x(BCH_ERR_fsck, fsck_ask) \
x(BCH_ERR_fsck, fsck_fix) \
x(BCH_ERR_fsck, fsck_delete_bkey) \
x(BCH_ERR_fsck, fsck_ignore) \
x(BCH_ERR_fsck, fsck_errors_not_fixed) \
x(BCH_ERR_fsck, fsck_repair_unimplemented) \
x(BCH_ERR_fsck, fsck_repair_impossible) \
- x(EINVAL, restart_recovery) \
- x(EINVAL, not_in_recovery) \
- x(EINVAL, cannot_rewind_recovery) \
+ x(EINVAL, recovery_will_run) \
+ x(BCH_ERR_recovery_will_run, restart_recovery) \
+ x(BCH_ERR_recovery_will_run, cannot_rewind_recovery) \
+ x(BCH_ERR_recovery_will_run, recovery_pass_will_run) \
x(0, data_update_done) \
+ x(0, bkey_was_deleted) \
x(BCH_ERR_data_update_done, data_update_done_would_block) \
x(BCH_ERR_data_update_done, data_update_done_unwritten) \
x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \
@@ -201,6 +205,7 @@
x(EINVAL, device_has_been_removed) \
x(EINVAL, device_splitbrain) \
x(EINVAL, device_already_online) \
+ x(EINVAL, filesystem_uuid_already_open) \
x(EINVAL, insufficient_devices_to_start) \
x(EINVAL, invalid) \
x(EINVAL, internal_fsck_err) \
@@ -209,8 +214,11 @@
x(EINVAL, remove_would_lose_data) \
x(EINVAL, no_resize_with_buckets_nouse) \
x(EINVAL, inode_unpack_error) \
+ x(EINVAL, inode_not_unlinked) \
+ x(EINVAL, inode_has_child_snapshot) \
x(EINVAL, varint_decode_error) \
x(EINVAL, erasure_coding_found_btree_node) \
+ x(EINVAL, option_negative) \
x(EOPNOTSUPP, may_not_use_incompat_feature) \
x(EROFS, erofs_trans_commit) \
x(EROFS, erofs_no_writes) \
@@ -219,6 +227,8 @@
x(EROFS, erofs_unfixed_errors) \
x(EROFS, erofs_norecovery) \
x(EROFS, erofs_nochanges) \
+ x(EROFS, erofs_no_alloc_info) \
+ x(EROFS, erofs_filesystem_full) \
x(EROFS, insufficient_devices) \
x(0, operation_blocked) \
x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \
@@ -352,9 +362,11 @@ enum bch_errcode {
BCH_ERR_MAX
};
-const char *bch2_err_str(int);
-bool __bch2_err_matches(int, int);
+__attribute__((const)) const char *bch2_err_str(int);
+__attribute__((const)) bool __bch2_err_matches(int, int);
+
+__attribute__((const))
static inline bool _bch2_err_matches(int err, int class)
{
return err < 0 && __bch2_err_matches(err, class);
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 6b8695b1349c..63951e293c47 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -11,12 +11,12 @@
#define FSCK_ERR_RATELIMIT_NR 10
-void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out)
+void __bch2_log_msg_start(const char *fs_or_dev_name, struct printbuf *out)
{
printbuf_indent_add_nextline(out, 2);
#ifdef BCACHEFS_LOG_PREFIX
- prt_printf(out, bch2_log_msg(c, ""));
+ prt_printf(out, "bcachefs (%s): ", fs_or_dev_name);
#endif
}
@@ -29,12 +29,10 @@ bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out)
return false;
case BCH_ON_ERROR_fix_safe:
case BCH_ON_ERROR_ro:
- if (bch2_fs_emergency_read_only(c))
- prt_printf(out, "inconsistency detected - emergency read only at journal seq %llu\n",
- journal_cur_seq(&c->journal));
+ bch2_fs_emergency_read_only2(c, out);
return true;
case BCH_ON_ERROR_panic:
- bch2_print_string_as_lines_nonblocking(KERN_ERR, out->buf);
+ bch2_print_str(c, KERN_ERR, out->buf);
panic(bch2_fmt(c, "panic after error"));
return true;
default:
@@ -71,7 +69,7 @@ static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *tra
if (trans)
bch2_trans_updates_to_text(&buf, trans);
bool ret = __bch2_inconsistent_error(c, &buf);
- bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf);
+ bch2_print_str_nonblocking(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
return ret;
@@ -100,12 +98,12 @@ int __bch2_topology_error(struct bch_fs *c, struct printbuf *out)
prt_printf(out, "btree topology error: ");
set_bit(BCH_FS_topology_error, &c->flags);
- if (!test_bit(BCH_FS_recovery_running, &c->flags)) {
+ if (!test_bit(BCH_FS_in_recovery, &c->flags)) {
__bch2_inconsistent_error(c, out);
- return -BCH_ERR_btree_need_topology_repair;
+ return bch_err_throw(c, btree_need_topology_repair);
} else {
- return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?:
- -BCH_ERR_btree_node_read_validate_error;
+ return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?:
+ bch_err_throw(c, btree_node_read_validate_error);
}
}
@@ -121,7 +119,7 @@ int bch2_fs_topology_error(struct bch_fs *c, const char *fmt, ...)
va_end(args);
int ret = __bch2_topology_error(c, &buf);
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
return ret;
@@ -151,14 +149,17 @@ void bch2_io_error_work(struct work_struct *work)
bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
BCH_FORCE_IF_DEGRADED);
+ struct printbuf buf = PRINTBUF;
+ __bch2_log_msg_start(ca->name, &buf);
- bch_err(ca,
- "writes erroring for %u seconds, setting %s ro",
+ prt_printf(&buf, "writes erroring for %u seconds, setting %s ro",
c->opts.write_error_timeout,
dev ? "device" : "filesystem");
if (!dev)
- bch2_fs_emergency_read_only(c);
+ bch2_fs_emergency_read_only2(c, &buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
}
out:
up_write(&c->state_lock);
@@ -328,7 +329,7 @@ static int do_fsck_ask_yn(struct bch_fs *c,
if (bch2_fs_stdio_redirect(c))
bch2_print(c, "%s", question->buf);
else
- bch2_print_string_as_lines(KERN_ERR, question->buf);
+ bch2_print_str(c, KERN_ERR, question->buf);
int ask = bch2_fsck_ask_yn(c, trans);
@@ -376,15 +377,63 @@ static struct fsck_err_state *count_fsck_err_locked(struct bch_fs *c,
return s;
}
-void __bch2_count_fsck_err(struct bch_fs *c,
- enum bch_sb_error_id id, const char *msg,
- bool *repeat, bool *print, bool *suppress)
+bool __bch2_count_fsck_err(struct bch_fs *c,
+ enum bch_sb_error_id id, struct printbuf *msg)
{
bch2_sb_error_count(c, id);
mutex_lock(&c->fsck_error_msgs_lock);
- count_fsck_err_locked(c, id, msg, repeat, print, suppress);
+ bool print = true, repeat = false, suppress = false;
+
+ count_fsck_err_locked(c, id, msg->buf, &repeat, &print, &suppress);
mutex_unlock(&c->fsck_error_msgs_lock);
+
+ if (suppress)
+ prt_printf(msg, "Ratelimiting new instances of previous error\n");
+
+ return print && !repeat;
+}
+
+int bch2_fsck_err_opt(struct bch_fs *c,
+ enum bch_fsck_flags flags,
+ enum bch_sb_error_id err)
+{
+ if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
+ flags |= fsck_flags_extra[err];
+
+ if (test_bit(BCH_FS_in_fsck, &c->flags)) {
+ if (!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE)))
+ return bch_err_throw(c, fsck_repair_unimplemented);
+
+ switch (c->opts.fix_errors) {
+ case FSCK_FIX_exit:
+ return bch_err_throw(c, fsck_errors_not_fixed);
+ case FSCK_FIX_yes:
+ if (flags & FSCK_CAN_FIX)
+ return bch_err_throw(c, fsck_fix);
+ fallthrough;
+ case FSCK_FIX_no:
+ if (flags & FSCK_CAN_IGNORE)
+ return bch_err_throw(c, fsck_ignore);
+ return bch_err_throw(c, fsck_errors_not_fixed);
+ case FSCK_FIX_ask:
+ if (flags & FSCK_AUTOFIX)
+ return bch_err_throw(c, fsck_fix);
+ return bch_err_throw(c, fsck_ask);
+ default:
+ BUG();
+ }
+ } else {
+ if ((flags & FSCK_AUTOFIX) &&
+ (c->opts.errors == BCH_ON_ERROR_continue ||
+ c->opts.errors == BCH_ON_ERROR_fix_safe))
+ return bch_err_throw(c, fsck_fix);
+
+ if (c->opts.errors == BCH_ON_ERROR_continue &&
+ (flags & FSCK_CAN_IGNORE))
+ return bch_err_throw(c, fsck_ignore);
+ return bch_err_throw(c, fsck_errors_not_fixed);
+ }
}
int __bch2_fsck_err(struct bch_fs *c,
@@ -395,7 +444,7 @@ int __bch2_fsck_err(struct bch_fs *c,
{
va_list args;
struct printbuf buf = PRINTBUF, *out = &buf;
- int ret = -BCH_ERR_fsck_ignore;
+ int ret = 0;
const char *action_orig = "fix?", *action = action_orig;
might_sleep();
@@ -425,8 +474,8 @@ int __bch2_fsck_err(struct bch_fs *c,
if (test_bit(err, c->sb.errors_silent))
return flags & FSCK_CAN_FIX
- ? -BCH_ERR_fsck_fix
- : -BCH_ERR_fsck_ignore;
+ ? bch_err_throw(c, fsck_fix)
+ : bch_err_throw(c, fsck_ignore);
printbuf_indent_add_nextline(out, 2);
@@ -468,14 +517,14 @@ int __bch2_fsck_err(struct bch_fs *c,
prt_str(out, ", ");
if (flags & FSCK_CAN_FIX) {
prt_actioning(out, action);
- ret = -BCH_ERR_fsck_fix;
+ ret = bch_err_throw(c, fsck_fix);
} else {
prt_str(out, ", continuing");
- ret = -BCH_ERR_fsck_ignore;
+ ret = bch_err_throw(c, fsck_ignore);
}
goto print;
- } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
+ } else if (!test_bit(BCH_FS_in_fsck, &c->flags)) {
if (c->opts.errors != BCH_ON_ERROR_continue ||
!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
prt_str_indented(out, ", shutting down\n"
@@ -483,18 +532,18 @@ int __bch2_fsck_err(struct bch_fs *c,
"run fsck, and forward to devs so error can be marked for self-healing");
inconsistent = true;
print = true;
- ret = -BCH_ERR_fsck_errors_not_fixed;
+ ret = bch_err_throw(c, fsck_errors_not_fixed);
} else if (flags & FSCK_CAN_FIX) {
prt_str(out, ", ");
prt_actioning(out, action);
- ret = -BCH_ERR_fsck_fix;
+ ret = bch_err_throw(c, fsck_fix);
} else {
prt_str(out, ", continuing");
- ret = -BCH_ERR_fsck_ignore;
+ ret = bch_err_throw(c, fsck_ignore);
}
} else if (c->opts.fix_errors == FSCK_FIX_exit) {
prt_str(out, ", exiting");
- ret = -BCH_ERR_fsck_errors_not_fixed;
+ ret = bch_err_throw(c, fsck_errors_not_fixed);
} else if (flags & FSCK_CAN_FIX) {
int fix = s && s->fix
? s->fix
@@ -513,30 +562,37 @@ int __bch2_fsck_err(struct bch_fs *c,
: FSCK_FIX_yes;
ret = ret & 1
- ? -BCH_ERR_fsck_fix
- : -BCH_ERR_fsck_ignore;
+ ? bch_err_throw(c, fsck_fix)
+ : bch_err_throw(c, fsck_ignore);
} else if (fix == FSCK_FIX_yes ||
(c->opts.nochanges &&
!(flags & FSCK_CAN_IGNORE))) {
prt_str(out, ", ");
prt_actioning(out, action);
- ret = -BCH_ERR_fsck_fix;
+ ret = bch_err_throw(c, fsck_fix);
} else {
prt_str(out, ", not ");
prt_actioning(out, action);
+ ret = bch_err_throw(c, fsck_ignore);
+ }
+ } else {
+ if (flags & FSCK_CAN_IGNORE) {
+ prt_str(out, ", continuing");
+ ret = bch_err_throw(c, fsck_ignore);
+ } else {
+ prt_str(out, " (repair unimplemented)");
+ ret = bch_err_throw(c, fsck_repair_unimplemented);
}
- } else if (!(flags & FSCK_CAN_IGNORE)) {
- prt_str(out, " (repair unimplemented)");
}
- if (ret == -BCH_ERR_fsck_ignore &&
+ if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) &&
(c->opts.fix_errors == FSCK_FIX_exit ||
!(flags & FSCK_CAN_IGNORE)))
- ret = -BCH_ERR_fsck_errors_not_fixed;
+ ret = bch_err_throw(c, fsck_errors_not_fixed);
- if (test_bit(BCH_FS_fsck_running, &c->flags) &&
- (ret != -BCH_ERR_fsck_fix &&
- ret != -BCH_ERR_fsck_ignore)) {
+ if (test_bit(BCH_FS_in_fsck, &c->flags) &&
+ (!bch2_err_matches(ret, BCH_ERR_fsck_fix) &&
+ !bch2_err_matches(ret, BCH_ERR_fsck_ignore))) {
exiting = true;
print = true;
}
@@ -559,31 +615,31 @@ print:
if (bch2_fs_stdio_redirect(c))
bch2_print(c, "%s", out->buf);
else
- bch2_print_string_as_lines(KERN_ERR, out->buf);
+ bch2_print_str(c, KERN_ERR, out->buf);
}
if (s)
s->ret = ret;
-
+err_unlock:
+ mutex_unlock(&c->fsck_error_msgs_lock);
+err:
/*
* We don't yet track whether the filesystem currently has errors, for
* log_fsck_err()s: that would require us to track for every error type
* which recovery pass corrects it, to get the fsck exit status correct:
*/
- if (flags & FSCK_CAN_FIX) {
- if (ret == -BCH_ERR_fsck_fix) {
- set_bit(BCH_FS_errors_fixed, &c->flags);
- } else {
- set_bit(BCH_FS_errors_not_fixed, &c->flags);
- set_bit(BCH_FS_error, &c->flags);
- }
+ if (bch2_err_matches(ret, BCH_ERR_fsck_fix)) {
+ set_bit(BCH_FS_errors_fixed, &c->flags);
+ } else {
+ set_bit(BCH_FS_errors_not_fixed, &c->flags);
+ set_bit(BCH_FS_error, &c->flags);
}
-err_unlock:
- mutex_unlock(&c->fsck_error_msgs_lock);
-err:
+
if (action != action_orig)
kfree(action);
printbuf_exit(&buf);
+
+ BUG_ON(!ret);
return ret;
}
@@ -601,12 +657,12 @@ int __bch2_bkey_fsck_err(struct bch_fs *c,
const char *fmt, ...)
{
if (from.flags & BCH_VALIDATE_silent)
- return -BCH_ERR_fsck_delete_bkey;
+ return bch_err_throw(c, fsck_delete_bkey);
unsigned fsck_flags = 0;
if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) {
if (test_bit(err, c->sb.errors_silent))
- return -BCH_ERR_fsck_delete_bkey;
+ return bch_err_throw(c, fsck_delete_bkey);
fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX;
}
@@ -693,25 +749,9 @@ void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out,
int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
struct bpos pos)
{
- struct bch_fs *c = trans->c;
- int ret = 0;
-
- if (!bch2_snapshot_is_leaf(c, pos.snapshot))
- prt_str(out, "(multiple snapshots) ");
-
- subvol_inum inum = {
- .subvol = bch2_snapshot_tree_oldest_subvol(c, pos.snapshot),
- .inum = pos.inode,
- };
-
- if (inum.subvol) {
- ret = bch2_inum_to_path(trans, inum, out);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- return ret;
- }
-
- if (!inum.subvol || ret)
- prt_printf(out, "inum %llu:%u", pos.inode, pos.snapshot);
+ int ret = bch2_inum_snapshot_to_path(trans, pos.inode, pos.snapshot, NULL, out);
+ if (ret)
+ return ret;
prt_printf(out, " offset %llu: ", pos.offset << 8);
return 0;
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 4a364fd44abe..0c3c3a24fc6f 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -18,7 +18,12 @@ struct work_struct;
/* Error messages: */
-void bch2_log_msg_start(struct bch_fs *, struct printbuf *);
+void __bch2_log_msg_start(const char *, struct printbuf *);
+
+static inline void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out)
+{
+ __bch2_log_msg_start(c->name, out);
+}
/*
* Inconsistency errors: The on disk data is inconsistent. If these occur during
@@ -76,12 +81,14 @@ struct fsck_err_state {
#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err)
-void __bch2_count_fsck_err(struct bch_fs *,
- enum bch_sb_error_id, const char *,
- bool *, bool *, bool *);
+bool __bch2_count_fsck_err(struct bch_fs *, enum bch_sb_error_id, struct printbuf *);
#define bch2_count_fsck_err(_c, _err, ...) \
__bch2_count_fsck_err(_c, BCH_FSCK_ERR_##_err, __VA_ARGS__)
+int bch2_fsck_err_opt(struct bch_fs *,
+ enum bch_fsck_flags,
+ enum bch_sb_error_id);
+
__printf(5, 6) __cold
int __bch2_fsck_err(struct bch_fs *, struct btree_trans *,
enum bch_fsck_flags,
@@ -98,13 +105,13 @@ void bch2_free_fsck_errs(struct bch_fs *);
#define fsck_err_wrap(_do) \
({ \
int _ret = _do; \
- if (_ret != -BCH_ERR_fsck_fix && \
- _ret != -BCH_ERR_fsck_ignore) { \
+ if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \
+ !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) { \
ret = _ret; \
goto fsck_err; \
} \
\
- _ret == -BCH_ERR_fsck_fix; \
+ bch2_err_matches(_ret, BCH_ERR_fsck_fix); \
})
#define __fsck_err(...) fsck_err_wrap(bch2_fsck_err(__VA_ARGS__))
@@ -163,10 +170,10 @@ do { \
int _ret = __bch2_bkey_fsck_err(c, k, from, \
BCH_FSCK_ERR_##_err_type, \
_err_msg, ##__VA_ARGS__); \
- if (_ret != -BCH_ERR_fsck_fix && \
- _ret != -BCH_ERR_fsck_ignore) \
+ if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \
+ !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) \
ret = _ret; \
- ret = -BCH_ERR_fsck_delete_bkey; \
+ ret = bch_err_throw(c, fsck_delete_bkey); \
goto fsck_err; \
} while (0)
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index 6bb42985306e..b899ee75f5b9 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -37,16 +37,17 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
return lru + ret * 2;
}
+#define EXTENT_ITERS_MAX 64
+
static int count_iters_for_insert(struct btree_trans *trans,
struct bkey_s_c k,
unsigned offset,
struct bpos *end,
- unsigned *nr_iters,
- unsigned max_iters)
+ unsigned *nr_iters)
{
int ret = 0, ret2 = 0;
- if (*nr_iters >= max_iters) {
+ if (*nr_iters >= EXTENT_ITERS_MAX) {
*end = bpos_min(*end, k.k->p);
ret = 1;
}
@@ -56,7 +57,7 @@ static int count_iters_for_insert(struct btree_trans *trans,
case KEY_TYPE_reflink_v:
*nr_iters += bch2_bkey_nr_alloc_ptrs(k);
- if (*nr_iters >= max_iters) {
+ if (*nr_iters >= EXTENT_ITERS_MAX) {
*end = bpos_min(*end, k.k->p);
ret = 1;
}
@@ -81,7 +82,7 @@ static int count_iters_for_insert(struct btree_trans *trans,
*nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k);
- if (*nr_iters >= max_iters) {
+ if (*nr_iters >= EXTENT_ITERS_MAX) {
struct bpos pos = bkey_start_pos(k.k);
pos.offset += min_t(u64, k.k->size,
r_k.k->p.offset - idx);
@@ -100,59 +101,31 @@ static int count_iters_for_insert(struct btree_trans *trans,
return ret2 ?: ret;
}
-#define EXTENT_ITERS_MAX (BTREE_ITER_INITIAL / 3)
-
int bch2_extent_atomic_end(struct btree_trans *trans,
struct btree_iter *iter,
- struct bkey_i *insert,
struct bpos *end)
{
- struct btree_iter copy;
- struct bkey_s_c k;
unsigned nr_iters = 0;
- int ret;
-
- ret = bch2_btree_iter_traverse(trans, iter);
- if (ret)
- return ret;
-
- *end = insert->k.p;
-
- /* extent_update_to_keys(): */
- nr_iters += 1;
-
- ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end,
- &nr_iters, EXTENT_ITERS_MAX / 2);
- if (ret < 0)
- return ret;
+ struct btree_iter copy;
bch2_trans_copy_iter(trans, &copy, iter);
- for_each_btree_key_max_continue_norestart(trans, copy, insert->k.p, 0, k, ret) {
- unsigned offset = 0;
+ int ret = bch2_btree_iter_traverse(trans, &copy);
+ if (ret)
+ goto err;
- if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k)))
- offset = bkey_start_offset(&insert->k) -
- bkey_start_offset(k.k);
+ struct bkey_s_c k;
+ for_each_btree_key_max_continue_norestart(trans, copy, *end, 0, k, ret) {
+ unsigned offset = 0;
- /* extent_handle_overwrites(): */
- switch (bch2_extent_overlap(&insert->k, k.k)) {
- case BCH_EXTENT_OVERLAP_ALL:
- case BCH_EXTENT_OVERLAP_FRONT:
- nr_iters += 1;
- break;
- case BCH_EXTENT_OVERLAP_BACK:
- case BCH_EXTENT_OVERLAP_MIDDLE:
- nr_iters += 2;
- break;
- }
+ if (bkey_gt(iter->pos, bkey_start_pos(k.k)))
+ offset = iter->pos.offset - bkey_start_offset(k.k);
- ret = count_iters_for_insert(trans, k, offset, end,
- &nr_iters, EXTENT_ITERS_MAX);
+ ret = count_iters_for_insert(trans, k, offset, end, &nr_iters);
if (ret)
break;
}
-
+err:
bch2_trans_iter_exit(trans, &copy);
return ret < 0 ? ret : 0;
}
@@ -161,10 +134,8 @@ int bch2_extent_trim_atomic(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *k)
{
- struct bpos end;
- int ret;
-
- ret = bch2_extent_atomic_end(trans, iter, k, &end);
+ struct bpos end = k->k.p;
+ int ret = bch2_extent_atomic_end(trans, iter, &end);
if (ret)
return ret;
diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h
index 6f5cf449361a..34467db53f45 100644
--- a/fs/bcachefs/extent_update.h
+++ b/fs/bcachefs/extent_update.h
@@ -5,7 +5,7 @@
#include "bcachefs.h"
int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, struct bpos *);
+ struct bpos *);
int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *,
struct bkey_i *);
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index e597fb9c9823..036e4ad95987 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -45,6 +45,49 @@ static void bch2_extent_crc_pack(union bch_extent_crc *,
struct bch_extent_crc_unpacked,
enum bch_extent_entry_type);
+void bch2_io_failures_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct bch_io_failures *failed)
+{
+ static const char * const error_types[] = {
+ "io", "checksum", "ec reconstruct", NULL
+ };
+
+ for (struct bch_dev_io_failures *f = failed->devs;
+ f < failed->devs + failed->nr;
+ f++) {
+ unsigned errflags =
+ ((!!f->failed_io) << 0) |
+ ((!!f->failed_csum_nr) << 1) |
+ ((!!f->failed_ec) << 2);
+
+ if (!errflags)
+ continue;
+
+ bch2_printbuf_make_room(out, 1024);
+ out->atomic++;
+ scoped_guard(rcu) {
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev);
+ if (ca)
+ prt_str(out, ca->name);
+ else
+ prt_printf(out, "(invalid device %u)", f->dev);
+ }
+ --out->atomic;
+
+ prt_char(out, ' ');
+
+ if (is_power_of_2(errflags)) {
+ prt_bitflags(out, error_types, errflags);
+ prt_str(out, " error");
+ } else {
+ prt_str(out, "errors: ");
+ prt_bitflags(out, error_types, errflags);
+ }
+ prt_newline(out);
+ }
+}
+
struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
unsigned dev)
{
@@ -79,6 +122,22 @@ void bch2_mark_io_failure(struct bch_io_failures *failed,
f->failed_csum_nr++;
}
+void bch2_mark_btree_validate_failure(struct bch_io_failures *failed,
+ unsigned dev)
+{
+ struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, dev);
+
+ if (!f) {
+ BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
+
+ f = &failed->devs[failed->nr++];
+ memset(f, 0, sizeof(*f));
+ f->dev = dev;
+ }
+
+ f->failed_btree_validate = true;
+}
+
static inline u64 dev_latency(struct bch_dev *ca)
{
return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
@@ -105,7 +164,7 @@ static inline bool ptr_better(struct bch_fs *c,
if (unlikely(failed_delta))
return failed_delta < 0;
- if (unlikely(bch2_force_reconstruct_read))
+ if (static_branch_unlikely(&bch2_force_reconstruct_read))
return p1.do_ec_reconstruct > p2.do_ec_reconstruct;
if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct))
@@ -134,14 +193,10 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
bool have_dirty_ptrs = false, have_pick = false;
if (k.k->type == KEY_TYPE_error)
- return -BCH_ERR_key_type_error;
-
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-
- if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
- return -BCH_ERR_extent_poisoned;
+ return bch_err_throw(c, key_type_error);
rcu_read_lock();
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
u64 pick_latency;
@@ -162,7 +217,15 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
if (dev >= 0 && p.ptr.dev != dev)
continue;
- struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
+
+ if (unlikely(!ca && p.ptr.dev != BCH_SB_MEMBER_INVALID)) {
+ rcu_read_unlock();
+ int ret = bch2_dev_missing_bkey(c, k, p.ptr.dev);
+ if (ret)
+ return ret;
+ rcu_read_lock();
+ }
if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
continue;
@@ -175,6 +238,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) {
have_io_errors |= f->failed_io;
+ have_io_errors |= f->failed_btree_validate;
have_io_errors |= f->failed_ec;
}
have_csum_errors |= !!f->failed_csum_nr;
@@ -182,6 +246,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
if (p.has_ec && (f->failed_io || f->failed_csum_nr))
p.do_ec_reconstruct = true;
else if (f->failed_io ||
+ f->failed_btree_validate ||
f->failed_csum_nr > c->opts.checksum_err_retry_nr)
continue;
}
@@ -194,7 +259,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
p.do_ec_reconstruct = true;
}
- if (bch2_force_reconstruct_read && p.has_ec)
+ if (static_branch_unlikely(&bch2_force_reconstruct_read) && p.has_ec)
p.do_ec_reconstruct = true;
u64 p_latency = dev_latency(ca);
@@ -221,17 +286,17 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
if (!have_dirty_ptrs)
return 0;
if (have_missing_devs)
- return -BCH_ERR_no_device_to_read_from;
+ return bch_err_throw(c, no_device_to_read_from);
if (have_csum_errors)
- return -BCH_ERR_data_read_csum_err;
+ return bch_err_throw(c, data_read_csum_err);
if (have_io_errors)
- return -BCH_ERR_data_read_io_err;
+ return bch_err_throw(c, data_read_io_err);
/*
* If we get here, we have pointers (bkey_ptrs_validate() ensures that),
* but they don't point to valid devices:
*/
- return -BCH_ERR_no_devices_valid;
+ return bch_err_throw(c, no_devices_valid);
}
/* KEY_TYPE_btree_ptr: */
@@ -342,6 +407,8 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
lp.crc = bch2_extent_crc_unpack(l.k, NULL);
rp.crc = bch2_extent_crc_unpack(r.k, NULL);
+ guard(rcu)();
+
while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) &&
__bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) {
if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size !=
@@ -353,10 +420,8 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
return false;
/* Extents may not straddle buckets: */
- rcu_read_lock();
struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev);
bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr);
- rcu_read_unlock();
if (!same_bucket)
return false;
@@ -773,11 +838,9 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
struct extent_ptr_decoded p;
unsigned durability = 0;
- rcu_read_lock();
+ guard(rcu)();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
durability += bch2_extent_ptr_durability(c, &p);
- rcu_read_unlock();
-
return durability;
}
@@ -788,12 +851,10 @@ static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
struct extent_ptr_decoded p;
unsigned durability = 0;
- rcu_read_lock();
+ guard(rcu)();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
durability += bch2_extent_ptr_durability(c, &p);
- rcu_read_unlock();
-
return durability;
}
@@ -950,20 +1011,16 @@ bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct bch_dev *ca;
- bool ret = false;
- rcu_read_lock();
+ guard(rcu)();
bkey_for_each_ptr(ptrs, ptr)
if (bch2_dev_in_target(c, ptr->dev, target) &&
(ca = bch2_dev_rcu(c, ptr->dev)) &&
(!ptr->cached ||
- !dev_ptr_stale_rcu(ca, ptr))) {
- ret = true;
- break;
- }
- rcu_read_unlock();
+ !dev_ptr_stale_rcu(ca, ptr)))
+ return true;
- return ret;
+ return false;
}
bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
@@ -1071,33 +1128,48 @@ void bch2_extent_ptr_set_cached(struct bch_fs *c,
struct bkey_s k,
struct bch_extent_ptr *ptr)
{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+ struct bkey_ptrs ptrs;
union bch_extent_entry *entry;
struct extent_ptr_decoded p;
+ bool have_cached_ptr;
+ unsigned drop_dev = ptr->dev;
- rcu_read_lock();
- if (!want_cached_ptr(c, opts, ptr)) {
- bch2_bkey_drop_ptr_noerror(k, ptr);
- goto out;
- }
+ guard(rcu)();
+restart_drop_ptrs:
+ ptrs = bch2_bkey_ptrs(k);
+ have_cached_ptr = false;
- /*
- * Stripes can't contain cached data, for - reasons.
- *
- * Possibly something we can fix in the future?
- */
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (&entry->ptr == ptr) {
- if (p.has_ec)
- bch2_bkey_drop_ptr_noerror(k, ptr);
- else
- ptr->cached = true;
- goto out;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ /*
+ * Check if it's erasure coded - stripes can't contain cached
+ * data. Possibly something we can fix in the future?
+ */
+ if (&entry->ptr == ptr && p.has_ec)
+ goto drop;
+
+ if (p.ptr.cached) {
+ if (have_cached_ptr || !want_cached_ptr(c, opts, &p.ptr)) {
+ bch2_bkey_drop_ptr_noerror(k, &entry->ptr);
+ ptr = NULL;
+ goto restart_drop_ptrs;
+ }
+
+ have_cached_ptr = true;
}
+ }
- BUG();
-out:
- rcu_read_unlock();
+ if (!ptr)
+ bkey_for_each_ptr(ptrs, ptr2)
+ if (ptr2->dev == drop_dev)
+ ptr = ptr2;
+
+ if (have_cached_ptr || !want_cached_ptr(c, opts, ptr))
+ goto drop;
+
+ ptr->cached = true;
+ return;
+drop:
+ bch2_bkey_drop_ptr_noerror(k, ptr);
}
/*
@@ -1112,12 +1184,11 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
{
struct bch_dev *ca;
- rcu_read_lock();
+ guard(rcu)();
bch2_bkey_drop_ptrs(k, ptr,
ptr->cached &&
(!(ca = bch2_dev_rcu(c, ptr->dev)) ||
dev_ptr_stale_rcu(ca, ptr) > 0));
- rcu_read_unlock();
return bkey_deleted(k.k);
}
@@ -1135,7 +1206,7 @@ bool bch2_extent_normalize_by_opts(struct bch_fs *c,
struct bkey_ptrs ptrs;
bool have_cached_ptr;
- rcu_read_lock();
+ guard(rcu)();
restart_drop_ptrs:
ptrs = bch2_bkey_ptrs(k);
have_cached_ptr = false;
@@ -1148,7 +1219,6 @@ restart_drop_ptrs:
}
have_cached_ptr = true;
}
- rcu_read_unlock();
return bkey_deleted(k.k);
}
@@ -1156,7 +1226,7 @@ restart_drop_ptrs:
void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
{
out->atomic++;
- rcu_read_lock();
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
if (!ca) {
prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
@@ -1180,7 +1250,6 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc
else if (stale)
prt_printf(out, " invalid");
}
- rcu_read_unlock();
--out->atomic;
}
@@ -1446,7 +1515,7 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
prt_printf(err, "invalid compression opt %u:%u",
opt.type, opt.level);
- return -BCH_ERR_invalid_bkey;
+ return bch_err_throw(c, invalid_bkey);
}
#endif
break;
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 9fe153183b36..b8590e51b76e 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -392,10 +392,13 @@ out: \
/* utility code common to all keys with pointers: */
+void bch2_io_failures_to_text(struct printbuf *, struct bch_fs *,
+ struct bch_io_failures *);
struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *,
unsigned);
void bch2_mark_io_failure(struct bch_io_failures *,
struct extent_ptr_decoded *, bool);
+void bch2_mark_btree_validate_failure(struct bch_io_failures *, unsigned);
int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
struct bch_io_failures *,
struct extent_ptr_decoded *, int);
diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h
index e51529dca4c2..b23ce4a373c0 100644
--- a/fs/bcachefs/extents_types.h
+++ b/fs/bcachefs/extents_types.h
@@ -34,6 +34,7 @@ struct bch_io_failures {
u8 dev;
unsigned failed_csum_nr:6,
failed_io:1,
+ failed_btree_validate:1,
failed_ec:1;
} devs[BCH_REPLICAS_MAX + 1];
};
diff --git a/fs/bcachefs/fast_list.c b/fs/bcachefs/fast_list.c
new file mode 100644
index 000000000000..2faec143eb31
--- /dev/null
+++ b/fs/bcachefs/fast_list.c
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Fast, unordered lists
+ *
+ * Supports add, remove, and iterate
+ *
+ * Underneath, they're a radix tree and an IDA, with a percpu buffer for slot
+ * allocation and freeing.
+ *
+ * This means that adding, removing, and iterating over items is lockless,
+ * except when refilling/emptying the percpu slot buffers.
+ */
+
+#include "fast_list.h"
+
+struct fast_list_pcpu {
+ u32 nr;
+ u32 entries[31];
+};
+
+static int fast_list_alloc_idx(struct fast_list *l, gfp_t gfp)
+{
+ int idx = ida_alloc_range(&l->slots_allocated, 1, INT_MAX, gfp);
+ if (unlikely(idx < 0))
+ return 0;
+
+ if (unlikely(!genradix_ptr_alloc_inlined(&l->items, idx, gfp))) {
+ ida_free(&l->slots_allocated, idx);
+ return 0;
+ }
+
+ return idx;
+}
+
+/**
+ * fast_list_get_idx - get a slot in a fast_list
+ * @l: list to get slot in
+ *
+ * This allocates a slot in the radix tree without storing to it, so that we can
+ * take the potential memory allocation failure early and do the list add later
+ * when we can't take an allocation failure.
+ *
+ * Returns: positive integer on success, -ENOMEM on failure
+ */
+int fast_list_get_idx(struct fast_list *l)
+{
+ unsigned long flags;
+ int idx;
+retry:
+ local_irq_save(flags);
+ struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer);
+
+ if (unlikely(!lp->nr)) {
+ u32 entries[16], nr = 0;
+
+ local_irq_restore(flags);
+ while (nr < ARRAY_SIZE(entries) &&
+ (idx = fast_list_alloc_idx(l, GFP_KERNEL)))
+ entries[nr++] = idx;
+ local_irq_save(flags);
+
+ lp = this_cpu_ptr(l->buffer);
+
+ while (nr && lp->nr < ARRAY_SIZE(lp->entries))
+ lp->entries[lp->nr++] = entries[--nr];
+
+ if (unlikely(nr)) {
+ local_irq_restore(flags);
+ while (nr)
+ ida_free(&l->slots_allocated, entries[--nr]);
+ goto retry;
+ }
+
+ if (unlikely(!lp->nr)) {
+ local_irq_restore(flags);
+ return -ENOMEM;
+ }
+ }
+
+ idx = lp->entries[--lp->nr];
+ local_irq_restore(flags);
+
+ return idx;
+}
+
+/**
+ * fast_list_add - add an item to a fast_list
+ * @l: list
+ * @item: item to add
+ *
+ * Allocates a slot in the radix tree and stores to it and then returns the
+ * slot index, which must be passed to fast_list_remove().
+ *
+ * Returns: positive integer on success, -ENOMEM on failure
+ */
+int fast_list_add(struct fast_list *l, void *item)
+{
+ int idx = fast_list_get_idx(l);
+ if (idx < 0)
+ return idx;
+
+ *genradix_ptr_inlined(&l->items, idx) = item;
+ return idx;
+}
+
+/**
+ * fast_list_remove - remove an item from a fast_list
+ * @l: list
+ * @idx: item's slot index
+ *
+ * Zeroes out the slot in the radix tree and frees the slot for future
+ * fast_list_add() operations.
+ */
+void fast_list_remove(struct fast_list *l, unsigned idx)
+{
+ u32 entries[16], nr = 0;
+ unsigned long flags;
+
+ if (!idx)
+ return;
+
+ *genradix_ptr_inlined(&l->items, idx) = NULL;
+
+ local_irq_save(flags);
+ struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer);
+
+ if (unlikely(lp->nr == ARRAY_SIZE(lp->entries)))
+ while (nr < ARRAY_SIZE(entries))
+ entries[nr++] = lp->entries[--lp->nr];
+
+ lp->entries[lp->nr++] = idx;
+ local_irq_restore(flags);
+
+ if (unlikely(nr))
+ while (nr)
+ ida_free(&l->slots_allocated, entries[--nr]);
+}
+
+void fast_list_exit(struct fast_list *l)
+{
+ /* XXX: warn if list isn't empty */
+ free_percpu(l->buffer);
+ ida_destroy(&l->slots_allocated);
+ genradix_free(&l->items);
+}
+
+int fast_list_init(struct fast_list *l)
+{
+ genradix_init(&l->items);
+ ida_init(&l->slots_allocated);
+ l->buffer = alloc_percpu(*l->buffer);
+ if (!l->buffer)
+ return -ENOMEM;
+ return 0;
+}
diff --git a/fs/bcachefs/fast_list.h b/fs/bcachefs/fast_list.h
new file mode 100644
index 000000000000..73c9bf591fd6
--- /dev/null
+++ b/fs/bcachefs/fast_list.h
@@ -0,0 +1,41 @@
+#ifndef _LINUX_FAST_LIST_H
+#define _LINUX_FAST_LIST_H
+
+#include <linux/generic-radix-tree.h>
+#include <linux/idr.h>
+#include <linux/percpu.h>
+
+struct fast_list_pcpu;
+
+struct fast_list {
+ GENRADIX(void *) items;
+ struct ida slots_allocated;;
+ struct fast_list_pcpu __percpu
+ *buffer;
+};
+
+static inline void *fast_list_iter_peek(struct genradix_iter *iter,
+ struct fast_list *list)
+{
+ void **p;
+ while ((p = genradix_iter_peek(iter, &list->items)) && !*p)
+ genradix_iter_advance(iter, &list->items);
+
+ return p ? *p : NULL;
+}
+
+#define fast_list_for_each_from(_list, _iter, _i, _start) \
+ for (_iter = genradix_iter_init(&(_list)->items, _start); \
+ (_i = fast_list_iter_peek(&(_iter), _list)) != NULL; \
+ genradix_iter_advance(&(_iter), &(_list)->items))
+
+#define fast_list_for_each(_list, _iter, _i) \
+ fast_list_for_each_from(_list, _iter, _i, 0)
+
+int fast_list_get_idx(struct fast_list *l);
+int fast_list_add(struct fast_list *l, void *item);
+void fast_list_remove(struct fast_list *l, unsigned idx);
+void fast_list_exit(struct fast_list *l);
+int fast_list_init(struct fast_list *l);
+
+#endif /* _LINUX_FAST_LIST_H */
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index e3a75dcca60c..66bacdd49f78 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -394,17 +394,9 @@ struct bch_writepage_state {
struct bch_io_opts opts;
struct bch_folio_sector *tmp;
unsigned tmp_sectors;
+ struct blk_plug plug;
};
-static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
- struct bch_inode_info *inode)
-{
- struct bch_writepage_state ret = { 0 };
-
- bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode);
- return ret;
-}
-
/*
* Determine when a writepage io is full. We have to limit writepage bios to a
* single page per bvec (i.e. 1MB with 4k pages) because that is the limit to
@@ -666,17 +658,17 @@ do_io:
int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct bch_fs *c = mapping->host->i_sb->s_fs_info;
- struct bch_writepage_state w =
- bch_writepage_state_init(c, to_bch_ei(mapping->host));
- struct blk_plug plug;
- int ret;
+ struct bch_writepage_state *w = kzalloc(sizeof(*w), GFP_NOFS|__GFP_NOFAIL);
- blk_start_plug(&plug);
- ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
- if (w.io)
- bch2_writepage_do_io(&w);
- blk_finish_plug(&plug);
- kfree(w.tmp);
+ bch2_inode_opts_get(&w->opts, c, &to_bch_ei(mapping->host)->ei_inode);
+
+ blk_start_plug(&w->plug);
+ int ret = write_cache_pages(mapping, wbc, __bch2_writepage, w);
+ if (w->io)
+ bch2_writepage_do_io(w);
+ blk_finish_plug(&w->plug);
+ kfree(w->tmp);
+ kfree(w);
return bch2_err_class(ret);
}
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index 535bc5fcbcc0..1f5154d9676b 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -3,6 +3,7 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
+#include "enumerated_ref.h"
#include "fs.h"
#include "fs-io.h"
#include "fs-io-direct.h"
@@ -401,7 +402,7 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio)
ret = dio->op.error ?: ((long) dio->written << 9);
bio_put(&dio->op.wbio.bio);
- bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write);
/* inode->i_dio_count is our ref on inode and thus bch_fs */
inode_dio_end(&inode->v);
@@ -606,7 +607,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
prefetch(&inode->ei_inode);
prefetch((void *) &inode->ei_inode + 64);
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write))
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_dio_write))
return -EROFS;
inode_lock(&inode->v);
@@ -675,7 +676,7 @@ err_put_bio:
bio_put(bio);
inode_dio_end(&inode->v);
err_put_write_ref:
- bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write);
goto out;
}
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
index fbae9c1de746..c2cc405822f2 100644
--- a/fs/bcachefs/fs-io-pagecache.c
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -447,7 +447,7 @@ static int __bch2_folio_reservation_get(struct bch_fs *c,
if (!reserved) {
bch2_disk_reservation_put(c, &disk_res);
- return -BCH_ERR_ENOSPC_disk_reservation;
+ return bch_err_throw(c, ENOSPC_disk_reservation);
}
break;
}
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 9657144666b8..a233f45875e9 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -7,6 +7,7 @@
#include "btree_update.h"
#include "buckets.h"
#include "clock.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "extents.h"
#include "extent_update.h"
@@ -48,7 +49,8 @@ static void nocow_flush_endio(struct bio *_bio)
struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);
closure_put(bio->cl);
- percpu_ref_put(&bio->ca->io_ref[WRITE]);
+ enumerated_ref_put(&bio->ca->io_ref[WRITE],
+ BCH_DEV_WRITE_REF_nocow_flush);
bio_put(&bio->bio);
}
@@ -69,11 +71,12 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
- rcu_read_lock();
- ca = rcu_dereference(c->devs[dev]);
- if (ca && !percpu_ref_tryget(&ca->io_ref[WRITE]))
- ca = NULL;
- rcu_read_unlock();
+ scoped_guard(rcu) {
+ ca = rcu_dereference(c->devs[dev]);
+ if (ca && !enumerated_ref_tryget(&ca->io_ref[WRITE],
+ BCH_DEV_WRITE_REF_nocow_flush))
+ ca = NULL;
+ }
if (!ca)
continue;
@@ -151,10 +154,9 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
inode->ei_inode.bi_sectors);
- bool repeat = false, print = false, suppress = false;
- bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, buf.buf, &repeat, &print, &suppress);
+ bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, &buf);
if (print)
- bch2_print_str(c, buf.buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
if (sectors < 0)
@@ -220,7 +222,7 @@ static int bch2_flush_inode(struct bch_fs *c,
if (c->opts.journal_flush_disabled)
return 0;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync))
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fsync))
return -EROFS;
u64 seq;
@@ -228,7 +230,7 @@ static int bch2_flush_inode(struct bch_fs *c,
bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?:
bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?:
bch2_inode_flush_nocow_writes(c, inode);
- bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_fsync);
return ret;
}
@@ -526,11 +528,9 @@ int bchfs_truncate(struct mnt_idmap *idmap,
inode->v.i_ino, (u64) inode->v.i_blocks,
inode->ei_inode.bi_sectors);
- bool repeat = false, print = false, suppress = false;
- bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, buf.buf,
- &repeat, &print, &suppress);
+ bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, &buf);
if (print)
- bch2_print_str(c, buf.buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
}
@@ -821,7 +821,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
struct bch_fs *c = inode->v.i_sb->s_fs_info;
long ret;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate))
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fallocate))
return -EROFS;
inode_lock(&inode->v);
@@ -845,7 +845,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
err:
bch2_pagecache_block_put(inode);
inode_unlock(&inode->v);
- bch2_write_ref_put(c, BCH_WRITE_REF_fallocate);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_fallocate);
return bch2_err_class(ret);
}
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index a82dfce9e4ad..4e72e654da96 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -172,7 +172,10 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
if (get_user(flags, arg))
return -EFAULT;
- bch_notice(c, "shutdown by ioctl type %u", flags);
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+
+ prt_printf(&buf, "shutdown by ioctl type %u", flags);
switch (flags) {
case FSOP_GOING_FLAGS_DEFAULT:
@@ -180,20 +183,23 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
if (ret)
break;
bch2_journal_flush(&c->journal);
- bch2_fs_emergency_read_only(c);
+ bch2_fs_emergency_read_only2(c, &buf);
bdev_thaw(c->vfs_sb->s_bdev);
break;
case FSOP_GOING_FLAGS_LOGFLUSH:
bch2_journal_flush(&c->journal);
fallthrough;
case FSOP_GOING_FLAGS_NOLOGFLUSH:
- bch2_fs_emergency_read_only(c);
+ bch2_fs_emergency_read_only2(c, &buf);
break;
default:
ret = -EINVAL;
- break;
+ goto noprint;
}
+ bch2_print_str(c, KERN_ERR, buf.buf);
+noprint:
+ printbuf_exit(&buf);
return ret;
}
@@ -262,13 +268,13 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
}
if (dst_dentry->d_inode) {
- error = -BCH_ERR_EEXIST_subvolume_create;
+ error = bch_err_throw(c, EEXIST_subvolume_create);
goto err3;
}
dir = dst_path.dentry->d_inode;
if (IS_DEADDIR(dir)) {
- error = -BCH_ERR_ENOENT_directory_dead;
+ error = bch_err_throw(c, ENOENT_directory_dead);
goto err3;
}
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 47f1a64c5c8d..85d13f800165 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -124,8 +124,9 @@ retry:
goto err;
struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u);
+ bool rebalance_changed = memcmp(&old_r, &new_r, sizeof(new_r));
- if (memcmp(&old_r, &new_r, sizeof(new_r))) {
+ if (rebalance_changed) {
ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum);
if (ret)
goto err;
@@ -146,6 +147,9 @@ err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
+ if (rebalance_changed)
+ bch2_rebalance_wakeup(c);
+
bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
"%s: inode %llu:%llu not found when updating",
bch2_err_str(ret),
@@ -191,11 +195,6 @@ int bch2_fs_quota_transfer(struct bch_fs *c,
return ret;
}
-static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
-{
- return a.subvol == b.subvol && a.inum == b.inum;
-}
-
static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed)
{
const subvol_inum *inum = data;
@@ -352,9 +351,8 @@ repeat:
if (!trans) {
__wait_on_freeing_inode(c, inode, inum);
} else {
- bch2_trans_unlock(trans);
- __wait_on_freeing_inode(c, inode, inum);
- int ret = bch2_trans_relock(trans);
+ int ret = drop_locks_do(trans,
+ (__wait_on_freeing_inode(c, inode, inum), 0));
if (ret)
return ERR_PTR(ret);
}
@@ -1575,11 +1573,12 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
if (!dir_emit_dots(file, ctx))
return 0;
- int ret = bch2_readdir(c, inode_inum(inode), ctx);
+ int ret = bch2_readdir(c, inode_inum(inode), &hash, ctx);
bch_err_fn(c, ret);
return bch2_err_class(ret);
@@ -2008,14 +2007,14 @@ retry:
goto err;
if (k.k->type != KEY_TYPE_dirent) {
- ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+ ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
goto err;
}
d = bkey_s_c_to_dirent(k);
ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
if (ret > 0)
- ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+ ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
if (ret)
goto err;
@@ -2181,7 +2180,13 @@ static void bch2_evict_inode(struct inode *vinode)
KEY_TYPE_QUOTA_WARN);
bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
KEY_TYPE_QUOTA_WARN);
- bch2_inode_rm(c, inode_inum(inode));
+ int ret = bch2_inode_rm(c, inode_inum(inode));
+ if (ret && !bch2_err_matches(ret, EROFS)) {
+ bch_err_msg(c, ret, "VFS incorrectly tried to delete inode %llu:%llu",
+ inode->ei_inum.subvol,
+ inode->ei_inum.inum);
+ bch2_sb_error_count(c, BCH_FSCK_ERR_vfs_bad_inode_rm);
+ }
/*
* If we are deleting, we need it present in the vfs hash table
@@ -2328,7 +2333,8 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
struct bch_fs *c = root->d_sb->s_fs_info;
bool first = true;
- for_each_online_member(c, ca) {
+ guard(rcu)();
+ for_each_online_member_rcu(c, ca) {
if (!first)
seq_putc(seq, ':');
first = false;
@@ -2440,7 +2446,7 @@ static int bch2_fs_get_tree(struct fs_context *fc)
struct inode *vinode;
struct bch2_opts_parse *opts_parse = fc->fs_private;
struct bch_opts opts = opts_parse->opts;
- darray_str devs;
+ darray_const_str devs;
darray_fs devs_to_fs = {};
int ret;
@@ -2464,7 +2470,7 @@ static int bch2_fs_get_tree(struct fs_context *fc)
if (!IS_ERR(sb))
goto got_sb;
- c = bch2_fs_open(devs.data, devs.nr, opts);
+ c = bch2_fs_open(&devs, &opts);
ret = PTR_ERR_OR_ZERO(c);
if (ret)
goto err;
@@ -2514,7 +2520,12 @@ got_sb:
sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec);
super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid));
- super_set_sysfs_name_uuid(sb);
+
+ if (c->sb.multi_device)
+ super_set_sysfs_name_uuid(sb);
+ else
+ strscpy(sb->s_sysfs_name, c->name, sizeof(sb->s_sysfs_name));
+
sb->s_shrink->seeks = 0;
c->vfs_sb = sb;
strscpy(sb->s_id, c->name, sizeof(sb->s_id));
@@ -2525,14 +2536,15 @@ got_sb:
sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
- for_each_online_member(c, ca) {
- struct block_device *bdev = ca->disk_sb.bdev;
+ scoped_guard(rcu) {
+ for_each_online_member_rcu(c, ca) {
+ struct block_device *bdev = ca->disk_sb.bdev;
- /* XXX: create an anonymous device for multi device filesystems */
- sb->s_bdev = bdev;
- sb->s_dev = bdev->bd_dev;
- percpu_ref_put(&ca->io_ref[READ]);
- break;
+ /* XXX: create an anonymous device for multi device filesystems */
+ sb->s_bdev = bdev;
+ sb->s_dev = bdev->bd_dev;
+ break;
+ }
}
c->dev = sb->s_dev;
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index aaf187085276..68ed69a255e1 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -23,14 +23,15 @@
#include <linux/bsearch.h>
#include <linux/dcache.h> /* struct qstr */
-static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
+static int dirent_points_to_inode_nowarn(struct bch_fs *c,
+ struct bkey_s_c_dirent d,
struct bch_inode_unpacked *inode)
{
if (d.v->d_type == DT_SUBVOL
? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
: le64_to_cpu(d.v->d_inum) == inode->bi_inum)
return 0;
- return -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+ return bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
}
static void dirent_inode_mismatch_msg(struct printbuf *out,
@@ -49,7 +50,7 @@ static int dirent_points_to_inode(struct bch_fs *c,
struct bkey_s_c_dirent dirent,
struct bch_inode_unpacked *inode)
{
- int ret = dirent_points_to_inode_nowarn(dirent, inode);
+ int ret = dirent_points_to_inode_nowarn(c, dirent, inode);
if (ret) {
struct printbuf buf = PRINTBUF;
dirent_inode_mismatch_msg(&buf, c, dirent, inode);
@@ -109,27 +110,6 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol,
return ret;
}
-static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot,
- struct bch_inode_unpacked *inode)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, inode_nr, snapshot), 0);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- ret = bkey_is_inode(k.k)
- ? bch2_inode_unpack(k, inode)
- : -BCH_ERR_ENOENT_inode;
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
static int lookup_dirent_in_snapshot(struct btree_trans *trans,
struct bch_hash_info hash_info,
subvol_inum dir, struct qstr *name,
@@ -173,7 +153,7 @@ static int find_snapshot_tree_subvol(struct btree_trans *trans,
goto found;
}
}
- ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol;
+ ret = bch_err_throw(trans->c, ENOENT_no_snapshot_tree_subvol);
found:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -231,7 +211,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
struct bch_inode_unpacked root_inode;
struct bch_hash_info root_hash_info;
- ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode);
+ ret = bch2_inode_find_by_inum_snapshot(trans, root_inum.inum, snapshot, &root_inode, 0);
bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
root_inum.inum, subvolid);
if (ret)
@@ -250,14 +230,14 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
if (d_type != DT_DIR) {
bch_err(c, "error looking up lost+found: not a directory");
- return -BCH_ERR_ENOENT_not_directory;
+ return bch_err_throw(c, ENOENT_not_directory);
}
/*
* The bch2_check_dirents pass has already run, dangling dirents
* shouldn't exist here:
*/
- ret = lookup_inode(trans, inum, snapshot, lostfound);
+ ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, lostfound, 0);
bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
return ret;
@@ -285,7 +265,7 @@ create_lostfound:
u64 cpu = raw_smp_processor_id();
bch2_inode_init_early(c, lostfound);
- bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
+ bch2_inode_init_late(c, lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
lostfound->bi_dir = root_inode.bi_inum;
lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot);
@@ -552,7 +532,7 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub
if (!bch2_snapshot_is_leaf(c, snapshotid)) {
bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
- return -BCH_ERR_fsck_repair_unimplemented;
+ return bch_err_throw(c, fsck_repair_unimplemented);
}
/*
@@ -566,7 +546,7 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub
u64 cpu = raw_smp_processor_id();
bch2_inode_init_early(c, &new_inode);
- bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
+ bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
new_inode.bi_subvol = subvolid;
@@ -656,7 +636,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32
struct bch_inode_unpacked new_inode;
bch2_inode_init_early(c, &new_inode);
- bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
+ bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
new_inode.bi_size = i_size;
new_inode.bi_inum = inum;
new_inode.bi_snapshot = snapshot;
@@ -664,11 +644,6 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32
return __bch2_fsck_write_inode(trans, &new_inode);
}
-struct snapshots_seen {
- struct bpos pos;
- snapshot_id_list ids;
-};
-
static inline void snapshots_seen_exit(struct snapshots_seen *s)
{
darray_exit(&s->ids);
@@ -787,12 +762,12 @@ static int ref_visible2(struct bch_fs *c,
#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \
for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \
- (_i)->snapshot <= (_snapshot); _i++) \
- if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
+ (_i)->inode.bi_snapshot <= (_snapshot); _i++) \
+ if (key_visible_in_snapshot(_c, _s, _i->inode.bi_snapshot, _snapshot))
struct inode_walker_entry {
struct bch_inode_unpacked inode;
- u32 snapshot;
+ bool whiteout;
u64 count;
u64 i_size;
};
@@ -821,13 +796,20 @@ static struct inode_walker inode_walker_init(void)
static int add_inode(struct bch_fs *c, struct inode_walker *w,
struct bkey_s_c inode)
{
- struct bch_inode_unpacked u;
-
- return bch2_inode_unpack(inode, &u) ?:
- darray_push(&w->inodes, ((struct inode_walker_entry) {
- .inode = u,
- .snapshot = inode.k->p.snapshot,
+ int ret = darray_push(&w->inodes, ((struct inode_walker_entry) {
+ .whiteout = !bkey_is_inode(inode.k),
}));
+ if (ret)
+ return ret;
+
+ struct inode_walker_entry *n = &darray_last(w->inodes);
+ if (!n->whiteout) {
+ return bch2_inode_unpack(inode, &n->inode);
+ } else {
+ n->inode.bi_inum = inode.k->p.inode;
+ n->inode.bi_snapshot = inode.k->p.snapshot;
+ return 0;
+ }
}
static int get_inodes_all_snapshots(struct btree_trans *trans,
@@ -847,13 +829,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
w->recalculate_sums = false;
w->inodes.nr = 0;
- for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inum)
+ for_each_btree_key_max_norestart(trans, iter,
+ BTREE_ID_inodes, POS(0, inum), SPOS(0, inum, U32_MAX),
+ BTREE_ITER_all_snapshots, k, ret) {
+ ret = add_inode(c, w, k);
+ if (ret)
break;
-
- if (bkey_is_inode(k.k))
- add_inode(c, w, k);
}
bch2_trans_iter_exit(trans, &iter);
@@ -865,65 +846,6 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
return 0;
}
-static struct inode_walker_entry *
-lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
-{
- bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
-
- struct inode_walker_entry *i;
- __darray_for_each(w->inodes, i)
- if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
- goto found;
-
- return NULL;
-found:
- BUG_ON(k.k->p.snapshot > i->snapshot);
-
- if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
- struct inode_walker_entry new = *i;
-
- new.snapshot = k.k->p.snapshot;
- new.count = 0;
- new.i_size = 0;
-
- struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, k);
-
- bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
- "unexpected because we should always update the inode when we update a key in that inode\n"
- "%s",
- w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
- printbuf_exit(&buf);
-
- while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
- --i;
-
- size_t pos = i - w->inodes.data;
- int ret = darray_insert_item(&w->inodes, pos, new);
- if (ret)
- return ERR_PTR(ret);
-
- i = w->inodes.data + pos;
- }
-
- return i;
-}
-
-static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
- struct inode_walker *w,
- struct bkey_s_c k)
-{
- if (w->last_pos.inode != k.k->p.inode) {
- int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
- if (ret)
- return ERR_PTR(ret);
- }
-
- w->last_pos = k.k->p;
-
- return lookup_inode_for_snapshot(trans->c, w, k);
-}
-
static int get_visible_inodes(struct btree_trans *trans,
struct inode_walker *w,
struct snapshots_seen *s,
@@ -959,6 +881,91 @@ static int get_visible_inodes(struct btree_trans *trans,
return ret;
}
+static struct inode_walker_entry *
+lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+
+ struct inode_walker_entry *i = darray_find_p(w->inodes, i,
+ bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot));
+
+ if (!i)
+ return NULL;
+
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot,
+ trans, snapshot_key_missing_inode_snapshot,
+ "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
+ "unexpected because we should always update the inode when we update a key in that inode\n"
+ "%s",
+ w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot,
+ (bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf))) {
+ struct bch_inode_unpacked new = i->inode;
+ struct bkey_i whiteout;
+
+ new.bi_snapshot = k.k->p.snapshot;
+
+ if (!i->whiteout) {
+ ret = __bch2_fsck_write_inode(trans, &new);
+ } else {
+ bkey_init(&whiteout.k);
+ whiteout.k.type = KEY_TYPE_whiteout;
+ whiteout.k.p = SPOS(0, i->inode.bi_inum, i->inode.bi_snapshot);
+ ret = bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
+ &whiteout,
+ BTREE_UPDATE_internal_snapshot_node);
+ }
+
+ if (ret)
+ goto fsck_err;
+
+ ret = bch2_trans_commit(trans, NULL, NULL, 0);
+ if (ret)
+ goto fsck_err;
+
+ struct inode_walker_entry new_entry = *i;
+
+ new_entry.inode.bi_snapshot = k.k->p.snapshot;
+ new_entry.count = 0;
+ new_entry.i_size = 0;
+
+ while (i > w->inodes.data && i[-1].inode.bi_snapshot > k.k->p.snapshot)
+ --i;
+
+ size_t pos = i - w->inodes.data;
+ ret = darray_insert_item(&w->inodes, pos, new_entry);
+ if (ret)
+ goto fsck_err;
+
+ ret = bch_err_throw(c, transaction_restart_nested);
+ goto fsck_err;
+ }
+
+ printbuf_exit(&buf);
+ return i;
+fsck_err:
+ printbuf_exit(&buf);
+ return ERR_PTR(ret);
+}
+
+static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
+ struct inode_walker *w,
+ struct bkey_s_c k)
+{
+ if (w->last_pos.inode != k.k->p.inode) {
+ int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ w->last_pos = k.k->p;
+
+ return lookup_inode_for_snapshot(trans, w, k);
+}
+
/*
* Prefer to delete the first one, since that will be the one at the wrong
* offset:
@@ -978,7 +985,8 @@ int bch2_fsck_update_backpointers(struct btree_trans *trans,
int ret = 0;
if (d->v.d_type == DT_SUBVOL) {
- BUG();
+ bch_err(trans->c, "%s does not support DT_SUBVOL", __func__);
+ ret = -BCH_ERR_fsck_repair_unimplemented;
} else {
ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
if (ret)
@@ -1034,7 +1042,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans,
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
- if ((ret || dirent_points_to_inode_nowarn(d, inode)) &&
+ if ((ret || dirent_points_to_inode_nowarn(c, d, inode)) &&
inode->bi_subvol &&
(inode->bi_flags & BCH_INODE_has_child_snapshot)) {
/* Older version of a renamed subvolume root: we won't have a
@@ -1055,7 +1063,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans,
trans, inode_points_to_missing_dirent,
"inode points to missing dirent\n%s",
(bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
- fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode),
+ fsck_err_on(!ret && dirent_points_to_inode_nowarn(c, d, inode),
trans, inode_points_to_wrong_dirent,
"%s",
(printbuf_reset(&buf),
@@ -1080,32 +1088,6 @@ fsck_err:
return ret;
}
-static int get_snapshot_root_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *root,
- u64 inum)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
- SPOS(0, inum, U32_MAX),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inum)
- break;
- if (bkey_is_inode(k.k))
- goto found_root;
- }
- if (ret)
- goto err;
- BUG();
-found_root:
- ret = bch2_inode_unpack(k, root);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
static int check_inode(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
@@ -1136,20 +1118,23 @@ static int check_inode(struct btree_trans *trans,
goto err;
if (snapshot_root->bi_inum != u.bi_inum) {
- ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum);
+ ret = bch2_inode_find_snapshot_root(trans, u.bi_inum, snapshot_root);
if (ret)
goto err;
}
- if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed ||
- INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root),
- trans, inode_snapshot_mismatch,
- "inode hash info in different snapshots don't match")) {
- u.bi_hash_seed = snapshot_root->bi_hash_seed;
- SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root));
- do_update = true;
+ if (u.bi_hash_seed != snapshot_root->bi_hash_seed ||
+ INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root)) {
+ ret = bch2_repair_inode_hash_info(trans, snapshot_root);
+ BUG_ON(ret == -BCH_ERR_fsck_repair_unimplemented);
+ if (ret)
+ goto err;
}
+ ret = bch2_check_inode_has_case_insensitive(trans, &u, &s->ids, &do_update);
+ if (ret)
+ goto err;
+
if (u.bi_dir || u.bi_dir_offset) {
ret = check_inode_dirent_inode(trans, &u, &do_update);
if (ret)
@@ -1183,6 +1168,14 @@ static int check_inode(struct btree_trans *trans,
ret = 0;
}
+ if (fsck_err_on(S_ISDIR(u.bi_mode) && u.bi_size,
+ trans, inode_dir_has_nonzero_i_size,
+ "directory %llu:%u with nonzero i_size %lli",
+ u.bi_inum, u.bi_snapshot, u.bi_size)) {
+ u.bi_size = 0;
+ do_update = true;
+ }
+
ret = bch2_inode_has_child_snapshots(trans, k.k->p);
if (ret < 0)
goto err;
@@ -1452,25 +1445,27 @@ static int check_key_has_inode(struct btree_trans *trans,
if (k.k->type == KEY_TYPE_whiteout)
goto out;
- if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
+ bool have_inode = i && !i->whiteout;
+
+ if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto err;
inode->last_pos.inode--;
- ret = -BCH_ERR_transaction_restart_nested;
+ ret = bch_err_throw(c, transaction_restart_nested);
goto err;
}
- if (fsck_err_on(!i,
+ if (fsck_err_on(!have_inode,
trans, key_in_missing_inode,
"key in missing inode:\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
goto delete;
- if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
+ if (fsck_err_on(have_inode && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
trans, key_in_wrong_inode_type,
"key for wrong inode mode %o:\n%s",
i->inode.bi_mode,
@@ -1498,21 +1493,21 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal
if (i->inode.bi_sectors == i->count)
continue;
- count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
+ count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->inode.bi_snapshot);
if (w->recalculate_sums)
i->count = count2;
if (i->count != count2) {
bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
- w->last_pos.inode, i->snapshot, i->count, count2);
+ w->last_pos.inode, i->inode.bi_snapshot, i->count, count2);
i->count = count2;
}
if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
trans, inode_i_sectors_wrong,
"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
- w->last_pos.inode, i->snapshot,
+ w->last_pos.inode, i->inode.bi_snapshot,
i->inode.bi_sectors, i->count)) {
i->inode.bi_sectors = i->count;
ret = bch2_fsck_write_inode(trans, &i->inode);
@@ -1576,7 +1571,7 @@ static int extent_ends_at(struct bch_fs *c,
sizeof(seen->ids.data[0]) * seen->ids.size,
GFP_KERNEL);
if (!n.seen.ids.data)
- return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
+ return bch_err_throw(c, ENOMEM_fsck_extent_ends_at);
__darray_for_each(extent_ends->e, i) {
if (i->snapshot == k.k->p.snapshot) {
@@ -1626,7 +1621,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
__func__, buf.buf);
- ret = -BCH_ERR_internal_fsck_err;
+ ret = bch_err_throw(c, internal_fsck_err);
goto err;
}
@@ -1651,7 +1646,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
pos2.size != k2.k->size) {
bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
__func__, buf.buf);
- ret = -BCH_ERR_internal_fsck_err;
+ ret = bch_err_throw(c, internal_fsck_err);
goto err;
}
@@ -1699,7 +1694,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
* We overwrote the second extent - restart
* check_extent() from the top:
*/
- ret = -BCH_ERR_transaction_restart_nested;
+ ret = bch_err_throw(c, transaction_restart_nested);
}
}
fsck_err:
@@ -1823,20 +1818,20 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
inode->inodes.data && i >= inode->inodes.data;
--i) {
- if (i->snapshot > k.k->p.snapshot ||
- !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
+ if (i->inode.bi_snapshot > k.k->p.snapshot ||
+ !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot))
continue;
if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
!bkey_extent_is_reservation(k),
trans, extent_past_end_of_inode,
"extent type past end of inode %llu:%u, i_size %llu\n%s",
- i->inode.bi_inum, i->snapshot, i->inode.bi_size,
+ i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
struct btree_iter iter2;
bch2_trans_copy_iter(trans, &iter2, iter);
- bch2_btree_iter_set_snapshot(trans, &iter2, i->snapshot);
+ bch2_btree_iter_set_snapshot(trans, &iter2, i->inode.bi_snapshot);
ret = bch2_btree_iter_traverse(trans, &iter2) ?:
bch2_btree_delete_at(trans, &iter2,
BTREE_UPDATE_internal_snapshot_node);
@@ -1858,8 +1853,9 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
inode->inodes.data && i >= inode->inodes.data;
--i) {
- if (i->snapshot > k.k->p.snapshot ||
- !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
+ if (i->whiteout ||
+ i->inode.bi_snapshot > k.k->p.snapshot ||
+ !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot))
continue;
i->count += k.k->size;
@@ -1941,13 +1937,13 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_
if (i->inode.bi_nlink == i->count)
continue;
- count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
+ count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->inode.bi_snapshot);
if (count2 < 0)
return count2;
if (i->count != count2) {
bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
- w->last_pos.inode, i->snapshot, i->count, count2);
+ w->last_pos.inode, i->inode.bi_snapshot, i->count, count2);
i->count = count2;
if (i->inode.bi_nlink == i->count)
continue;
@@ -1956,7 +1952,7 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_
if (fsck_err_on(i->inode.bi_nlink != i->count,
trans, inode_dir_wrong_nlink,
"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
- w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
+ w->last_pos.inode, i->inode.bi_snapshot, i->inode.bi_nlink, i->count)) {
i->inode.bi_nlink = i->count;
ret = bch2_fsck_write_inode(trans, &i->inode);
if (ret)
@@ -2051,7 +2047,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
if (!new_parent_subvol) {
bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
- return -BCH_ERR_fsck_repair_unimplemented;
+ return bch_err_throw(c, fsck_repair_unimplemented);
}
struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
@@ -2068,7 +2064,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
0, subvolume);
ret = bkey_err(s.s_c);
if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
+ goto err;
if (ret) {
if (fsck_err(trans, dirent_to_missing_subvol,
@@ -2079,30 +2075,41 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
goto out;
}
- if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
- trans, subvol_fs_path_parent_wrong,
- "subvol with wrong fs_path_parent, should be be %u\n%s",
- parent_subvol,
- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- struct bkey_i_subvolume *n =
- bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
- ret = PTR_ERR_OR_ZERO(n);
+ if (le32_to_cpu(s.v->fs_path_parent) != parent_subvol) {
+ printbuf_reset(&buf);
+
+ prt_printf(&buf, "subvol with wrong fs_path_parent, should be be %u\n",
+ parent_subvol);
+
+ ret = bch2_inum_to_path(trans, (subvol_inum) { s.k->p.offset,
+ le64_to_cpu(s.v->inode) }, &buf);
if (ret)
goto err;
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, s.s_c);
+
+ if (fsck_err(trans, subvol_fs_path_parent_wrong, "%s", buf.buf)) {
+ struct bkey_i_subvolume *n =
+ bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
- n->v.fs_path_parent = cpu_to_le32(parent_subvol);
+ n->v.fs_path_parent = cpu_to_le32(parent_subvol);
+ }
}
u64 target_inum = le64_to_cpu(s.v->inode);
u32 target_snapshot = le32_to_cpu(s.v->snapshot);
- ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root);
+ ret = bch2_inode_find_by_inum_snapshot(trans, target_inum, target_snapshot,
+ &subvol_root, 0);
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
if (ret) {
bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
- ret = -BCH_ERR_fsck_repair_unimplemented;
+ ret = bch_err_throw(c, fsck_repair_unimplemented);
goto err;
}
@@ -2134,7 +2141,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
struct bch_hash_info *hash_info,
struct inode_walker *dir,
struct inode_walker *target,
- struct snapshots_seen *s)
+ struct snapshots_seen *s,
+ bool *need_second_pass)
{
struct bch_fs *c = trans->c;
struct inode_walker_entry *i;
@@ -2169,14 +2177,19 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
goto err;
- if (!i)
+ if (!i || i->whiteout)
goto out;
if (dir->first_this_inode)
*hash_info = bch2_hash_info_init(c, &i->inode);
dir->first_this_inode = false;
- ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k);
+#ifdef CONFIG_UNICODE
+ hash_info->cf_encoding = bch2_inode_casefold(c, &i->inode) ? c->cf_encoding : NULL;
+#endif
+
+ ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info,
+ iter, k, need_second_pass);
if (ret < 0)
goto err;
if (ret) {
@@ -2197,31 +2210,34 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k),
buf.buf))) {
- struct qstr name = bch2_dirent_get_name(d);
- u32 subvol = d.v->d_type == DT_SUBVOL
- ? d.v->d_parent_subvol
- : 0;
+ subvol_inum dir_inum = { .subvol = d.v->d_type == DT_SUBVOL
+ ? le32_to_cpu(d.v->d_parent_subvol)
+ : 0,
+ };
u64 target = d.v->d_type == DT_SUBVOL
- ? d.v->d_child_subvol
- : d.v->d_inum;
- u64 dir_offset;
+ ? le32_to_cpu(d.v->d_child_subvol)
+ : le64_to_cpu(d.v->d_inum);
+ struct qstr name = bch2_dirent_get_name(d);
- ret = bch2_hash_delete_at(trans,
+ struct bkey_i_dirent *new_d =
+ bch2_dirent_create_key(trans, hash_info, dir_inum,
+ d.v->d_type, &name, NULL, target);
+ ret = PTR_ERR_OR_ZERO(new_d);
+ if (ret)
+ goto out;
+
+ new_d->k.p.inode = d.k->p.inode;
+ new_d->k.p.snapshot = d.k->p.snapshot;
+
+ struct btree_iter dup_iter = {};
+ ret = bch2_hash_delete_at(trans,
bch2_dirent_hash_desc, hash_info, iter,
BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_dirent_create_snapshot(trans, subvol,
- d.k->p.inode, d.k->p.snapshot,
- hash_info,
- d.v->d_type,
- &name,
- target,
- &dir_offset,
- BTREE_ITER_with_updates|
- BTREE_UPDATE_internal_snapshot_node|
- STR_HASH_must_create) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
-
- /* might need another check_dirents pass */
+ bch2_str_hash_repair_key(trans, s,
+ &bch2_dirent_hash_desc, hash_info,
+ iter, bkey_i_to_s_c(&new_d->k_i),
+ &dup_iter, bkey_s_c_null,
+ need_second_pass);
goto out;
}
@@ -2289,7 +2305,6 @@ out:
err:
fsck_err:
printbuf_exit(&buf);
- bch_err_fn(c, ret);
return ret;
}
@@ -2303,16 +2318,31 @@ int bch2_check_dirents(struct bch_fs *c)
struct inode_walker target = inode_walker_init();
struct snapshots_seen s;
struct bch_hash_info hash_info;
+ bool need_second_pass = false, did_second_pass = false;
+ int ret;
snapshots_seen_init(&s);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_dirents,
+again:
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
POS(BCACHEFS_ROOT_INO, 0),
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s,
+ &need_second_pass)) ?:
check_subdir_count_notnested(trans, &dir));
+ if (!ret && need_second_pass && !did_second_pass) {
+ bch_info(c, "check_dirents requires second pass");
+ swap(did_second_pass, need_second_pass);
+ goto again;
+ }
+
+ if (!ret && need_second_pass) {
+ bch_err(c, "dirents not repairing");
+ ret = -EINVAL;
+ }
+
snapshots_seen_exit(&s);
inode_walker_exit(&dir);
inode_walker_exit(&target);
@@ -2326,16 +2356,14 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
struct inode_walker *inode)
{
struct bch_fs *c = trans->c;
- struct inode_walker_entry *i;
- int ret;
- ret = bch2_check_key_has_snapshot(trans, iter, k);
+ int ret = bch2_check_key_has_snapshot(trans, iter, k);
if (ret < 0)
return ret;
if (ret)
return 0;
- i = walk_inode(trans, inode, k);
+ struct inode_walker_entry *i = walk_inode(trans, inode, k);
ret = PTR_ERR_OR_ZERO(i);
if (ret)
return ret;
@@ -2344,16 +2372,16 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
return ret;
- if (!i)
+ if (!i || i->whiteout)
return 0;
if (inode->first_this_inode)
*hash_info = bch2_hash_info_init(c, &i->inode);
inode->first_this_inode = false;
- ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k);
- bch_err_fn(c, ret);
- return ret;
+ bool need_second_pass = false;
+ return bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info,
+ iter, k, &need_second_pass);
}
/*
@@ -2413,7 +2441,8 @@ static int check_root_trans(struct btree_trans *trans)
goto err;
}
- ret = lookup_inode(trans, BCACHEFS_ROOT_INO, snapshot, &root_inode);
+ ret = bch2_inode_find_by_inum_snapshot(trans, BCACHEFS_ROOT_INO, snapshot,
+ &root_inode, 0);
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
@@ -2445,8 +2474,6 @@ int bch2_check_root(struct bch_fs *c)
return ret;
}
-typedef DARRAY(u32) darray_u32;
-
static bool darray_u32_has(darray_u32 *d, u32 v)
{
darray_for_each(*d, i)
@@ -2483,7 +2510,14 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
u32 parent = le32_to_cpu(s.v->fs_path_parent);
if (darray_u32_has(&subvol_path, parent)) {
- if (fsck_err(trans, subvol_loop, "subvolume loop"))
+ printbuf_reset(&buf);
+ prt_printf(&buf, "subvolume loop:\n");
+
+ darray_for_each_reverse(subvol_path, i)
+ prt_printf(&buf, "%u ", *i);
+ prt_printf(&buf, "%u", parent);
+
+ if (fsck_err(trans, subvol_loop, "%s", buf.buf))
ret = reattach_subvol(trans, s);
break;
}
@@ -2499,7 +2533,8 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
trans, subvol_unreachable,
"unreachable subvolume %s",
- (bch2_bkey_val_to_text(&buf, c, s.s_c),
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, s.s_c),
buf.buf))) {
ret = reattach_subvol(trans, s);
break;
@@ -2655,14 +2690,13 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
redo_bi_depth = true;
if (path_is_dup(&path, inode.bi_inum, snapshot)) {
- /* XXX print path */
- bch_err(c, "directory structure loop");
-
- darray_for_each(path, i)
- pr_err("%llu:%u", i->inum, i->snapshot);
- pr_err("%llu:%u", inode.bi_inum, snapshot);
+ printbuf_reset(&buf);
+ prt_printf(&buf, "directory structure loop:\n");
+ darray_for_each_reverse(path, i)
+ prt_printf(&buf, "%llu:%u ", i->inum, i->snapshot);
+ prt_printf(&buf, "%llu:%u", inode.bi_inum, snapshot);
- if (fsck_err(trans, dir_loop, "directory structure loop")) {
+ if (fsck_err(trans, dir_loop, "%s", buf.buf)) {
ret = remove_backpointer(trans, &inode);
bch_err_msg(c, ret, "removing dirent");
if (ret)
@@ -2736,7 +2770,7 @@ static int add_nlink(struct bch_fs *c, struct nlink_table *t,
if (!d) {
bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
new_size);
- return -BCH_ERR_ENOMEM_fsck_add_nlink;
+ return bch_err_throw(c, ENOMEM_fsck_add_nlink);
}
if (t->d)
@@ -3061,7 +3095,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
{
struct bch_ioctl_fsck_offline arg;
struct fsck_thread *thr = NULL;
- darray_str(devs) = {};
+ darray_const_str devs = {};
long ret = 0;
if (copy_from_user(&arg, user_arg, sizeof(arg)))
@@ -3119,7 +3153,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);
- thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);
+ thr->c = bch2_fs_open(&devs, &thr->opts);
if (!IS_ERR(thr->c) &&
thr->c->opts.errors == BCH_ON_ERROR_panic)
@@ -3156,19 +3190,18 @@ static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
c->opts.fix_errors = FSCK_FIX_ask;
c->opts.fsck = true;
- set_bit(BCH_FS_fsck_running, &c->flags);
+ set_bit(BCH_FS_in_fsck, &c->flags);
- c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
- int ret = bch2_run_online_recovery_passes(c);
+ int ret = bch2_run_online_recovery_passes(c, ~0ULL);
- clear_bit(BCH_FS_fsck_running, &c->flags);
+ clear_bit(BCH_FS_in_fsck, &c->flags);
bch_err_fn(c, ret);
c->stdio = NULL;
c->stdio_filter = NULL;
c->opts.fix_errors = old_fix_errors;
- up(&c->online_fsck_mutex);
+ up(&c->recovery.run_lock);
bch2_ro_ref_put(c);
return ret;
}
@@ -3192,7 +3225,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
if (!bch2_ro_ref_tryget(c))
return -EROFS;
- if (down_trylock(&c->online_fsck_mutex)) {
+ if (down_trylock(&c->recovery.run_lock)) {
bch2_ro_ref_put(c);
return -EAGAIN;
}
@@ -3224,7 +3257,7 @@ err:
bch_err_fn(c, ret);
if (thr)
bch2_fsck_thread_exit(&thr->thr);
- up(&c->online_fsck_mutex);
+ up(&c->recovery.run_lock);
bch2_ro_ref_put(c);
}
return ret;
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
index 574948278cd4..e5fe7cf7b251 100644
--- a/fs/bcachefs/fsck.h
+++ b/fs/bcachefs/fsck.h
@@ -4,6 +4,12 @@
#include "str_hash.h"
+/* recoverds snapshot IDs of overwrites at @pos */
+struct snapshots_seen {
+ struct bpos pos;
+ snapshot_id_list ids;
+};
+
int bch2_fsck_update_backpointers(struct btree_trans *,
struct snapshots_seen *,
const struct bch_hash_desc,
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 490b85841de9..53e5dc1f6ac1 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -38,6 +38,7 @@ static const char * const bch2_inode_flag_strs[] = {
#undef x
static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos);
+static int may_delete_deleted_inum(struct btree_trans *, subvol_inum);
static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
@@ -241,6 +242,7 @@ static int bch2_inode_unpack_v3(struct bkey_s_c k,
u64 v[2];
unpacked->bi_inum = inode.k->p.offset;
+ unpacked->bi_snapshot = inode.k->p.snapshot;
unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
unpacked->bi_hash_seed = inode.v->bi_hash_seed;
unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
@@ -285,13 +287,12 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
{
memset(unpacked, 0, sizeof(*unpacked));
- unpacked->bi_snapshot = k.k->p.snapshot;
-
switch (k.k->type) {
case KEY_TYPE_inode: {
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
unpacked->bi_inum = inode.k->p.offset;
+ unpacked->bi_snapshot = inode.k->p.snapshot;
unpacked->bi_journal_seq= 0;
unpacked->bi_hash_seed = inode.v->bi_hash_seed;
unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
@@ -310,6 +311,7 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
unpacked->bi_inum = inode.k->p.offset;
+ unpacked->bi_snapshot = inode.k->p.snapshot;
unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
unpacked->bi_hash_seed = inode.v->bi_hash_seed;
unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
@@ -327,8 +329,6 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
int bch2_inode_unpack(struct bkey_s_c k,
struct bch_inode_unpacked *unpacked)
{
- unpacked->bi_snapshot = k.k->p.snapshot;
-
return likely(k.k->type == KEY_TYPE_inode_v3)
? bch2_inode_unpack_v3(k, unpacked)
: bch2_inode_unpack_slowpath(k, unpacked);
@@ -368,6 +368,82 @@ err:
return ret;
}
+int bch2_inode_find_by_inum_snapshot(struct btree_trans *trans,
+ u64 inode_nr, u32 snapshot,
+ struct bch_inode_unpacked *inode,
+ unsigned flags)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, inode_nr, snapshot), flags);
+ int ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ ret = bkey_is_inode(k.k)
+ ? bch2_inode_unpack(k, inode)
+ : -BCH_ERR_ENOENT_inode;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
+ subvol_inum inum,
+ struct bch_inode_unpacked *inode)
+{
+ struct btree_iter iter;
+ int ret;
+
+ ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
+ if (!ret)
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
+ subvol_inum inum,
+ struct bch_inode_unpacked *inode)
+{
+ struct btree_iter iter;
+ int ret;
+
+ ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
+ if (!ret)
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
+ struct bch_inode_unpacked *inode)
+{
+ return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode));
+}
+
+int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum,
+ struct bch_inode_unpacked *root)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
+ SPOS(0, inum, U32_MAX),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->p.offset != inum)
+ break;
+ if (bkey_is_inode(k.k)) {
+ ret = bch2_inode_unpack(k, root);
+ goto out;
+ }
+ }
+ /* We're only called when we know we have an inode for @inum */
+ BUG_ON(!ret);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
int bch2_inode_write_flags(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode,
@@ -833,7 +909,8 @@ void bch2_inode_init_early(struct bch_fs *c,
get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed));
}
-void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
+void bch2_inode_init_late(struct bch_fs *c,
+ struct bch_inode_unpacked *inode_u, u64 now,
uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
struct bch_inode_unpacked *parent)
{
@@ -857,6 +934,12 @@ void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
BCH_INODE_OPTS()
#undef x
}
+
+ if (!S_ISDIR(mode))
+ inode_u->bi_casefold = 0;
+
+ if (bch2_inode_casefold(c, inode_u))
+ inode_u->bi_flags |= BCH_INODE_has_case_insensitive;
}
void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
@@ -864,7 +947,7 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
struct bch_inode_unpacked *parent)
{
bch2_inode_init_early(c, inode_u);
- bch2_inode_init_late(inode_u, bch2_current_time(c),
+ bch2_inode_init_late(c, inode_u, bch2_current_time(c),
uid, gid, mode, rdev, parent);
}
@@ -959,7 +1042,7 @@ again:
goto found_slot;
if (!ret && start == min)
- ret = -BCH_ERR_ENOSPC_inode_create;
+ ret = bch_err_throw(trans->c, ENOSPC_inode_create);
if (ret) {
bch2_trans_iter_exit(trans, iter);
@@ -1048,19 +1131,23 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
u32 snapshot;
int ret;
+ ret = lockrestart_do(trans, may_delete_deleted_inum(trans, inum));
+ if (ret)
+ goto err2;
+
/*
* If this was a directory, there shouldn't be any real dirents left -
* but there could be whiteouts (from hash collisions) that we should
* delete:
*
- * XXX: the dirent could ideally would delete whiteouts when they're no
+ * XXX: the dirent code ideally would delete whiteouts when they're no
* longer needed
*/
ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
if (ret)
- goto err;
+ goto err2;
retry:
bch2_trans_begin(trans);
@@ -1079,7 +1166,7 @@ retry:
bch2_fs_inconsistent(c,
"inode %llu:%u not found when deleting",
inum.inum, snapshot);
- ret = -BCH_ERR_ENOENT_inode;
+ ret = bch_err_throw(c, ENOENT_inode);
goto err;
}
@@ -1100,38 +1187,6 @@ err2:
return ret;
}
-int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
- subvol_inum inum,
- struct bch_inode_unpacked *inode)
-{
- struct btree_iter iter;
- int ret;
-
- ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
- if (!ret)
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
- subvol_inum inum,
- struct bch_inode_unpacked *inode)
-{
- struct btree_iter iter;
- int ret;
-
- ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
- if (!ret)
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
- struct bch_inode_unpacked *inode)
-{
- return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode));
-}
-
int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
{
if (bi->bi_flags & BCH_INODE_unlinked)
@@ -1233,7 +1288,7 @@ int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum,
bi->bi_casefold = v + 1;
bi->bi_fields_set |= BIT(Inode_opt_casefold);
- return 0;
+ return bch2_maybe_propagate_has_case_insensitive(trans, inum, bi);
#else
bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE");
return -EOPNOTSUPP;
@@ -1278,7 +1333,7 @@ retry:
bch2_fs_inconsistent(c,
"inode %llu:%u not found when deleting",
inum, snapshot);
- ret = -BCH_ERR_ENOENT_inode;
+ ret = bch_err_throw(c, ENOENT_inode);
goto err;
}
@@ -1342,10 +1397,8 @@ int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot));
}
-static int may_delete_deleted_inode(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos pos,
- bool *need_another_pass)
+static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos,
+ bool from_deleted_inodes)
{
struct bch_fs *c = trans->c;
struct btree_iter inode_iter;
@@ -1359,12 +1412,14 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
if (ret)
return ret;
- ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
- if (fsck_err_on(!bkey_is_inode(k.k),
+ ret = bkey_is_inode(k.k) ? 0 : bch_err_throw(c, ENOENT_inode);
+ if (fsck_err_on(from_deleted_inodes && ret,
trans, deleted_inode_missing,
"nonexistent inode %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
+ if (ret)
+ goto out;
ret = bch2_inode_unpack(k, &inode);
if (ret)
@@ -1372,7 +1427,8 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
if (S_ISDIR(inode.bi_mode)) {
ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot);
- if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY),
+ if (fsck_err_on(from_deleted_inodes &&
+ bch2_err_matches(ret, ENOTEMPTY),
trans, deleted_inode_is_dir,
"non empty directory %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
@@ -1381,17 +1437,25 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
goto out;
}
- if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked),
+ ret = inode.bi_flags & BCH_INODE_unlinked ? 0 : bch_err_throw(c, inode_not_unlinked);
+ if (fsck_err_on(from_deleted_inodes && ret,
trans, deleted_inode_not_unlinked,
"non-deleted inode %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
+ if (ret)
+ goto out;
- if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot,
+ ret = !(inode.bi_flags & BCH_INODE_has_child_snapshot)
+ ? 0 : bch_err_throw(c, inode_has_child_snapshot);
+
+ if (fsck_err_on(from_deleted_inodes && ret,
trans, deleted_inode_has_child_snapshots,
"inode with child snapshots %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
+ if (ret)
+ goto out;
ret = bch2_inode_has_child_snapshots(trans, k.k->p);
if (ret < 0)
@@ -1408,19 +1472,28 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
if (ret)
goto out;
}
+
+ if (!from_deleted_inodes) {
+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ bch_err_throw(c, inode_has_child_snapshot);
+ goto out;
+ }
+
goto delete;
}
- if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
- !fsck_err(trans, deleted_inode_but_clean,
- "filesystem marked as clean but have deleted inode %llu:%u",
- pos.offset, pos.snapshot)) {
- ret = 0;
- goto out;
- }
+ if (from_deleted_inodes) {
+ if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
+ !fsck_err(trans, deleted_inode_but_clean,
+ "filesystem marked as clean but have deleted inode %llu:%u",
+ pos.offset, pos.snapshot)) {
+ ret = 0;
+ goto out;
+ }
- ret = 1;
+ ret = 1;
+ }
out:
fsck_err:
bch2_trans_iter_exit(trans, &inode_iter);
@@ -1431,12 +1504,19 @@ delete:
goto out;
}
+static int may_delete_deleted_inum(struct btree_trans *trans, subvol_inum inum)
+{
+ u32 snapshot;
+
+ return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
+ may_delete_deleted_inode(trans, SPOS(0, inum.inum, snapshot), false);
+}
+
int bch2_delete_dead_inodes(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- bool need_another_pass;
int ret;
-again:
+
/*
* if we ran check_inodes() unlinked inodes will have already been
* cleaned up but the write buffer will be out of sync; therefore we
@@ -1446,8 +1526,6 @@ again:
if (ret)
goto err;
- need_another_pass = false;
-
/*
* Weird transaction restart handling here because on successful delete,
* bch2_inode_rm_snapshot() will return a nested transaction restart,
@@ -1457,7 +1535,7 @@ again:
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass);
+ ret = may_delete_deleted_inode(trans, k.k->p, true);
if (ret > 0) {
bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u",
k.k->p.offset, k.k->p.snapshot);
@@ -1478,10 +1556,8 @@ again:
ret;
}));
-
- if (!ret && need_another_pass)
- goto again;
err:
bch2_trans_put(trans);
+ bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index 5cfba9e98966..82cec2836cbd 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -134,10 +134,21 @@ static inline int bch2_inode_peek(struct btree_trans *trans,
subvol_inum inum, unsigned flags)
{
return __bch2_inode_peek(trans, iter, inode, inum, flags, true);
- int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
- return ret;
}
+int bch2_inode_find_by_inum_snapshot(struct btree_trans *, u64, u32,
+ struct bch_inode_unpacked *, unsigned);
+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *,
+ subvol_inum,
+ struct bch_inode_unpacked *);
+int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *);
+int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
+ struct bch_inode_unpacked *);
+
+int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum,
+ struct bch_inode_unpacked *root);
+
int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags);
@@ -153,7 +164,7 @@ int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *);
void bch2_inode_init_early(struct bch_fs *,
struct bch_inode_unpacked *);
-void bch2_inode_init_late(struct bch_inode_unpacked *, u64,
+void bch2_inode_init_late(struct bch_fs *, struct bch_inode_unpacked *, u64,
uid_t, gid_t, umode_t, dev_t,
struct bch_inode_unpacked *);
void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
@@ -165,14 +176,6 @@ int bch2_inode_create(struct btree_trans *, struct btree_iter *,
int bch2_inode_rm(struct bch_fs *, subvol_inum);
-int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *,
- subvol_inum,
- struct bch_inode_unpacked *);
-int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *);
-int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
- struct bch_inode_unpacked *);
-
#define inode_opt_get(_c, _inode, _name) \
((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name)
@@ -245,7 +248,7 @@ static inline unsigned bkey_inode_mode(struct bkey_s_c k)
static inline bool bch2_inode_casefold(struct bch_fs *c, const struct bch_inode_unpacked *bi)
{
- /* inode apts are stored with a +1 bias: 0 means "unset, use fs opt" */
+ /* inode opts are stored with a +1 bias: 0 means "unset, use fs opt" */
return bi->bi_casefold
? bi->bi_casefold - 1
: c->opts.casefold;
@@ -280,15 +283,6 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
int bch2_inode_nlink_inc(struct bch_inode_unpacked *);
void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
-static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *inode)
-{
- bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset;
-
- return S_ISDIR(inode->bi_mode) ||
- inode->bi_subvol ||
- (!inode->bi_nlink && inode_has_bp);
-}
-
struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
struct bch_inode_unpacked *);
@@ -306,6 +300,14 @@ bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode
return io_opts_to_rebalance_opts(c, &io_opts);
}
+#define BCACHEFS_ROOT_SUBVOL_INUM \
+ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
+
+static inline bool subvol_inum_eq(subvol_inum a, subvol_inum b)
+{
+ return a.subvol == b.subvol && a.inum == b.inum;
+}
+
int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
int bch2_delete_dead_inodes(struct bch_fs *);
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
index 87e193e8ed25..1f00938b1bdc 100644
--- a/fs/bcachefs/inode_format.h
+++ b/fs/bcachefs/inode_format.h
@@ -129,6 +129,10 @@ enum inode_opt_id {
Inode_opt_nr,
};
+/*
+ * BCH_INODE_has_case_insensitive is set if any descendent is case insensitive -
+ * for overlayfs
+ */
#define BCH_INODE_FLAGS() \
x(sync, 0) \
x(immutable, 1) \
@@ -139,7 +143,8 @@ enum inode_opt_id {
x(i_sectors_dirty, 6) \
x(unlinked, 7) \
x(backptr_untrusted, 8) \
- x(has_child_snapshot, 9)
+ x(has_child_snapshot, 9) \
+ x(has_case_insensitive, 10)
/* bits 20+ reserved for packed fields below: */
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index cc07729a4b62..bf72b1d2e2cb 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -91,7 +91,7 @@ int bch2_extent_fallocate(struct btree_trans *trans,
opts.data_replicas,
BCH_WATERMARK_normal, 0, &cl, &wp);
if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
- ret = -BCH_ERR_transaction_restart_nested;
+ ret = bch_err_throw(c, transaction_restart_nested);
if (ret)
goto err;
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index def4a26a3b45..a77779afad01 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -9,6 +9,7 @@
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "async_objs.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
@@ -17,6 +18,7 @@
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "io_read.h"
#include "io_misc.h"
@@ -25,6 +27,7 @@
#include "subvolume.h"
#include "trace.h"
+#include <linux/moduleparam.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
@@ -34,6 +37,12 @@ module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
MODULE_PARM_DESC(read_corrupt_ratio, "");
#endif
+static bool bch2_poison_extents_on_checksum_error;
+module_param_named(poison_extents_on_checksum_error,
+ bch2_poison_extents_on_checksum_error, bool, 0644);
+MODULE_PARM_DESC(poison_extents_on_checksum_error,
+ "Extents with checksum errors are marked as poisoned - unsafe without read fua support");
+
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
static bool bch2_target_congested(struct bch_fs *c, u16 target)
@@ -47,7 +56,7 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
if (!target)
return false;
- rcu_read_lock();
+ guard(rcu)();
devs = bch2_target_to_mask(c, target) ?:
&c->rw_devs[BCH_DATA_user];
@@ -64,7 +73,6 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
total += max(congested, 0LL);
nr++;
}
- rcu_read_unlock();
return get_random_u32_below(nr * CONGESTED_MAX) < total;
}
@@ -80,18 +88,6 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
/* Cache promotion on read */
-struct promote_op {
- struct rcu_head rcu;
- u64 start_time;
-
- struct rhash_head hash;
- struct bpos pos;
-
- struct work_struct work;
- struct data_update write;
- struct bio_vec bi_inline_vecs[]; /* must be last */
-};
-
static const struct rhashtable_params bch_promote_params = {
.head_offset = offsetof(struct promote_op, hash),
.key_offset = offsetof(struct promote_op, pos),
@@ -141,21 +137,21 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
BUG_ON(!opts.promote_target);
if (!(flags & BCH_READ_may_promote))
- return -BCH_ERR_nopromote_may_not;
+ return bch_err_throw(c, nopromote_may_not);
if (bch2_bkey_has_target(c, k, opts.promote_target))
- return -BCH_ERR_nopromote_already_promoted;
+ return bch_err_throw(c, nopromote_already_promoted);
if (bkey_extent_is_unwritten(k))
- return -BCH_ERR_nopromote_unwritten;
+ return bch_err_throw(c, nopromote_unwritten);
if (bch2_target_congested(c, opts.promote_target))
- return -BCH_ERR_nopromote_congested;
+ return bch_err_throw(c, nopromote_congested);
}
if (rhashtable_lookup_fast(&c->promote_table, &pos,
bch_promote_params))
- return -BCH_ERR_nopromote_in_flight;
+ return bch_err_throw(c, nopromote_in_flight);
return 0;
}
@@ -169,9 +165,11 @@ static noinline void promote_free(struct bch_read_bio *rbio)
bch_promote_params);
BUG_ON(ret);
+ async_object_list_del(c, promote, op->list_idx);
+
bch2_data_update_exit(&op->write);
- bch2_write_ref_put(c, BCH_WRITE_REF_promote);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
kfree_rcu(op, rcu);
}
@@ -236,12 +234,12 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
return NULL;
}
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_promote))
return ERR_PTR(-BCH_ERR_nopromote_no_writes);
struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
if (!op) {
- ret = -BCH_ERR_nopromote_enomem;
+ ret = bch_err_throw(c, nopromote_enomem);
goto err_put;
}
@@ -250,10 +248,14 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
bch_promote_params)) {
- ret = -BCH_ERR_nopromote_in_flight;
+ ret = bch_err_throw(c, nopromote_in_flight);
goto err;
}
+ ret = async_object_list_add(c, promote, op, &op->list_idx);
+ if (ret < 0)
+ goto err_remove_hash;
+
ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
writepoint_hashed((unsigned long) current),
&orig->opts,
@@ -265,7 +267,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
* -BCH_ERR_ENOSPC_disk_reservation:
*/
if (ret)
- goto err_remove_hash;
+ goto err_remove_list;
rbio_init_fragment(&op->write.rbio.bio, orig);
op->write.rbio.bounce = true;
@@ -273,6 +275,8 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
op->write.op.end_io = promote_done;
return &op->write.rbio;
+err_remove_list:
+ async_object_list_del(c, promote, op->list_idx);
err_remove_hash:
BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
bch_promote_params));
@@ -281,7 +285,7 @@ err:
/* We may have added to the rhashtable and thus need rcu freeing: */
kfree_rcu(op, rcu);
err_put:
- bch2_write_ref_put(c, BCH_WRITE_REF_promote);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
return ERR_PTR(ret);
}
@@ -296,6 +300,13 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
bool *read_full,
struct bch_io_failures *failed)
{
+ /*
+ * We're in the retry path, but we don't know what to repair yet, and we
+ * don't want to do a promote here:
+ */
+ if (failed && !failed->nr)
+ return NULL;
+
struct bch_fs *c = trans->c;
/*
* if failed != NULL we're not actually doing a promote, we're
@@ -338,6 +349,18 @@ nopromote:
return NULL;
}
+void bch2_promote_op_to_text(struct printbuf *out, struct promote_op *op)
+{
+ if (!op->write.read_done) {
+ prt_printf(out, "parent read: %px\n", op->write.rbio.parent);
+ printbuf_indent_add(out, 2);
+ bch2_read_bio_to_text(out, op->write.rbio.parent);
+ printbuf_indent_sub(out, 2);
+ }
+
+ bch2_data_update_to_text(out, &op->write);
+}
+
/* Read */
static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
@@ -394,7 +417,7 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
if (rbio->have_ioref) {
struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
- percpu_ref_put(&ca->io_ref[READ]);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read);
}
if (rbio->split) {
@@ -406,6 +429,8 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
else
promote_free(rbio);
} else {
+ async_object_list_del(rbio->c, rbio, rbio->list_idx);
+
if (rbio->bounce)
bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
@@ -430,6 +455,74 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
bio_endio(&rbio->bio);
}
+static void get_rbio_extent(struct btree_trans *trans,
+ struct bch_read_bio *rbio,
+ struct bkey_buf *sk)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = lockrestart_do(trans,
+ bkey_err(k = bch2_bkey_get_iter(trans, &iter,
+ rbio->data_btree, rbio->data_pos, 0)));
+ if (ret)
+ return;
+
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ bkey_for_each_ptr(ptrs, ptr)
+ if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) {
+ bch2_bkey_buf_reassemble(sk, trans->c, k);
+ break;
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+}
+
+static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
+ enum btree_id btree, struct bkey_s_c read_k)
+{
+ if (!bch2_poison_extents_on_checksum_error)
+ return 0;
+
+ struct bch_fs *c = trans->c;
+
+ struct data_update *u = rbio_data_update(rbio);
+ if (u)
+ read_k = bkey_i_to_s_c(u->k.k);
+
+ u64 flags = bch2_bkey_extent_flags(read_k);
+ if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
+ return 0;
+
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(read_k.k),
+ BTREE_ITER_intent);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (!bkey_and_val_eq(k, read_k))
+ goto out;
+
+ struct bkey_i *new = bch2_trans_kmalloc(trans,
+ bkey_bytes(k.k) + sizeof(struct bch_extent_flags));
+ ret = PTR_ERR_OR_ZERO(new) ?:
+ (bkey_reassemble(new, k), 0) ?:
+ bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?:
+ bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
+
+ /*
+ * Propagate key change back to data update path, in particular so it
+ * knows the extent has been poisoned and it's safe to change the
+ * checksum
+ */
+ if (u && !ret)
+ bch2_bkey_buf_copy(&u->k, c, new);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
struct bch_read_bio *rbio,
struct bvec_iter bvec_iter,
@@ -451,7 +544,7 @@ retry:
if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
/* extent we wanted to read no longer exists: */
- rbio->ret = -BCH_ERR_data_read_key_overwritten;
+ rbio->ret = bch_err_throw(trans->c, data_read_key_overwritten);
goto err;
}
@@ -463,7 +556,8 @@ retry:
err:
bch2_trans_iter_exit(trans, &iter);
- if (bch2_err_matches(ret, BCH_ERR_data_read_retry))
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret, BCH_ERR_data_read_retry))
goto retry;
if (ret) {
@@ -487,15 +581,21 @@ static void bch2_rbio_retry(struct work_struct *work)
.inum = rbio->read_pos.inode,
};
struct bch_io_failures failed = { .nr = 0 };
- int orig_error = rbio->ret;
struct btree_trans *trans = bch2_trans_get(c);
+ struct bkey_buf sk;
+ bch2_bkey_buf_init(&sk);
+ bkey_init(&sk.k->k);
+
trace_io_read_retry(&rbio->bio);
this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
bvec_iter_sectors(rbio->bvec_iter));
- if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
+ get_rbio_extent(trans, rbio, &sk);
+
+ if (!bkey_deleted(&sk.k->k) &&
+ bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
bch2_mark_io_failure(&failed, &rbio->pick,
rbio->ret == -BCH_ERR_data_read_retry_csum_err);
@@ -516,15 +616,16 @@ static void bch2_rbio_retry(struct work_struct *work)
int ret = rbio->data_update
? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
- : __bch2_read(trans, rbio, iter, inum, &failed, flags);
+ : __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags);
if (ret) {
rbio->ret = ret;
rbio->bio.bi_status = BLK_STS_IOERR;
- } else if (orig_error != -BCH_ERR_data_read_retry_csum_err_maybe_userspace &&
- orig_error != -BCH_ERR_data_read_ptr_stale_race &&
- !failed.nr) {
+ }
+
+ if (failed.nr || ret) {
struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
lockrestart_do(trans,
bch2_inum_offset_err_msg_trans(trans, &buf,
@@ -532,13 +633,27 @@ static void bch2_rbio_retry(struct work_struct *work)
read_pos.offset << 9));
if (rbio->data_update)
prt_str(&buf, "(internal move) ");
- prt_str(&buf, "successful retry");
- bch_err_ratelimited(c, "%s", buf.buf);
+ prt_str(&buf, "data read error, ");
+ if (!ret)
+ prt_str(&buf, "successful retry");
+ else
+ prt_str(&buf, bch2_err_str(ret));
+ prt_newline(&buf);
+
+ if (!bkey_deleted(&sk.k->k)) {
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k));
+ prt_newline(&buf);
+ }
+
+ bch2_io_failures_to_text(&buf, c, &failed);
+
+ bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
}
bch2_rbio_done(rbio);
+ bch2_bkey_buf_exit(&sk, c);
bch2_trans_put(trans);
}
@@ -568,27 +683,6 @@ static void bch2_rbio_error(struct bch_read_bio *rbio,
}
}
-static void bch2_read_io_err(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- struct bio *bio = &rbio->bio;
- struct bch_fs *c = rbio->c;
- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
- struct printbuf buf = PRINTBUF;
-
- bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
- prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));
-
- if (ca)
- bch_err_ratelimited(ca, "%s", buf.buf);
- else
- bch_err_ratelimited(c, "%s", buf.buf);
-
- printbuf_exit(&buf);
- bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
-}
-
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
struct bch_read_bio *rbio)
{
@@ -652,31 +746,6 @@ static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
__bch2_rbio_narrow_crcs(trans, rbio));
}
-static void bch2_read_csum_err(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct bio *src = &rbio->bio;
- struct bch_extent_crc_unpacked crc = rbio->pick.crc;
- struct nonce nonce = extent_nonce(rbio->version, crc);
- struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
- struct printbuf buf = PRINTBUF;
-
- bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
- prt_str(&buf, "data ");
- bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
-
- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
- if (ca)
- bch_err_ratelimited(ca, "%s", buf.buf);
- else
- bch_err_ratelimited(c, "%s", buf.buf);
-
- bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
- printbuf_exit(&buf);
-}
-
static void bch2_read_decompress_err(struct work_struct *work)
{
struct bch_read_bio *rbio =
@@ -837,7 +906,7 @@ out:
memalloc_nofs_restore(nofs_flags);
return;
csum_err:
- bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
goto out;
decompression_err:
bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
@@ -863,7 +932,7 @@ static void bch2_read_endio(struct bio *bio)
rbio->bio.bi_end_io = rbio->end_io;
if (unlikely(bio->bi_status)) {
- bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
return;
}
@@ -963,6 +1032,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
bvec_iter_sectors(iter));
goto out_read_done;
}
+
+ if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) &&
+ !orig->data_update)
+ return bch_err_throw(c, extent_poisoned);
retry_pick:
ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);
@@ -971,6 +1044,16 @@ retry_pick:
goto hole;
if (unlikely(ret < 0)) {
+ if (ret == -BCH_ERR_data_read_csum_err) {
+ int ret2 = maybe_poison_extent(trans, orig, data_btree, k);
+ if (ret2) {
+ ret = ret2;
+ goto err;
+ }
+
+ trace_and_count(c, io_read_fail_and_poison, &orig->bio);
+ }
+
struct printbuf buf = PRINTBUF;
bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
prt_printf(&buf, "%s\n ", bch2_err_str(ret));
@@ -990,11 +1073,12 @@ retry_pick:
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
- ret = -BCH_ERR_data_read_no_encryption_key;
+ ret = bch_err_throw(c, data_read_no_encryption_key);
goto err;
}
- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
+ BCH_DEV_READ_REF_io_read);
/*
* Stale dirty pointers are treated as IO errors, but @failed isn't
@@ -1008,7 +1092,7 @@ retry_pick:
unlikely(dev_ptr_stale(ca, &pick.ptr))) {
read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
bch2_mark_io_failure(failed, &pick, false);
- percpu_ref_put(&ca->io_ref[READ]);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read);
goto retry_pick;
}
@@ -1041,8 +1125,9 @@ retry_pick:
*/
if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
if (ca)
- percpu_ref_put(&ca->io_ref[READ]);
- rbio->ret = -BCH_ERR_data_read_buffer_too_small;
+ enumerated_ref_put(&ca->io_ref[READ],
+ BCH_DEV_READ_REF_io_read);
+ rbio->ret = bch_err_throw(c, data_read_buffer_too_small);
goto out_read_done;
}
@@ -1138,6 +1223,8 @@ retry_pick:
rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
rbio->bio.bi_end_io = bch2_read_endio;
+ async_object_list_add(c, rbio, rbio, &rbio->list_idx);
+
if (rbio->bounce)
trace_and_count(c, io_read_bounce, &rbio->bio);
@@ -1171,14 +1258,6 @@ retry_pick:
if (likely(!rbio->pick.do_ec_reconstruct)) {
if (unlikely(!rbio->have_ioref)) {
- struct printbuf buf = PRINTBUF;
- bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
- prt_printf(&buf, "no device to read from:\n ");
- bch2_bkey_val_to_text(&buf, c, k);
-
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
-
bch2_rbio_error(rbio,
-BCH_ERR_data_read_retry_device_offline,
BLK_STS_IOERR);
@@ -1253,7 +1332,7 @@ hole:
* have to signal that:
*/
if (u)
- orig->ret = -BCH_ERR_data_read_key_overwritten;
+ orig->ret = bch_err_throw(c, data_read_key_overwritten);
zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
@@ -1265,12 +1344,15 @@ out_read_done:
int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
struct bvec_iter bvec_iter, subvol_inum inum,
- struct bch_io_failures *failed, unsigned flags)
+ struct bch_io_failures *failed,
+ struct bkey_buf *prev_read,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_buf sk;
struct bkey_s_c k;
+ enum btree_id data_btree;
int ret;
EBUG_ON(rbio->data_update);
@@ -1281,7 +1363,7 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
BTREE_ITER_slots);
while (1) {
- enum btree_id data_btree = BTREE_ID_extents;
+ data_btree = BTREE_ID_extents;
bch2_trans_begin(trans);
@@ -1313,6 +1395,12 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
k = bkey_i_to_s_c(sk.k);
+ if (unlikely(flags & BCH_READ_in_retry)) {
+ if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k)))
+ failed->nr = 0;
+ bch2_bkey_buf_copy(prev_read, c, sk.k);
+ }
+
/*
* With indirect extents, the amount of data to read is the min
* of the original extent and the indirect extent:
@@ -1347,8 +1435,6 @@ err:
break;
}
- bch2_trans_iter_exit(trans, &iter);
-
if (unlikely(ret)) {
if (ret != -BCH_ERR_extent_poisoned) {
struct printbuf buf = PRINTBUF;
@@ -1367,30 +1453,74 @@ err:
bch2_rbio_done(rbio);
}
+ bch2_trans_iter_exit(trans, &iter);
bch2_bkey_buf_exit(&sk, c);
return ret;
}
+static const char * const bch2_read_bio_flags[] = {
+#define x(n) #n,
+ BCH_READ_FLAGS()
+#undef x
+ NULL
+};
+
+void bch2_read_bio_to_text(struct printbuf *out, struct bch_read_bio *rbio)
+{
+ u64 now = local_clock();
+ prt_printf(out, "start_time:\t%llu\n", rbio->start_time ? now - rbio->start_time : 0);
+ prt_printf(out, "submit_time:\t%llu\n", rbio->submit_time ? now - rbio->submit_time : 0);
+
+ if (!rbio->split)
+ prt_printf(out, "end_io:\t%ps\n", rbio->end_io);
+ else
+ prt_printf(out, "parent:\t%px\n", rbio->parent);
+
+ prt_printf(out, "bi_end_io:\t%ps\n", rbio->bio.bi_end_io);
+
+ prt_printf(out, "promote:\t%u\n", rbio->promote);
+ prt_printf(out, "bounce:\t%u\n", rbio->bounce);
+ prt_printf(out, "split:\t%u\n", rbio->split);
+ prt_printf(out, "have_ioref:\t%u\n", rbio->have_ioref);
+ prt_printf(out, "narrow_crcs:\t%u\n", rbio->narrow_crcs);
+ prt_printf(out, "context:\t%u\n", rbio->context);
+ prt_printf(out, "ret:\t%s\n", bch2_err_str(rbio->ret));
+
+ prt_printf(out, "flags:\t");
+ bch2_prt_bitflags(out, bch2_read_bio_flags, rbio->flags);
+ prt_newline(out);
+
+ bch2_bio_to_text(out, &rbio->bio);
+}
+
void bch2_fs_io_read_exit(struct bch_fs *c)
{
if (c->promote_table.tbl)
rhashtable_destroy(&c->promote_table);
bioset_exit(&c->bio_read_split);
bioset_exit(&c->bio_read);
+ mempool_exit(&c->bio_bounce_pages);
}
int bch2_fs_io_read_init(struct bch_fs *c)
{
+ if (mempool_init_page_pool(&c->bio_bounce_pages,
+ max_t(unsigned,
+ c->opts.btree_node_size,
+ c->opts.encoded_extent_max) /
+ PAGE_SIZE, 0))
+ return bch_err_throw(c, ENOMEM_bio_bounce_pages_init);
+
if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_bio_read_init;
+ return bch_err_throw(c, ENOMEM_bio_read_init);
if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_bio_read_split_init;
+ return bch_err_throw(c, ENOMEM_bio_read_split_init);
if (rhashtable_init(&c->promote_table, &bch_promote_params))
- return -BCH_ERR_ENOMEM_promote_table_init;
+ return bch_err_throw(c, ENOMEM_promote_table_init);
return 0;
}
diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h
index c78025d863e0..45c959018919 100644
--- a/fs/bcachefs/io_read.h
+++ b/fs/bcachefs/io_read.h
@@ -4,6 +4,7 @@
#include "bkey_buf.h"
#include "btree_iter.h"
+#include "extents_types.h"
#include "reflink.h"
struct bch_read_bio {
@@ -48,6 +49,9 @@ struct bch_read_bio {
u16 _state;
};
s16 ret;
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
+ unsigned list_idx;
+#endif
struct extent_ptr_decoded pick;
@@ -87,6 +91,8 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans,
return 0;
*data_btree = BTREE_ID_reflink;
+
+ struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter,
offset_into_extent,
@@ -98,10 +104,10 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans,
if (bkey_deleted(k.k)) {
bch2_trans_iter_exit(trans, &iter);
- return -BCH_ERR_missing_indirect_extent;
+ return bch_err_throw(c, missing_indirect_extent);
}
- bch2_bkey_buf_reassemble(extent, trans->c, k);
+ bch2_bkey_buf_reassemble(extent, c, k);
bch2_trans_iter_exit(trans, &iter);
return 0;
}
@@ -144,7 +150,8 @@ static inline void bch2_read_extent(struct btree_trans *trans,
}
int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter,
- subvol_inum, struct bch_io_failures *, unsigned flags);
+ subvol_inum,
+ struct bch_io_failures *, struct bkey_buf *, unsigned flags);
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
subvol_inum inum)
@@ -154,7 +161,7 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
rbio->subvol = inum.subvol;
bch2_trans_run(c,
- __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL,
+ __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, NULL,
BCH_READ_retry_if_stale|
BCH_READ_may_promote|
BCH_READ_user_mapped));
@@ -172,6 +179,9 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,
rbio->split = true;
rbio->parent = orig;
rbio->opts = orig->opts;
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
+ rbio->list_idx = 0;
+#endif
return rbio;
}
@@ -189,9 +199,16 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio,
rbio->ret = 0;
rbio->opts = opts;
rbio->bio.bi_end_io = end_io;
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
+ rbio->list_idx = 0;
+#endif
return rbio;
}
+struct promote_op;
+void bch2_promote_op_to_text(struct printbuf *, struct promote_op *);
+void bch2_read_bio_to_text(struct printbuf *, struct bch_read_bio *);
+
void bch2_fs_io_read_exit(struct bch_fs *);
int bch2_fs_io_read_init(struct bch_fs *);
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index c1237da079ed..88b1eec8eff3 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -6,6 +6,7 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
+#include "async_objs.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_update.h"
@@ -15,6 +16,7 @@
#include "compress.h"
#include "debug.h"
#include "ec.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "extent_update.h"
#include "inode.h"
@@ -263,11 +265,9 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
prt_printf(&buf, "inode %llu i_sectors underflow: %lli + %lli < 0",
extent_iter->pos.inode, bi_sectors, i_sectors_delta);
- bool repeat = false, print = false, suppress = false;
- bch2_count_fsck_err(c, inode_i_sectors_underflow, buf.buf,
- &repeat, &print, &suppress);
+ bool print = bch2_count_fsck_err(c, inode_i_sectors_underflow, &buf);
if (print)
- bch2_print_str(c, buf.buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
if (i_sectors_delta < 0)
@@ -280,6 +280,12 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
inode_update_flags = 0;
}
+ /*
+ * extents, dirents and xattrs updates require that an inode update also
+ * happens - to ensure that if a key exists in one of those btrees with
+ * a given snapshot ID an inode is also present - so we may have to skip
+ * the nojournal optimization:
+ */
if (inode->k.p.snapshot != iter.snapshot) {
inode->k.p.snapshot = iter.snapshot;
inode_update_flags = 0;
@@ -397,8 +403,7 @@ static int bch2_write_index_default(struct bch_write_op *op)
bkey_start_pos(&sk.k->k),
BTREE_ITER_slots|BTREE_ITER_intent);
- ret = bch2_bkey_set_needs_rebalance(c, &op->opts, sk.k) ?:
- bch2_extent_update(trans, inum, &iter, sk.k,
+ ret = bch2_extent_update(trans, inum, &iter, sk.k,
&op->res,
op->new_i_size, &op->i_sectors_delta,
op->flags & BCH_WRITE_check_enospc);
@@ -462,9 +467,17 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
struct bch_write_bio *n;
+ unsigned ref_rw = type == BCH_DATA_btree ? READ : WRITE;
+ unsigned ref_idx = type == BCH_DATA_btree
+ ? BCH_DEV_READ_REF_btree_node_write
+ : BCH_DEV_WRITE_REF_io_write;
BUG_ON(c->opts.nochanges);
+ const struct bch_extent_ptr *last = NULL;
+ bkey_for_each_ptr(ptrs, ptr)
+ last = ptr;
+
bkey_for_each_ptr(ptrs, ptr) {
/*
* XXX: btree writes should be using io_ref[WRITE], but we
@@ -473,9 +486,9 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
*/
struct bch_dev *ca = nocow
? bch2_dev_have_ref(c, ptr->dev)
- : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? READ : WRITE);
+ : bch2_dev_get_ioref(c, ptr->dev, ref_rw, ref_idx);
- if (to_entry(ptr + 1) < ptrs.end) {
+ if (ptr != last) {
n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set));
n->bio.bi_end_io = wbio->bio.bi_end_io;
@@ -533,17 +546,19 @@ static void bch2_write_done(struct closure *cl)
bch2_disk_reservation_put(c, &op->res);
if (!(op->flags & BCH_WRITE_move))
- bch2_write_ref_put(c, BCH_WRITE_REF_write);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_write);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
EBUG_ON(cl->parent);
closure_debug_destroy(cl);
+ async_object_list_del(c, write_op, op->list_idx);
if (op->end_io)
op->end_io(op);
}
static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
{
+ struct bch_fs *c = op->c;
struct keylist *keys = &op->insert_keys;
struct bkey_i *src, *dst = keys->keys, *n;
@@ -555,7 +570,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
test_bit(ptr->dev, op->failed.d));
if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
- return -BCH_ERR_data_write_io;
+ return bch_err_throw(c, data_write_io);
}
if (dst != src)
@@ -748,7 +763,8 @@ static void bch2_write_endio(struct bio *bio)
}
if (wbio->have_ioref)
- percpu_ref_put(&ca->io_ref[WRITE]);
+ enumerated_ref_put(&ca->io_ref[WRITE],
+ BCH_DEV_WRITE_REF_io_write);
if (wbio->bounce)
bch2_bio_free_pages_pool(c, bio);
@@ -784,6 +800,9 @@ static void init_append_extent(struct bch_write_op *op,
bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
op->flags & BCH_WRITE_cached);
+ if (!(op->flags & BCH_WRITE_move))
+ bch2_bkey_set_needs_rebalance(op->c, &op->opts, &e->k_i);
+
bch2_keylist_push(&op->insert_keys);
}
@@ -958,7 +977,7 @@ csum_err:
op->crc.csum_type < BCH_CSUM_NR
? __bch2_csum_types[op->crc.csum_type]
: "(unknown)");
- return -BCH_ERR_data_write_csum;
+ return bch_err_throw(c, data_write_csum);
}
static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
@@ -1190,16 +1209,13 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op,
e = bkey_s_c_to_extent(k);
- rcu_read_lock();
+ guard(rcu)();
extent_for_each_ptr_decode(e, p, entry) {
- if (crc_is_encoded(p.crc) || p.has_ec) {
- rcu_read_unlock();
+ if (crc_is_encoded(p.crc) || p.has_ec)
return false;
- }
replicas += bch2_extent_ptr_durability(c, &p);
}
- rcu_read_unlock();
return replicas >= op->opts.data_replicas;
}
@@ -1272,7 +1288,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
static void __bch2_nocow_write_done(struct bch_write_op *op)
{
if (unlikely(op->flags & BCH_WRITE_io_error)) {
- op->error = -BCH_ERR_data_write_io;
+ op->error = bch_err_throw(op->c, data_write_io);
} else if (unlikely(op->flags & BCH_WRITE_convert_unwritten))
bch2_nocow_write_convert_unwritten(op);
}
@@ -1345,7 +1361,8 @@ retry:
/* Get iorefs before dropping btree locks: */
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE,
+ BCH_DEV_WRITE_REF_io_write);
if (unlikely(!ca))
goto err_get_ioref;
@@ -1447,7 +1464,8 @@ err:
return;
err_get_ioref:
darray_for_each(buckets, i)
- percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE]);
+ enumerated_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE],
+ BCH_DEV_WRITE_REF_io_write);
/* Fall back to COW path: */
goto out;
@@ -1463,10 +1481,10 @@ err_bucket_stale:
"pointer to invalid bucket in nocow path on device %llu\n %s",
stale_at->b.inode,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = -BCH_ERR_data_write_invalid_ptr;
+ ret = bch_err_throw(c, data_write_invalid_ptr);
} else {
/* We can retry this: */
- ret = -BCH_ERR_transaction_restart;
+ ret = bch_err_throw(c, transaction_restart);
}
printbuf_exit(&buf);
@@ -1661,6 +1679,8 @@ CLOSURE_CALLBACK(bch2_write)
BUG_ON(!op->write_point.v);
BUG_ON(bkey_eq(op->pos, POS_MAX));
+ async_object_list_add(c, write_op, op, &op->list_idx);
+
if (op->flags & BCH_WRITE_only_specified_devs)
op->flags |= BCH_WRITE_alloc_nowait;
@@ -1671,18 +1691,18 @@ CLOSURE_CALLBACK(bch2_write)
if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) {
bch2_write_op_error(op, op->pos.offset, "misaligned write");
- op->error = -BCH_ERR_data_write_misaligned;
+ op->error = bch_err_throw(c, data_write_misaligned);
goto err;
}
if (c->opts.nochanges) {
- op->error = -BCH_ERR_erofs_no_writes;
+ op->error = bch_err_throw(c, erofs_no_writes);
goto err;
}
if (!(op->flags & BCH_WRITE_move) &&
- !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
- op->error = -BCH_ERR_erofs_no_writes;
+ !enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_write)) {
+ op->error = bch_err_throw(c, erofs_no_writes);
goto err;
}
@@ -1705,6 +1725,7 @@ err:
bch2_disk_reservation_put(c, &op->res);
closure_debug_destroy(&op->cl);
+ async_object_list_del(c, write_op, op->list_idx);
if (op->end_io)
op->end_io(op);
}
@@ -1738,13 +1759,13 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required);
prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl));
+ prt_printf(out, "ret\t%s\n", bch2_err_str(op->error));
printbuf_indent_sub(out, 2);
}
void bch2_fs_io_write_exit(struct bch_fs *c)
{
- mempool_exit(&c->bio_bounce_pages);
bioset_exit(&c->replica_set);
bioset_exit(&c->bio_write);
}
@@ -1753,14 +1774,7 @@ int bch2_fs_io_write_init(struct bch_fs *c)
{
if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) ||
bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0))
- return -BCH_ERR_ENOMEM_bio_write_init;
-
- if (mempool_init_page_pool(&c->bio_bounce_pages,
- max_t(unsigned,
- c->opts.btree_node_size,
- c->opts.encoded_extent_max) /
- PAGE_SIZE, 0))
- return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
+ return bch_err_throw(c, ENOMEM_bio_write_init);
return 0;
}
diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h
index b8ab19a1e1da..2c0a8f35ee1f 100644
--- a/fs/bcachefs/io_write.h
+++ b/fs/bcachefs/io_write.h
@@ -17,34 +17,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
__printf(3, 4)
void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...);
-#define BCH_WRITE_FLAGS() \
- x(alloc_nowait) \
- x(cached) \
- x(data_encoded) \
- x(pages_stable) \
- x(pages_owned) \
- x(only_specified_devs) \
- x(wrote_data_inline) \
- x(check_enospc) \
- x(sync) \
- x(move) \
- x(in_worker) \
- x(submitted) \
- x(io_error) \
- x(convert_unwritten)
-
-enum __bch_write_flags {
-#define x(f) __BCH_WRITE_##f,
- BCH_WRITE_FLAGS()
-#undef x
-};
-
-enum bch_write_flags {
-#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f),
- BCH_WRITE_FLAGS()
-#undef x
-};
-
static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
{
return op->watermark == BCH_WATERMARK_copygc
diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h
index 3ef6df9145ef..5da4eb8bb6f6 100644
--- a/fs/bcachefs/io_write_types.h
+++ b/fs/bcachefs/io_write_types.h
@@ -13,6 +13,34 @@
#include <linux/llist.h>
#include <linux/workqueue.h>
+#define BCH_WRITE_FLAGS() \
+ x(alloc_nowait) \
+ x(cached) \
+ x(data_encoded) \
+ x(pages_stable) \
+ x(pages_owned) \
+ x(only_specified_devs) \
+ x(wrote_data_inline) \
+ x(check_enospc) \
+ x(sync) \
+ x(move) \
+ x(in_worker) \
+ x(submitted) \
+ x(io_error) \
+ x(convert_unwritten)
+
+enum __bch_write_flags {
+#define x(f) __BCH_WRITE_##f,
+ BCH_WRITE_FLAGS()
+#undef x
+};
+
+enum bch_write_flags {
+#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f),
+ BCH_WRITE_FLAGS()
+#undef x
+};
+
struct bch_write_bio {
struct_group(wbio,
struct bch_fs *c;
@@ -43,6 +71,10 @@ struct bch_write_op {
void (*end_io)(struct bch_write_op *);
u64 start_time;
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
+ unsigned list_idx;
+#endif
+
unsigned written; /* sectors */
u16 flags;
s16 error; /* dio write path expects it to hold -ERESTARTSYS... */
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index bb45d3634194..dda802a656cf 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -12,6 +12,7 @@
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
@@ -173,7 +174,7 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
spin_unlock(&j->lock);
prt_printf(&buf, bch2_fmt(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)"),
bch2_err_str(error));
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_reset(&buf);
bch2_journal_pins_to_text(&buf, j);
@@ -331,16 +332,6 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
__bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq));
}
-void bch2_journal_halt(struct journal *j)
-{
- spin_lock(&j->lock);
- __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true);
- if (!j->err_seq)
- j->err_seq = journal_cur_seq(j);
- journal_wake(j);
- spin_unlock(&j->lock);
-}
-
void bch2_journal_halt_locked(struct journal *j)
{
lockdep_assert_held(&j->lock);
@@ -351,6 +342,13 @@ void bch2_journal_halt_locked(struct journal *j)
journal_wake(j);
}
+void bch2_journal_halt(struct journal *j)
+{
+ spin_lock(&j->lock);
+ bch2_journal_halt_locked(j);
+ spin_unlock(&j->lock);
+}
+
static bool journal_entry_want_write(struct journal *j)
{
bool ret = !journal_entry_is_open(j) ||
@@ -399,7 +397,7 @@ static int journal_entry_open(struct journal *j)
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
if (j->blocked)
- return -BCH_ERR_journal_blocked;
+ return bch_err_throw(c, journal_blocked);
if (j->cur_entry_error)
return j->cur_entry_error;
@@ -409,23 +407,23 @@ static int journal_entry_open(struct journal *j)
return ret;
if (!fifo_free(&j->pin))
- return -BCH_ERR_journal_pin_full;
+ return bch_err_throw(c, journal_pin_full);
if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
- return -BCH_ERR_journal_max_in_flight;
+ return bch_err_throw(c, journal_max_in_flight);
if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR)
- return -BCH_ERR_journal_max_open;
+ return bch_err_throw(c, journal_max_open);
- if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) {
+ if (unlikely(journal_cur_seq(j) >= JOURNAL_SEQ_MAX)) {
bch_err(c, "cannot start: journal seq overflow");
if (bch2_fs_emergency_read_only_locked(c))
bch_err(c, "fatal error - emergency read only");
- return -BCH_ERR_journal_shutdown;
+ return bch_err_throw(c, journal_shutdown);
}
if (!j->free_buf && !buf->data)
- return -BCH_ERR_journal_buf_enomem; /* will retry after write completion frees up a buf */
+ return bch_err_throw(c, journal_buf_enomem); /* will retry after write completion frees up a buf */
BUG_ON(!j->cur_entry_sectors);
@@ -449,7 +447,7 @@ static int journal_entry_open(struct journal *j)
u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
if (u64s <= (ssize_t) j->early_journal_entries.nr)
- return -BCH_ERR_journal_full;
+ return bch_err_throw(c, journal_full);
if (fifo_empty(&j->pin) && j->reclaim_thread)
wake_up_process(j->reclaim_thread);
@@ -461,6 +459,14 @@ static int journal_entry_open(struct journal *j)
atomic64_inc(&j->seq);
journal_pin_list_init(fifo_push_ref(&j->pin), 1);
+ if (unlikely(bch2_journal_seq_is_blacklisted(c, journal_cur_seq(j), false))) {
+ bch_err(c, "attempting to open blacklisted journal seq %llu",
+ journal_cur_seq(j));
+ if (bch2_fs_emergency_read_only_locked(c))
+ bch_err(c, "fatal error - emergency read only");
+ return bch_err_throw(c, journal_shutdown);
+ }
+
BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
@@ -591,16 +597,16 @@ retry:
return ret;
if (j->blocked)
- return -BCH_ERR_journal_blocked;
+ return bch_err_throw(c, journal_blocked);
if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
- ret = -BCH_ERR_journal_full;
+ ret = bch_err_throw(c, journal_full);
can_discard = j->can_discard;
goto out;
}
if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
- ret = -BCH_ERR_journal_max_in_flight;
+ ret = bch_err_throw(c, journal_max_in_flight);
goto out;
}
@@ -641,7 +647,7 @@ out:
goto retry;
if (journal_error_check_stuck(j, ret, flags))
- ret = -BCH_ERR_journal_stuck;
+ ret = bch_err_throw(c, journal_stuck);
if (ret == -BCH_ERR_journal_max_in_flight &&
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) &&
@@ -702,7 +708,8 @@ static unsigned max_dev_latency(struct bch_fs *c)
{
u64 nsecs = 0;
- for_each_rw_member(c, ca)
+ guard(rcu)();
+ for_each_rw_member_rcu(c, ca)
nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration);
return nsecs_to_jiffies(nsecs);
@@ -746,7 +753,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
struct printbuf buf = PRINTBUF;
bch2_journal_debug_to_text(&buf, j);
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
prt_printf(&buf, bch2_fmt(c, "Journal stuck? Waited for 10 seconds, err %s"), bch2_err_str(ret));
printbuf_exit(&buf);
@@ -805,6 +812,7 @@ out:
int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
struct closure *parent)
{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf;
int ret = 0;
@@ -820,7 +828,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
/* Recheck under lock: */
if (j->err_seq && seq >= j->err_seq) {
- ret = -BCH_ERR_journal_flush_err;
+ ret = bch_err_throw(c, journal_flush_err);
goto out;
}
@@ -990,11 +998,11 @@ int bch2_journal_meta(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal))
- return -BCH_ERR_erofs_no_writes;
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_journal))
+ return bch_err_throw(c, erofs_no_writes);
int ret = __bch2_journal_meta(j);
- bch2_write_ref_put(c, BCH_WRITE_REF_journal);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_journal);
return ret;
}
@@ -1124,7 +1132,7 @@ static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr,
new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL);
new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL);
if (!bu || !ob || !new_buckets || !new_bucket_seq) {
- ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets;
+ ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets);
goto err_free;
}
@@ -1296,13 +1304,83 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
return ret;
}
+int bch2_dev_journal_bucket_delete(struct bch_dev *ca, u64 b)
+{
+ struct bch_fs *c = ca->fs;
+ struct journal *j = &c->journal;
+ struct journal_device *ja = &ca->journal;
+
+ guard(mutex)(&c->sb_lock);
+ unsigned pos;
+ for (pos = 0; pos < ja->nr; pos++)
+ if (ja->buckets[pos] == b)
+ break;
+
+ if (pos == ja->nr) {
+ bch_err(ca, "journal bucket %llu not found when deleting", b);
+ return -EINVAL;
+ }
+
+ u64 *new_buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);;
+ if (!new_buckets)
+ return bch_err_throw(c, ENOMEM_set_nr_journal_buckets);
+
+ memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
+ memmove(&new_buckets[pos],
+ &new_buckets[pos + 1],
+ (ja->nr - 1 - pos) * sizeof(new_buckets[0]));
+
+ int ret = bch2_journal_buckets_to_sb(c, ca, ja->buckets, ja->nr - 1) ?:
+ bch2_write_super(c);
+ if (ret) {
+ kfree(new_buckets);
+ return ret;
+ }
+
+ scoped_guard(spinlock, &j->lock) {
+ if (pos < ja->discard_idx)
+ --ja->discard_idx;
+ if (pos < ja->dirty_idx_ondisk)
+ --ja->dirty_idx_ondisk;
+ if (pos < ja->dirty_idx)
+ --ja->dirty_idx;
+ if (pos < ja->cur_idx)
+ --ja->cur_idx;
+
+ ja->nr--;
+
+ memmove(&ja->buckets[pos],
+ &ja->buckets[pos + 1],
+ (ja->nr - pos) * sizeof(ja->buckets[0]));
+
+ memmove(&ja->bucket_seq[pos],
+ &ja->bucket_seq[pos + 1],
+ (ja->nr - pos) * sizeof(ja->bucket_seq[0]));
+
+ bch2_journal_space_available(j);
+ }
+
+ kfree(new_buckets);
+ return 0;
+}
+
int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
{
+ struct bch_fs *c = ca->fs;
+
+ if (!(ca->mi.data_allowed & BIT(BCH_DATA_journal)))
+ return 0;
+
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
+ bch_err(c, "cannot allocate journal, filesystem is an unresized image file");
+ return bch_err_throw(c, erofs_filesystem_full);
+ }
+
unsigned nr;
int ret;
if (dynamic_fault("bcachefs:add:journal_alloc")) {
- ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets;
+ ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets);
goto err;
}
@@ -1318,7 +1396,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
min(1 << 13,
(1 << 24) / ca->mi.bucket_size));
- ret = bch2_set_nr_journal_buckets_loop(ca->fs, ca, nr, new_fs);
+ ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, new_fs);
err:
bch_err_fn(ca, ret);
return ret;
@@ -1326,13 +1404,14 @@ err:
int bch2_fs_journal_alloc(struct bch_fs *c)
{
- for_each_online_member(c, ca) {
+ for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_journal_alloc) {
if (ca->journal.nr)
continue;
int ret = bch2_dev_journal_alloc(ca, true);
if (ret) {
- percpu_ref_put(&ca->io_ref[READ]);
+ enumerated_ref_put(&ca->io_ref[READ],
+ BCH_DEV_READ_REF_fs_journal_alloc);
return ret;
}
}
@@ -1404,6 +1483,13 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
bool had_entries = false;
u64 last_seq = cur_seq, nr, seq;
+ /*
+ *
+ * XXX pick most recent non blacklisted sequence number
+ */
+
+ cur_seq = max(cur_seq, bch2_journal_last_blacklisted_seq(c));
+
if (cur_seq >= JOURNAL_SEQ_MAX) {
bch_err(c, "cannot start: journal seq overflow");
return -EINVAL;
@@ -1429,13 +1515,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
*/
nr += nr / 4;
- if (nr + 1 > j->pin.size) {
- free_fifo(&j->pin);
- init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL);
- if (!j->pin.data) {
- bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
- return -BCH_ERR_ENOMEM_journal_pin_fifo;
- }
+ nr = max(nr, JOURNAL_PIN);
+ init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
+ if (!j->pin.data) {
+ bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
+ return bch_err_throw(c, ENOMEM_journal_pin_fifo);
}
j->replay_journal_seq = last_seq;
@@ -1523,6 +1607,7 @@ void bch2_dev_journal_exit(struct bch_dev *ca)
int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
{
+ struct bch_fs *c = ca->fs;
struct journal_device *ja = &ca->journal;
struct bch_sb_field_journal *journal_buckets =
bch2_sb_field_get(sb, journal);
@@ -1542,7 +1627,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
if (!ja->bucket_seq)
- return -BCH_ERR_ENOMEM_dev_journal_init;
+ return bch_err_throw(c, ENOMEM_dev_journal_init);
unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
@@ -1550,7 +1635,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
nr_bvecs), GFP_KERNEL);
if (!ja->bio[i])
- return -BCH_ERR_ENOMEM_dev_journal_init;
+ return bch_err_throw(c, ENOMEM_dev_journal_init);
ja->bio[i]->ca = ca;
ja->bio[i]->buf_idx = i;
@@ -1559,7 +1644,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
if (!ja->buckets)
- return -BCH_ERR_ENOMEM_dev_journal_init;
+ return bch_err_throw(c, ENOMEM_dev_journal_init);
if (journal_buckets_v2) {
unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
@@ -1590,7 +1675,7 @@ void bch2_fs_journal_exit(struct journal *j)
free_fifo(&j->pin);
}
-int bch2_fs_journal_init(struct journal *j)
+void bch2_fs_journal_init_early(struct journal *j)
{
static struct lock_class_key res_key;
@@ -1609,24 +1694,24 @@ int bch2_fs_journal_init(struct journal *j)
atomic64_set(&j->reservations.counter,
((union journal_res_state)
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
+}
- if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)))
- return -BCH_ERR_ENOMEM_journal_pin_fifo;
+int bch2_fs_journal_init(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN;
j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL);
if (!j->free_buf)
- return -BCH_ERR_ENOMEM_journal_buf;
+ return bch_err_throw(c, ENOMEM_journal_buf);
for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
j->buf[i].idx = i;
- j->pin.front = j->pin.back = 1;
-
j->wq = alloc_workqueue("bcachefs_journal",
WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512);
if (!j->wq)
- return -BCH_ERR_ENOMEM_fs_other_alloc;
+ return bch_err_throw(c, ENOMEM_fs_other_alloc);
return 0;
}
@@ -1650,7 +1735,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
printbuf_tabstop_push(out, 28);
out->atomic++;
- rcu_read_lock();
+ guard(rcu)();
s = READ_ONCE(j->reservations);
prt_printf(out, "flags:\t");
@@ -1741,8 +1826,6 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required);
- rcu_read_unlock();
-
--out->atomic;
}
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 641e20c05a14..83734fe4331f 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -426,8 +426,8 @@ int bch2_journal_flush(struct journal *);
bool bch2_journal_noflush_seq(struct journal *, u64, u64);
int bch2_journal_meta(struct journal *);
-void bch2_journal_halt(struct journal *);
void bch2_journal_halt_locked(struct journal *);
+void bch2_journal_halt(struct journal *);
static inline int bch2_journal_error(struct journal *j)
{
@@ -444,8 +444,9 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u
void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
-int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
- unsigned nr);
+int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned);
+int bch2_dev_journal_bucket_delete(struct bch_dev *, u64);
+
int bch2_dev_journal_alloc(struct bch_dev *, bool);
int bch2_fs_journal_alloc(struct bch_fs *);
@@ -458,6 +459,7 @@ void bch2_journal_set_replay_done(struct journal *);
void bch2_dev_journal_exit(struct bch_dev *);
int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
void bch2_fs_journal_exit(struct journal *);
+void bch2_fs_journal_init_early(struct journal *);
int bch2_fs_journal_init(struct journal *);
#endif /* _BCACHEFS_JOURNAL_H */
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index ded18a94ed02..0b15d71a8d2d 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -49,25 +49,27 @@ void bch2_journal_pos_from_member_info_resume(struct bch_fs *c)
mutex_unlock(&c->sb_lock);
}
-void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
- struct journal_replay *j)
+static void bch2_journal_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct journal_ptr *p)
+{
+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, p->dev);
+ prt_printf(out, "%s %u:%u:%u (sector %llu)",
+ ca ? ca->name : "(invalid dev)",
+ p->dev, p->bucket, p->bucket_offset, p->sector);
+ bch2_dev_put(ca);
+}
+
+void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j)
{
darray_for_each(j->ptrs, i) {
if (i != j->ptrs.data)
prt_printf(out, " ");
- prt_printf(out, "%u:%u:%u (sector %llu)",
- i->dev, i->bucket, i->bucket_offset, i->sector);
+ bch2_journal_ptr_to_text(out, c, i);
}
}
-static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
- struct journal_replay *j)
+static void bch2_journal_datetime_to_text(struct printbuf *out, struct jset *j)
{
- prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
-
- bch2_journal_ptrs_to_text(out, c, j);
-
- for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) {
+ for_each_jset_entry_type(entry, j, BCH_JSET_ENTRY_datetime) {
struct jset_entry_datetime *datetime =
container_of(entry, struct jset_entry_datetime, entry);
bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
@@ -75,6 +77,15 @@ static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
}
}
+static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
+ struct journal_replay *j)
+{
+ prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
+ bch2_journal_datetime_to_text(out, &j->j);
+ prt_char(out, ' ');
+ bch2_journal_ptrs_to_text(out, c, j);
+}
+
static struct nonce journal_nonce(const struct jset *jset)
{
return (struct nonce) {{
@@ -188,7 +199,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
GFP_KERNEL);
if (!_i)
- return -BCH_ERR_ENOMEM_journal_entry_add;
+ return bch_err_throw(c, ENOMEM_journal_entry_add);
/*
* Duplicate journal entries? If so we want the one that didn't have a
@@ -231,7 +242,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
replace:
i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
if (!i)
- return -BCH_ERR_ENOMEM_journal_entry_add;
+ return bch_err_throw(c, ENOMEM_journal_entry_add);
darray_init(&i->ptrs);
i->csum_good = entry_ptr.csum_good;
@@ -311,7 +322,7 @@ static void journal_entry_err_msg(struct printbuf *out,
bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \
if (bch2_fs_inconsistent(c, \
"corrupt metadata before write: %s\n", _buf.buf)) {\
- ret = -BCH_ERR_fsck_errors_not_fixed; \
+ ret = bch_err_throw(c, fsck_errors_not_fixed); \
goto fsck_err; \
} \
break; \
@@ -418,6 +429,10 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs
bool first = true;
jset_entry_for_each_key(entry, k) {
+ /* We may be called on entries that haven't been validated: */
+ if (!k->k.u64s)
+ break;
+
if (!first) {
prt_newline(out);
bch2_prt_jset_entry_type(out, entry->type);
@@ -1005,19 +1020,19 @@ struct journal_read_buf {
size_t size;
};
-static int journal_read_buf_realloc(struct journal_read_buf *b,
+static int journal_read_buf_realloc(struct bch_fs *c, struct journal_read_buf *b,
size_t new_size)
{
void *n;
/* the bios are sized for this many pages, max: */
if (new_size > JOURNAL_ENTRY_SIZE_MAX)
- return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
+ return bch_err_throw(c, ENOMEM_journal_read_buf_realloc);
new_size = roundup_pow_of_two(new_size);
n = kvmalloc(new_size, GFP_KERNEL);
if (!n)
- return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
+ return bch_err_throw(c, ENOMEM_journal_read_buf_realloc);
kvfree(b->data);
b->data = n;
@@ -1037,7 +1052,6 @@ static int journal_read_bucket(struct bch_dev *ca,
u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
end = offset + ca->mi.bucket_size;
bool saw_bad = false, csum_good;
- struct printbuf err = PRINTBUF;
int ret = 0;
pr_debug("reading %u", bucket);
@@ -1053,7 +1067,7 @@ reread:
bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
if (!bio)
- return -BCH_ERR_ENOMEM_journal_read_bucket;
+ return bch_err_throw(c, ENOMEM_journal_read_bucket);
bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);
bio->bi_iter.bi_sector = offset;
@@ -1064,7 +1078,7 @@ reread:
kfree(bio);
if (!ret && bch2_meta_read_fault("journal"))
- ret = -BCH_ERR_EIO_fault_injected;
+ ret = bch_err_throw(c, EIO_fault_injected);
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
submit_time, !ret);
@@ -1078,7 +1092,7 @@ reread:
* found on a different device, and missing or
* no journal entries will be handled later
*/
- goto out;
+ return 0;
}
j = buf->data;
@@ -1092,15 +1106,15 @@ reread:
break;
case JOURNAL_ENTRY_REREAD:
if (vstruct_bytes(j) > buf->size) {
- ret = journal_read_buf_realloc(buf,
+ ret = journal_read_buf_realloc(c, buf,
vstruct_bytes(j));
if (ret)
- goto err;
+ return ret;
}
goto reread;
case JOURNAL_ENTRY_NONE:
if (!saw_bad)
- goto out;
+ return 0;
/*
* On checksum error we don't really trust the size
* field of the journal entry we read, so try reading
@@ -1109,7 +1123,7 @@ reread:
sectors = block_sectors(c);
goto next_block;
default:
- goto err;
+ return ret;
}
if (le64_to_cpu(j->seq) > ja->highest_seq_found) {
@@ -1126,22 +1140,20 @@ reread:
* bucket:
*/
if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
- goto out;
+ return 0;
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
- enum bch_csum_type csum_type = JSET_CSUM_TYPE(j);
struct bch_csum csum;
csum_good = jset_csum_good(c, j, &csum);
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
if (!csum_good) {
- bch_err_dev_ratelimited(ca, "%s",
- (printbuf_reset(&err),
- prt_str(&err, "journal "),
- bch2_csum_err_msg(&err, csum_type, j->csum, csum),
- err.buf));
+ /*
+ * Don't print an error here, we'll print the error
+ * later if we need this journal entry
+ */
saw_bad = true;
}
@@ -1153,6 +1165,7 @@ reread:
mutex_lock(&jlist->lock);
ret = journal_entry_add(c, ca, (struct journal_ptr) {
.csum_good = csum_good,
+ .csum = csum,
.dev = ca->dev_idx,
.bucket = bucket,
.bucket_offset = offset -
@@ -1167,7 +1180,7 @@ reread:
case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
break;
default:
- goto err;
+ return ret;
}
next_block:
pr_debug("next");
@@ -1176,11 +1189,7 @@ next_block:
j = ((void *) j) + (sectors << 9);
}
-out:
- ret = 0;
-err:
- printbuf_exit(&err);
- return ret;
+ return 0;
}
static CLOSURE_CALLBACK(bch2_journal_read_device)
@@ -1197,7 +1206,7 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
if (!ja->nr)
goto out;
- ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
+ ret = journal_read_buf_realloc(c, &buf, PAGE_SIZE);
if (ret)
goto err;
@@ -1219,7 +1228,7 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
out:
bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
kvfree(buf.data);
- percpu_ref_put(&ca->io_ref[READ]);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_journal_read);
closure_return(cl);
return;
err:
@@ -1229,13 +1238,105 @@ err:
goto out;
}
+noinline_for_stack
+static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j)
+{
+ struct printbuf buf = PRINTBUF;
+ enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j);
+ bool have_good = false;
+
+ prt_printf(&buf, "invalid journal checksum(s) at seq %llu ", le64_to_cpu(j->j.seq));
+ bch2_journal_datetime_to_text(&buf, &j->j);
+ prt_newline(&buf);
+
+ darray_for_each(j->ptrs, ptr)
+ if (!ptr->csum_good) {
+ bch2_journal_ptr_to_text(&buf, c, ptr);
+ prt_char(&buf, ' ');
+ bch2_csum_to_text(&buf, csum_type, ptr->csum);
+ prt_newline(&buf);
+ } else {
+ have_good = true;
+ }
+
+ prt_printf(&buf, "should be ");
+ bch2_csum_to_text(&buf, csum_type, j->j.csum);
+
+ if (have_good)
+ prt_printf(&buf, "\n(had good copy on another device)");
+
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+}
+
+noinline_for_stack
+static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq)
+{
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ struct genradix_iter radix_iter;
+ struct journal_replay *i, **_i, *prev = NULL;
+ u64 seq = start_seq;
+
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
+ i = *_i;
+
+ if (journal_replay_ignore(i))
+ continue;
+
+ BUG_ON(seq > le64_to_cpu(i->j.seq));
+
+ while (seq < le64_to_cpu(i->j.seq)) {
+ while (seq < le64_to_cpu(i->j.seq) &&
+ bch2_journal_seq_is_blacklisted(c, seq, false))
+ seq++;
+
+ if (seq == le64_to_cpu(i->j.seq))
+ break;
+
+ u64 missing_start = seq;
+
+ while (seq < le64_to_cpu(i->j.seq) &&
+ !bch2_journal_seq_is_blacklisted(c, seq, false))
+ seq++;
+
+ u64 missing_end = seq - 1;
+
+ printbuf_reset(&buf);
+ prt_printf(&buf, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
+ missing_start, missing_end,
+ start_seq, end_seq);
+
+ prt_printf(&buf, "\nprev at ");
+ if (prev) {
+ bch2_journal_ptrs_to_text(&buf, c, prev);
+ prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
+ } else
+ prt_printf(&buf, "(none)");
+
+ prt_printf(&buf, "\nnext at ");
+ bch2_journal_ptrs_to_text(&buf, c, i);
+ prt_printf(&buf, ", continue?");
+
+ fsck_err(c, journal_entries_missing, "%s", buf.buf);
+ }
+
+ prev = i;
+ seq++;
+ }
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
int bch2_journal_read(struct bch_fs *c,
u64 *last_seq,
u64 *blacklist_seq,
u64 *start_seq)
{
struct journal_list jlist;
- struct journal_replay *i, **_i, *prev = NULL;
+ struct journal_replay *i, **_i;
struct genradix_iter radix_iter;
struct printbuf buf = PRINTBUF;
bool degraded = false, last_write_torn = false;
@@ -1254,7 +1355,8 @@ int bch2_journal_read(struct bch_fs *c,
if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
ca->mi.state == BCH_MEMBER_STATE_ro) &&
- percpu_ref_tryget(&ca->io_ref[READ]))
+ enumerated_ref_tryget(&ca->io_ref[READ],
+ BCH_DEV_READ_REF_journal_read))
closure_call(&ca->journal.read,
bch2_journal_read_device,
system_unbound_wq,
@@ -1325,12 +1427,12 @@ int bch2_journal_read(struct bch_fs *c,
return 0;
}
- bch_info(c, "journal read done, replaying entries %llu-%llu",
- *last_seq, *blacklist_seq - 1);
-
+ printbuf_reset(&buf);
+ prt_printf(&buf, "journal read done, replaying entries %llu-%llu",
+ *last_seq, *blacklist_seq - 1);
if (*start_seq != *blacklist_seq)
- bch_info(c, "dropped unflushed entries %llu-%llu",
- *blacklist_seq, *start_seq - 1);
+ prt_printf(&buf, " (unflushed %llu-%llu)", *blacklist_seq, *start_seq - 1);
+ bch_info(c, "%s", buf.buf);
/* Drop blacklisted entries and entries older than last_seq: */
genradix_for_each(&c->journal_entries, radix_iter, _i) {
@@ -1353,59 +1455,12 @@ int bch2_journal_read(struct bch_fs *c,
}
}
- /* Check for missing entries: */
- seq = *last_seq;
- genradix_for_each(&c->journal_entries, radix_iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- BUG_ON(seq > le64_to_cpu(i->j.seq));
-
- while (seq < le64_to_cpu(i->j.seq)) {
- u64 missing_start, missing_end;
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
-
- while (seq < le64_to_cpu(i->j.seq) &&
- bch2_journal_seq_is_blacklisted(c, seq, false))
- seq++;
-
- if (seq == le64_to_cpu(i->j.seq))
- break;
-
- missing_start = seq;
-
- while (seq < le64_to_cpu(i->j.seq) &&
- !bch2_journal_seq_is_blacklisted(c, seq, false))
- seq++;
-
- if (prev) {
- bch2_journal_ptrs_to_text(&buf1, c, prev);
- prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
- } else
- prt_printf(&buf1, "(none)");
- bch2_journal_ptrs_to_text(&buf2, c, i);
-
- missing_end = seq - 1;
- fsck_err(c, journal_entries_missing,
- "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
- "prev at %s\n"
- "next at %s, continue?",
- missing_start, missing_end,
- *last_seq, *blacklist_seq - 1,
- buf1.buf, buf2.buf);
-
- printbuf_exit(&buf1);
- printbuf_exit(&buf2);
- }
-
- prev = i;
- seq++;
- }
+ ret = bch2_journal_check_for_missing(c, *last_seq, *blacklist_seq - 1);
+ if (ret)
+ goto err;
genradix_for_each(&c->journal_entries, radix_iter, _i) {
- struct bch_replicas_padded replicas = {
+ union bch_replicas_padded replicas = {
.e.data_type = BCH_DATA_journal,
.e.nr_devs = 0,
.e.nr_required = 1,
@@ -1415,15 +1470,15 @@ int bch2_journal_read(struct bch_fs *c,
if (journal_replay_ignore(i))
continue;
- darray_for_each(i->ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
-
- if (!ptr->csum_good)
- bch_err_dev_offset(ca, ptr->sector,
- "invalid journal checksum, seq %llu%s",
- le64_to_cpu(i->j.seq),
- i->csum_good ? " (had good copy on another device)" : "");
- }
+ /*
+ * Don't print checksum errors until we know we're going to use
+ * a given journal entry:
+ */
+ darray_for_each(i->ptrs, ptr)
+ if (!ptr->csum_good) {
+ bch2_journal_print_checksum_error(c, i);
+ break;
+ }
ret = jset_validate(c,
bch2_dev_have_ref(c, i->ptrs.data[0].dev),
@@ -1466,6 +1521,7 @@ static void journal_advance_devs_to_next_bucket(struct journal *j,
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ guard(rcu)();
darray_for_each(*devs, i) {
struct bch_dev *ca = rcu_dereference(c->devs[*i]);
if (!ca)
@@ -1499,7 +1555,8 @@ static void __journal_write_alloc(struct journal *j,
struct bch_fs *c = container_of(j, struct bch_fs, journal);
darray_for_each(*devs, i) {
- struct bch_dev *ca = rcu_dereference(c->devs[*i]);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, *i, WRITE,
+ BCH_DEV_WRITE_REF_journal_write);
if (!ca)
continue;
@@ -1513,8 +1570,10 @@ static void __journal_write_alloc(struct journal *j,
ca->mi.state != BCH_MEMBER_STATE_rw ||
!ja->nr ||
bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
- sectors > ja->sectors_free)
+ sectors > ja->sectors_free) {
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
continue;
+ }
bch2_dev_stripe_increment(ca, &j->wp.stripe);
@@ -1537,15 +1596,8 @@ static void __journal_write_alloc(struct journal *j,
}
}
-/**
- * journal_write_alloc - decide where to write next journal entry
- *
- * @j: journal object
- * @w: journal buf (entry to be written)
- *
- * Returns: 0 on success, or -BCH_ERR_insufficient_devices on failure
- */
-static int journal_write_alloc(struct journal *j, struct journal_buf *w)
+static int journal_write_alloc(struct journal *j, struct journal_buf *w,
+ unsigned *replicas)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_devs_mask devs;
@@ -1553,29 +1605,18 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w)
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
unsigned target = c->opts.metadata_target ?:
c->opts.foreground_target;
- unsigned replicas = 0, replicas_want =
- READ_ONCE(c->opts.metadata_replicas);
+ unsigned replicas_want = READ_ONCE(c->opts.metadata_replicas);
unsigned replicas_need = min_t(unsigned, replicas_want,
READ_ONCE(c->opts.metadata_replicas_required));
bool advance_done = false;
- rcu_read_lock();
-
- /* We might run more than once if we have to stop and do discards: */
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key));
- bkey_for_each_ptr(ptrs, p) {
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev);
- if (ca)
- replicas += ca->mi.durability;
- }
-
retry_target:
devs = target_rw_devs(c, BCH_DATA_journal, target);
- devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
+ bch2_dev_alloc_list(c, &j->wp.stripe, &devs, &devs_sorted);
retry_alloc:
- __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want);
+ __journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want);
- if (likely(replicas >= replicas_want))
+ if (likely(*replicas >= replicas_want))
goto done;
if (!advance_done) {
@@ -1584,18 +1625,26 @@ retry_alloc:
goto retry_alloc;
}
- if (replicas < replicas_want && target) {
+ if (*replicas < replicas_want && target) {
/* Retry from all devices: */
target = 0;
advance_done = false;
goto retry_target;
}
done:
- rcu_read_unlock();
-
BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
- return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices;
+#if 0
+ /*
+ * XXX: we need a way to alert the user when we go degraded for any
+ * reason
+ */
+ if (*replicas < min(replicas_want,
+ dev_mask_nr(&c->rw_devs[BCH_DATA_free]))) {
+ }
+#endif
+
+ return *replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices;
}
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
@@ -1633,7 +1682,7 @@ static CLOSURE_CALLBACK(journal_write_done)
closure_type(w, struct journal_buf, io);
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_replicas_padded replicas;
+ union bch_replicas_padded replicas;
u64 seq = le64_to_cpu(w->data->seq);
int err = 0;
@@ -1642,17 +1691,27 @@ static CLOSURE_CALLBACK(journal_write_done)
: j->noflush_write_time, j->write_start_time);
if (!w->devs_written.nr) {
- if (!bch2_journal_error(j))
- bch_err(c, "unable to write journal to sufficient devices");
- err = -BCH_ERR_journal_write_err;
+ err = bch_err_throw(c, journal_write_err);
} else {
bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
w->devs_written);
err = bch2_mark_replicas(c, &replicas.e);
}
- if (err)
- bch2_fatal_error(c);
+ if (err && !bch2_journal_error(j)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+
+ if (err == -BCH_ERR_journal_write_err)
+ prt_printf(&buf, "unable to write journal to sufficient devices");
+ else
+ prt_printf(&buf, "journal write error marking replicas: %s", bch2_err_str(err));
+
+ bch2_fs_emergency_read_only2(c, &buf);
+
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ }
closure_debug_destroy(cl);
@@ -1770,7 +1829,7 @@ static void journal_write_endio(struct bio *bio)
}
closure_put(&w->io);
- percpu_ref_put(&ca->io_ref[WRITE]);
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
}
static CLOSURE_CALLBACK(journal_write_submit)
@@ -1781,12 +1840,7 @@ static CLOSURE_CALLBACK(journal_write_submit)
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
- if (!ca) {
- /* XXX: fix this */
- bch_err(c, "missing device %u for journal write", ptr->dev);
- continue;
- }
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
sectors);
@@ -1844,8 +1898,9 @@ static CLOSURE_CALLBACK(journal_write_preflush)
}
if (w->separate_flush) {
- for_each_rw_member(c, ca) {
- percpu_ref_get(&ca->io_ref[WRITE]);
+ for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_write) {
+ enumerated_ref_get(&ca->io_ref[WRITE],
+ BCH_DEV_WRITE_REF_journal_write);
struct journal_device *ja = &ca->journal;
struct bio *bio = &ja->bio[w->idx]->bio;
@@ -1872,9 +1927,8 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
struct jset_entry *start, *end;
struct jset *jset = w->data;
struct journal_keys_to_wb wb = { NULL };
- unsigned sectors, bytes, u64s;
+ unsigned u64s;
unsigned long btree_roots_have = 0;
- bool validate_before_checksum = false;
u64 seq = le64_to_cpu(jset->seq);
int ret;
@@ -1957,8 +2011,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
le32_add_cpu(&jset->u64s, u64s);
- sectors = vstruct_sectors(jset, c->block_bits);
- bytes = vstruct_bytes(jset);
+ unsigned sectors = vstruct_sectors(jset, c->block_bits);
if (sectors > w->sectors) {
bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
@@ -1967,6 +2020,17 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
return -EINVAL;
}
+ return 0;
+}
+
+static int bch2_journal_write_checksum(struct journal *j, struct journal_buf *w)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct jset *jset = w->data;
+ u64 seq = le64_to_cpu(jset->seq);
+ bool validate_before_checksum = false;
+ int ret = 0;
+
jset->magic = cpu_to_le64(jset_magic(c));
jset->version = cpu_to_le32(c->sb.version);
@@ -1989,7 +2053,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
jset->encrypted_start,
vstruct_end(jset) - (void *) jset->encrypted_start);
- if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)))
+ if (bch2_fs_fatal_err_on(ret, c, "encrypting journal entry: %s", bch2_err_str(ret)))
return ret;
jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
@@ -1999,6 +2063,8 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
(ret = jset_validate(c, NULL, jset, 0, WRITE)))
return ret;
+ unsigned sectors = vstruct_sectors(jset, c->block_bits);
+ unsigned bytes = vstruct_bytes(jset);
memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
return 0;
}
@@ -2054,13 +2120,10 @@ CLOSURE_CALLBACK(bch2_journal_write)
closure_type(w, struct journal_buf, io);
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_replicas_padded replicas;
- unsigned nr_rw_members = 0;
+ union bch_replicas_padded replicas;
+ unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]);
int ret;
- for_each_rw_member(c, ca)
- nr_rw_members++;
-
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
BUG_ON(!w->write_started);
BUG_ON(w->write_allocated);
@@ -2074,7 +2137,8 @@ CLOSURE_CALLBACK(bch2_journal_write)
ret = bch2_journal_write_pick_flush(j, w);
spin_unlock(&j->lock);
- if (ret)
+
+ if (unlikely(ret))
goto err;
mutex_lock(&j->buf_lock);
@@ -2082,43 +2146,34 @@ CLOSURE_CALLBACK(bch2_journal_write)
ret = bch2_journal_write_prep(j, w);
mutex_unlock(&j->buf_lock);
- if (ret)
- goto err;
- j->entry_bytes_written += vstruct_bytes(w->data);
+ if (unlikely(ret))
+ goto err;
+ unsigned replicas_allocated = 0;
while (1) {
- spin_lock(&j->lock);
- ret = journal_write_alloc(j, w);
+ ret = journal_write_alloc(j, w, &replicas_allocated);
if (!ret || !j->can_discard)
break;
- spin_unlock(&j->lock);
bch2_journal_do_discards(j);
}
- if (ret && !bch2_journal_error(j)) {
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
+ if (unlikely(ret))
+ goto err_allocate_write;
- __bch2_journal_debug_to_text(&buf, j);
- spin_unlock(&j->lock);
- prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"),
- le64_to_cpu(w->data->seq),
- vstruct_sectors(w->data, c->block_bits),
- bch2_err_str(ret));
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- }
- if (ret)
+ ret = bch2_journal_write_checksum(j, w);
+ if (unlikely(ret))
goto err;
+ spin_lock(&j->lock);
/*
* write is allocated, no longer need to account for it in
* bch2_journal_space_available():
*/
w->sectors = 0;
w->write_allocated = true;
+ j->entry_bytes_written += vstruct_bytes(w->data);
/*
* journal entry has been compacted and allocated, recalculate space
@@ -2130,9 +2185,6 @@ CLOSURE_CALLBACK(bch2_journal_write)
w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
- if (c->opts.nochanges)
- goto no_io;
-
/*
* Mark journal replicas before we submit the write to guarantee
* recovery will find the journal entries after a crash.
@@ -2143,15 +2195,33 @@ CLOSURE_CALLBACK(bch2_journal_write)
if (ret)
goto err;
+ if (c->opts.nochanges)
+ goto no_io;
+
if (!JSET_NO_FLUSH(w->data))
continue_at(cl, journal_write_preflush, j->wq);
else
continue_at(cl, journal_write_submit, j->wq);
return;
-no_io:
- continue_at(cl, journal_write_done, j->wq);
- return;
+err_allocate_write:
+ if (!bch2_journal_error(j)) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_journal_debug_to_text(&buf, j);
+ prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"),
+ le64_to_cpu(w->data->seq),
+ vstruct_sectors(w->data, c->block_bits),
+ bch2_err_str(ret));
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ }
err:
bch2_fatal_error(c);
+no_io:
+ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
+ }
+
continue_at(cl, journal_write_done, j->wq);
}
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index 12b39fcb4424..6fa82c4050fe 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -9,6 +9,7 @@ void bch2_journal_pos_from_member_info_resume(struct bch_fs *);
struct journal_ptr {
bool csum_good;
+ struct bch_csum csum;
u8 dev;
u32 bucket;
u32 bucket_offset;
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index cc00b0fc40d8..cd6201741c59 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -83,18 +83,20 @@ static struct journal_space
journal_dev_space_available(struct journal *j, struct bch_dev *ca,
enum journal_space_from from)
{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_device *ja = &ca->journal;
unsigned sectors, buckets, unwritten;
+ unsigned bucket_size_aligned = round_down(ca->mi.bucket_size, block_sectors(c));
u64 seq;
if (from == journal_space_total)
return (struct journal_space) {
- .next_entry = ca->mi.bucket_size,
- .total = ca->mi.bucket_size * ja->nr,
+ .next_entry = bucket_size_aligned,
+ .total = bucket_size_aligned * ja->nr,
};
buckets = bch2_journal_dev_buckets_available(j, ja, from);
- sectors = ja->sectors_free;
+ sectors = round_down(ja->sectors_free, block_sectors(c));
/*
* We that we don't allocate the space for a journal entry
@@ -109,7 +111,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca,
continue;
/* entry won't fit on this device, skip: */
- if (unwritten > ca->mi.bucket_size)
+ if (unwritten > bucket_size_aligned)
continue;
if (unwritten >= sectors) {
@@ -119,7 +121,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca,
}
buckets--;
- sectors = ca->mi.bucket_size;
+ sectors = bucket_size_aligned;
}
sectors -= unwritten;
@@ -127,12 +129,12 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca,
if (sectors < ca->mi.bucket_size && buckets) {
buckets--;
- sectors = ca->mi.bucket_size;
+ sectors = bucket_size_aligned;
}
return (struct journal_space) {
.next_entry = sectors,
- .total = sectors + buckets * ca->mi.bucket_size,
+ .total = sectors + buckets * bucket_size_aligned,
};
}
@@ -146,7 +148,6 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
- rcu_read_lock();
for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
if (!ca->journal.nr ||
!ca->mi.durability)
@@ -164,7 +165,6 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
array_insert_item(dev_space, nr_devs, pos, space);
}
- rcu_read_unlock();
if (nr_devs < nr_devs_want)
return (struct journal_space) { 0, 0 };
@@ -189,8 +189,8 @@ void bch2_journal_space_available(struct journal *j)
int ret = 0;
lockdep_assert_held(&j->lock);
+ guard(rcu)();
- rcu_read_lock();
for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
struct journal_device *ja = &ca->journal;
@@ -210,24 +210,23 @@ void bch2_journal_space_available(struct journal *j)
max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
nr_online++;
}
- rcu_read_unlock();
j->can_discard = can_discard;
if (nr_online < metadata_replicas_required(c)) {
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
- prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n"
- "rw journal devs:", nr_online, metadata_replicas_required(c));
-
- rcu_read_lock();
- for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
- prt_printf(&buf, " %s", ca->name);
- rcu_read_unlock();
-
- bch_err(c, "%s", buf.buf);
- printbuf_exit(&buf);
- ret = -BCH_ERR_insufficient_journal_devices;
+ if (!(c->sb.features & BIT_ULL(BCH_FEATURE_small_image))) {
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+ prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n"
+ "rw journal devs:", nr_online, metadata_replicas_required(c));
+
+ for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
+ prt_printf(&buf, " %s", ca->name);
+
+ bch_err(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+ ret = bch_err_throw(c, insufficient_journal_devices);
goto out;
}
@@ -241,7 +240,7 @@ void bch2_journal_space_available(struct journal *j)
total = j->space[journal_space_total].total;
if (!j->space[journal_space_discarded].next_entry)
- ret = -BCH_ERR_journal_full;
+ ret = bch_err_throw(c, journal_full);
if ((j->space[journal_space_clean_ondisk].next_entry <
j->space[journal_space_clean_ondisk].total) &&
@@ -254,8 +253,7 @@ void bch2_journal_space_available(struct journal *j)
bch2_journal_set_watermark(j);
out:
j->cur_entry_sectors = !ret
- ? round_down(j->space[journal_space_discarded].next_entry,
- block_sectors(c))
+ ? j->space[journal_space_discarded].next_entry
: 0;
j->cur_entry_error = ret;
@@ -293,12 +291,12 @@ void bch2_journal_do_discards(struct journal *j)
mutex_lock(&j->discard_lock);
- for_each_rw_member(c, ca) {
+ for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_do_discards) {
struct journal_device *ja = &ca->journal;
while (should_discard_bucket(j, ja)) {
if (!c->opts.nochanges &&
- ca->mi.discard &&
+ bch2_discard_opt_enabled(c, ca) &&
bdev_max_discard_sectors(ca->disk_sb.bdev))
blkdev_issue_discard(ca->disk_sb.bdev,
bucket_to_sector(ca,
@@ -623,9 +621,10 @@ static u64 journal_seq_to_flush(struct journal *j)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
u64 seq_to_flush = 0;
- spin_lock(&j->lock);
+ guard(spinlock)(&j->lock);
+ guard(rcu)();
- for_each_rw_member(c, ca) {
+ for_each_rw_member_rcu(c, ca) {
struct journal_device *ja = &ca->journal;
unsigned nr_buckets, bucket_to_flush;
@@ -635,20 +634,15 @@ static u64 journal_seq_to_flush(struct journal *j)
/* Try to keep the journal at most half full: */
nr_buckets = ja->nr / 2;
- nr_buckets = min(nr_buckets, ja->nr);
-
bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
seq_to_flush = max(seq_to_flush,
ja->bucket_seq[bucket_to_flush]);
}
/* Also flush if the pin fifo is more than half full */
- seq_to_flush = max_t(s64, seq_to_flush,
- (s64) journal_cur_seq(j) -
- (j->pin.size >> 1));
- spin_unlock(&j->lock);
-
- return seq_to_flush;
+ return max_t(s64, seq_to_flush,
+ (s64) journal_cur_seq(j) -
+ (j->pin.size >> 1));
}
/**
@@ -699,6 +693,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
if (ret)
break;
+ /* XXX shove journal discards off to another thread */
bch2_journal_do_discards(j);
seq_to_flush = journal_seq_to_flush(j);
@@ -961,7 +956,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
seq = 0;
spin_lock(&j->lock);
while (!ret) {
- struct bch_replicas_padded replicas;
+ union bch_replicas_padded replicas;
seq = max(seq, journal_last_seq(j));
if (seq >= j->pin.back)
diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c
index 62b910f2fb27..0cb9b93f13e7 100644
--- a/fs/bcachefs/journal_sb.c
+++ b/fs/bcachefs/journal_sb.c
@@ -210,7 +210,7 @@ int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca,
j = bch2_sb_field_resize(&ca->disk_sb, journal_v2,
(sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64));
if (!j)
- return -BCH_ERR_ENOSPC_sb_journal;
+ return bch_err_throw(c, ENOSPC_sb_journal);
bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index e463d2d95359..af4fe416d9ec 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -78,7 +78,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
sb_blacklist_u64s(nr + 1));
if (!bl) {
- ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist;
+ ret = bch_err_throw(c, ENOSPC_sb_journal_seq_blacklist);
goto out;
}
@@ -130,6 +130,16 @@ bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq,
return true;
}
+u64 bch2_journal_last_blacklisted_seq(struct bch_fs *c)
+{
+ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
+
+ if (!t || !t->nr)
+ return 0;
+
+ return t->entries[eytzinger0_last(t->nr)].end - 1;
+}
+
int bch2_blacklist_table_initialize(struct bch_fs *c)
{
struct bch_sb_field_journal_seq_blacklist *bl =
@@ -142,7 +152,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL);
if (!t)
- return -BCH_ERR_ENOMEM_blacklist_table_init;
+ return bch_err_throw(c, ENOMEM_blacklist_table_init);
t->nr = nr;
diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h
index d47636f96fdc..f06942ccfcdd 100644
--- a/fs/bcachefs/journal_seq_blacklist.h
+++ b/fs/bcachefs/journal_seq_blacklist.h
@@ -12,6 +12,7 @@ blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl)
}
bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool);
+u64 bch2_journal_last_blacklisted_seq(struct bch_fs *);
int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64);
int bch2_blacklist_table_initialize(struct bch_fs *);
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 8e0eba776b9d..51104bbb99da 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -151,8 +151,6 @@ enum journal_flags {
#undef x
};
-typedef DARRAY(u64) darray_u64;
-
struct journal_bio {
struct bch_dev *ca;
unsigned buf_idx;
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index 2f63fc6d456f..57b5b3263b08 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -145,13 +145,11 @@ static u64 bkey_lru_type_idx(struct bch_fs *c,
case BCH_LRU_fragmentation: {
a = bch2_alloc_to_v4(k, &a_convert);
- rcu_read_lock();
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode);
- u64 idx = ca
+ return ca
? alloc_lru_idx_fragmentation(*a, ca)
: 0;
- rcu_read_unlock();
- return idx;
}
case BCH_LRU_stripes:
return k.k->type == KEY_TYPE_stripe
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index 90dcf80bd64a..f296cce95338 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -4,10 +4,13 @@
*/
#include "bcachefs.h"
+#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_update.h"
#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
+#include "ec.h"
#include "errcode.h"
#include "extents.h"
#include "io_write.h"
@@ -20,7 +23,7 @@
#include "super-io.h"
static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
- unsigned dev_idx, int flags, bool metadata)
+ unsigned dev_idx, unsigned flags, bool metadata)
{
unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
@@ -32,16 +35,33 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
nr_good = bch2_bkey_durability(c, k.s_c);
if ((!nr_good && !(flags & lost)) ||
(nr_good < replicas && !(flags & degraded)))
- return -BCH_ERR_remove_would_lose_data;
+ return bch_err_throw(c, remove_would_lose_data);
return 0;
}
+static int drop_btree_ptrs(struct btree_trans *trans, struct btree_iter *iter,
+ struct btree *b, unsigned dev_idx, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_buf k;
+
+ bch2_bkey_buf_init(&k);
+ bch2_bkey_buf_copy(&k, c, &b->key);
+
+ int ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true) ?:
+ bch2_btree_node_update_key(trans, iter, b, k.k, 0, false);
+
+ bch_err_fn(c, ret);
+ bch2_bkey_buf_exit(&k, c);
+ return ret;
+}
+
static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
unsigned dev_idx,
- int flags)
+ unsigned flags)
{
struct bch_fs *c = trans->c;
struct bkey_i *n;
@@ -77,9 +97,27 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
return 0;
}
+static int bch2_dev_btree_drop_key(struct btree_trans *trans,
+ struct bkey_s_c_backpointer bp,
+ unsigned dev_idx,
+ struct bkey_buf *last_flushed,
+ unsigned flags)
+{
+ struct btree_iter iter;
+ struct btree *b = bch2_backpointer_get_node(trans, bp, &iter, last_flushed);
+ int ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ return ret == -BCH_ERR_backpointer_to_overwritten_btree_node ? 0 : ret;
+
+ ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags);
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
static int bch2_dev_usrdata_drop(struct bch_fs *c,
struct progress_indicator_state *progress,
- unsigned dev_idx, int flags)
+ unsigned dev_idx, unsigned flags)
{
struct btree_trans *trans = bch2_trans_get(c);
enum btree_id id;
@@ -106,7 +144,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c,
static int bch2_dev_metadata_drop(struct bch_fs *c,
struct progress_indicator_state *progress,
- unsigned dev_idx, int flags)
+ unsigned dev_idx, unsigned flags)
{
struct btree_trans *trans;
struct btree_iter iter;
@@ -118,7 +156,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c,
/* don't handle this yet: */
if (flags & BCH_FORCE_IF_METADATA_LOST)
- return -BCH_ERR_remove_with_metadata_missing_unimplemented;
+ return bch_err_throw(c, remove_with_metadata_missing_unimplemented);
trans = bch2_trans_get(c);
bch2_bkey_buf_init(&k);
@@ -137,20 +175,12 @@ retry:
if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
goto next;
- bch2_bkey_buf_copy(&k, c, &b->key);
-
- ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
- dev_idx, flags, true);
- if (ret)
- break;
-
- ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false);
+ ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
ret = 0;
continue;
}
- bch_err_msg(c, ret, "updating btree node key");
if (ret)
break;
next:
@@ -176,7 +206,66 @@ err:
return ret;
}
-int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+static int data_drop_bp(struct btree_trans *trans, unsigned dev_idx,
+ struct bkey_s_c_backpointer bp, struct bkey_buf *last_flushed,
+ unsigned flags)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent,
+ last_flushed);
+ int ret = bkey_err(k);
+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+ return 0;
+ if (ret)
+ return ret;
+
+ if (!k.k || !bch2_bkey_has_device_c(k, dev_idx))
+ goto out;
+
+ /*
+ * XXX: pass flags arg to invalidate_stripe_to_dev and handle it
+ * properly
+ */
+
+ if (bkey_is_btree_ptr(k.k))
+ ret = bch2_dev_btree_drop_key(trans, bp, dev_idx, last_flushed, flags);
+ else if (k.k->type == KEY_TYPE_stripe)
+ ret = bch2_invalidate_stripe_to_dev(trans, &iter, k, dev_idx, flags);
+ else
+ ret = bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsigned flags)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+
+ struct bkey_buf last_flushed;
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
+ int ret = bch2_btree_write_buffer_flush_sync(trans) ?:
+ for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers,
+ POS(dev_idx, 0),
+ POS(dev_idx, U64_MAX), 0, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ if (k.k->type != KEY_TYPE_backpointer)
+ continue;
+
+ data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k),
+ &last_flushed, flags);
+
+ }));
+
+ bch2_bkey_buf_exit(&last_flushed, trans->c);
+ bch2_trans_put(trans);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, unsigned flags)
{
struct progress_indicator_state progress;
bch2_progress_init(&progress, c,
diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h
index 027efaa0d575..30018140711b 100644
--- a/fs/bcachefs/migrate.h
+++ b/fs/bcachefs/migrate.h
@@ -2,6 +2,7 @@
#ifndef _BCACHEFS_MIGRATE_H
#define _BCACHEFS_MIGRATE_H
-int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
+int bch2_dev_data_drop_by_backpointers(struct bch_fs *, unsigned, unsigned);
+int bch2_dev_data_drop(struct bch_fs *, unsigned, unsigned);
#endif /* _BCACHEFS_MIGRATE_H */
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index dfdbb9259985..eec591e947bd 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -38,36 +38,80 @@ const char * const bch2_data_ops_strs[] = {
NULL
};
-static void trace_io_move2(struct bch_fs *c, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
+struct evacuate_bucket_arg {
+ struct bpos bucket;
+ int gen;
+ struct data_update_opts data_opts;
+};
+
+static bool evacuate_bucket_pred(struct bch_fs *, void *,
+ enum btree_id, struct bkey_s_c,
+ struct bch_io_opts *,
+ struct data_update_opts *);
+
+static noinline void
+trace_io_move2(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
- if (trace_io_move_enabled()) {
- struct printbuf buf = PRINTBUF;
+ struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
- bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
- trace_io_move(c, buf.buf);
- printbuf_exit(&buf);
- }
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
+ bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
+ trace_io_move(c, buf.buf);
+ printbuf_exit(&buf);
}
-static void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k)
+static noinline void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k)
{
- if (trace_io_move_read_enabled()) {
- struct printbuf buf = PRINTBUF;
+ struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, k);
- trace_io_move_read(c, buf.buf);
- printbuf_exit(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ trace_io_move_read(c, buf.buf);
+ printbuf_exit(&buf);
+}
+
+static noinline void
+trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts,
+ move_pred_fn pred, void *_arg, bool p)
+{
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "%ps: %u", pred, p);
+
+ if (pred == evacuate_bucket_pred) {
+ struct evacuate_bucket_arg *arg = _arg;
+ prt_printf(&buf, " gen=%u", arg->gen);
}
+
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
+ bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
+ trace_io_move_pred(c, buf.buf);
+ printbuf_exit(&buf);
+}
+
+static noinline void
+trace_io_move_evacuate_bucket2(struct bch_fs *c, struct bpos bucket, int gen)
+{
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "bucket: ");
+ bch2_bpos_to_text(&buf, bucket);
+ prt_printf(&buf, " gen: %i\n", gen);
+
+ trace_io_move_evacuate_bucket(c, buf.buf);
+ printbuf_exit(&buf);
}
struct moving_io {
struct list_head read_list;
struct list_head io_list;
- struct move_bucket_in_flight *b;
+ struct move_bucket *b;
struct closure cl;
bool read_completed;
@@ -109,7 +153,6 @@ static void move_write_done(struct bch_write_op *op)
struct printbuf buf = PRINTBUF;
bch2_write_op_to_text(&buf, op);
- prt_printf(&buf, "ret\t%s\n", bch2_err_str(op->error));
trace_io_move_write_fail(c, buf.buf);
printbuf_exit(&buf);
}
@@ -126,26 +169,40 @@ static void move_write_done(struct bch_write_op *op)
static void move_write(struct moving_io *io)
{
+ struct bch_fs *c = io->write.op.c;
struct moving_context *ctxt = io->write.ctxt;
+ struct bch_read_bio *rbio = &io->write.rbio;
if (ctxt->stats) {
- if (io->write.rbio.bio.bi_status)
+ if (rbio->bio.bi_status)
atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
&ctxt->stats->sectors_error_uncorrected);
- else if (io->write.rbio.saw_error)
+ else if (rbio->saw_error)
atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
&ctxt->stats->sectors_error_corrected);
}
- if (unlikely(io->write.rbio.ret ||
- io->write.rbio.bio.bi_status ||
- io->write.data_opts.scrub)) {
+ /*
+ * If the extent has been bitrotted, we're going to have to give it a
+ * new checksum in order to move it - but the poison bit will ensure
+ * that userspace still gets the appropriate error.
+ */
+ if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err &&
+ (bch2_bkey_extent_flags(bkey_i_to_s_c(io->write.k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) {
+ struct bch_extent_crc_unpacked crc = rbio->pick.crc;
+ struct nonce nonce = extent_nonce(rbio->version, crc);
+
+ rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type,
+ nonce, &rbio->bio);
+ rbio->ret = 0;
+ }
+
+ if (unlikely(rbio->ret || io->write.data_opts.scrub)) {
move_free(io);
return;
}
if (trace_io_move_write_enabled()) {
- struct bch_fs *c = io->write.op.c;
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
@@ -275,7 +332,7 @@ void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
}
int bch2_move_extent(struct moving_context *ctxt,
- struct move_bucket_in_flight *bucket_in_flight,
+ struct move_bucket *bucket_in_flight,
struct btree_iter *iter,
struct bkey_s_c k,
struct bch_io_opts io_opts,
@@ -285,7 +342,8 @@ int bch2_move_extent(struct moving_context *ctxt,
struct bch_fs *c = trans->c;
int ret = -ENOMEM;
- trace_io_move2(c, k, &io_opts, &data_opts);
+ if (trace_io_move_enabled())
+ trace_io_move2(c, k, &io_opts, &data_opts);
this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
if (ctxt->stats)
@@ -301,16 +359,14 @@ int bch2_move_extent(struct moving_context *ctxt,
return 0;
}
- /*
- * Before memory allocations & taking nocow locks in
- * bch2_data_update_init():
- */
- bch2_trans_unlock(trans);
-
- struct moving_io *io = kzalloc(sizeof(struct moving_io), GFP_KERNEL);
+ struct moving_io *io = allocate_dropping_locks(trans, ret,
+ kzalloc(sizeof(struct moving_io), _gfp));
if (!io)
goto err;
+ if (ret)
+ goto err_free;
+
INIT_LIST_HEAD(&io->io_list);
io->write.ctxt = ctxt;
io->read_sectors = k.k->size;
@@ -330,6 +386,8 @@ int bch2_move_extent(struct moving_context *ctxt,
io->write.op.c = c;
io->write.data_opts = data_opts;
+ bch2_trans_unlock(trans);
+
ret = bch2_data_update_bios_init(&io->write, c, &io_opts);
if (ret)
goto err_free;
@@ -351,7 +409,8 @@ int bch2_move_extent(struct moving_context *ctxt,
atomic_inc(&io->b->count);
}
- trace_io_move_read2(c, k);
+ if (trace_io_move_read_enabled())
+ trace_io_move_read2(c, k);
mutex_lock(&ctxt->lock);
atomic_add(io->read_sectors, &ctxt->read_sectors);
@@ -377,9 +436,6 @@ int bch2_move_extent(struct moving_context *ctxt,
err_free:
kfree(io);
err:
- if (bch2_err_matches(ret, BCH_ERR_data_update_done))
- return 0;
-
if (bch2_err_matches(ret, EROFS) ||
bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ret;
@@ -395,10 +451,13 @@ err:
trace_io_move_start_fail(c, buf.buf);
printbuf_exit(&buf);
}
+
+ if (bch2_err_matches(ret, BCH_ERR_data_update_done))
+ return 0;
return ret;
}
-static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
+struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
struct per_snapshot_io_opts *io_opts,
struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */
struct btree_iter *extent_iter,
@@ -409,6 +468,9 @@ static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
struct bch_io_opts *opts_ret = &io_opts->fs_io_opts;
int ret = 0;
+ if (extent_iter->min_depth)
+ return opts_ret;
+
if (extent_k.k->type == KEY_TYPE_reflink_v)
goto out;
@@ -480,6 +542,7 @@ int bch2_move_get_io_opts_one(struct btree_trans *trans,
bch2_inode_opts_get(io_opts, c, &inode);
}
bch2_trans_iter_exit(trans, &inode_iter);
+ /* seem to be spinning here? */
out:
return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k);
}
@@ -559,11 +622,11 @@ static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans *
return k;
}
-static int bch2_move_data_btree(struct moving_context *ctxt,
- struct bpos start,
- struct bpos end,
- move_pred_fn pred, void *arg,
- enum btree_id btree_id)
+int bch2_move_data_btree(struct moving_context *ctxt,
+ struct bpos start,
+ struct bpos end,
+ move_pred_fn pred, void *arg,
+ enum btree_id btree_id, unsigned level)
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
@@ -589,11 +652,56 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
ctxt->stats->pos = BBPOS(btree_id, start);
}
+retry_root:
bch2_trans_begin(trans);
- bch2_trans_iter_init(trans, &iter, btree_id, start,
- BTREE_ITER_prefetch|
- BTREE_ITER_not_extents|
- BTREE_ITER_all_snapshots);
+
+ if (level == bch2_btree_id_root(c, btree_id)->level + 1) {
+ bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level - 1,
+ BTREE_ITER_prefetch|
+ BTREE_ITER_not_extents|
+ BTREE_ITER_all_snapshots);
+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto root_err;
+
+ if (b != btree_node_root(c, b)) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto retry_root;
+ }
+
+ k = bkey_i_to_s_c(&b->key);
+
+ io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts,
+ iter.pos, &iter, k);
+ ret = PTR_ERR_OR_ZERO(io_opts);
+ if (ret)
+ goto root_err;
+
+ memset(&data_opts, 0, sizeof(data_opts));
+ if (!pred(c, arg, iter.btree_id, k, io_opts, &data_opts))
+ goto out;
+
+
+ if (!data_opts.scrub)
+ ret = bch2_btree_node_rewrite_pos(trans, btree_id, level,
+ k.k->p, data_opts.target, 0);
+ else
+ ret = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev);
+
+root_err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto retry_root;
+ }
+
+ goto out;
+ }
+
+ bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level,
+ BTREE_ITER_prefetch|
+ BTREE_ITER_not_extents|
+ BTREE_ITER_all_snapshots);
if (ctxt->rate)
bch2_ratelimit_reset(ctxt->rate);
@@ -613,7 +721,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
if (ret)
break;
- if (bkey_ge(bkey_start_pos(k.k), end))
+ if (bkey_gt(bkey_start_pos(k.k), end))
break;
if (ctxt->stats)
@@ -653,7 +761,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
continue;
memset(&data_opts, 0, sizeof(data_opts));
- if (!pred(c, arg, k, io_opts, &data_opts))
+ if (!pred(c, arg, extent_iter->btree_id, k, io_opts, &data_opts))
goto next;
/*
@@ -663,7 +771,14 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
- ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts);
+ if (!level)
+ ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts);
+ else if (!data_opts.scrub)
+ ret2 = bch2_btree_node_rewrite_pos(trans, btree_id, level,
+ k.k->p, data_opts.target, 0);
+ else
+ ret2 = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev);
+
if (ret2) {
if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
continue;
@@ -681,9 +796,10 @@ next:
if (ctxt->stats)
atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
next_nondata:
- bch2_btree_iter_advance(trans, &iter);
+ if (!bch2_btree_iter_advance(trans, &iter))
+ break;
}
-
+out:
bch2_trans_iter_exit(trans, &reflink_iter);
bch2_trans_iter_exit(trans, &iter);
bch2_bkey_buf_exit(&sk, c);
@@ -713,7 +829,7 @@ int __bch2_move_data(struct moving_context *ctxt,
ret = bch2_move_data_btree(ctxt,
id == start.btree ? start.pos : POS_MIN,
id == end.btree ? end.pos : POS_MAX,
- pred, arg, id);
+ pred, arg, id, 0);
if (ret)
break;
}
@@ -740,11 +856,12 @@ int bch2_move_data(struct bch_fs *c,
}
static int __bch2_move_data_phys(struct moving_context *ctxt,
- struct move_bucket_in_flight *bucket_in_flight,
+ struct move_bucket *bucket_in_flight,
unsigned dev,
u64 bucket_start,
u64 bucket_end,
unsigned data_types,
+ bool copygc,
move_pred_fn pred, void *arg)
{
struct btree_trans *trans = ctxt->trans;
@@ -755,6 +872,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
struct bkey_buf sk;
struct bkey_s_c k;
struct bkey_buf last_flushed;
+ u64 check_mismatch_done = bucket_start;
int ret = 0;
struct bch_dev *ca = bch2_dev_tryget(c, dev);
@@ -765,8 +883,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
struct bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start));
struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end));
- bch2_dev_put(ca);
- ca = NULL;
bch2_bkey_buf_init(&last_flushed);
bkey_init(&last_flushed.k->k);
@@ -779,10 +895,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0);
- bch_err_msg(c, ret, "looking up alloc key");
- if (ret)
- goto err;
-
ret = bch2_btree_write_buffer_tryflush(trans);
if (!bch2_err_matches(ret, EROFS))
bch_err_msg(c, ret, "flushing btree write buffer");
@@ -805,6 +917,14 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
if (!k.k || bkey_gt(k.k->p, bp_end))
break;
+ if (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) {
+ while (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) {
+ bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++,
+ copygc, &last_flushed);
+ }
+ continue;
+ }
+
if (k.k->type != KEY_TYPE_backpointer)
goto next;
@@ -837,7 +957,13 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
}
struct data_update_opts data_opts = {};
- if (!pred(c, arg, k, &io_opts, &data_opts)) {
+ bool p = pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts);
+
+ if (trace_io_move_pred_enabled())
+ trace_io_move_pred2(c, k, &io_opts, &data_opts,
+ pred, arg, p);
+
+ if (!p) {
bch2_trans_iter_exit(trans, &iter);
goto next;
}
@@ -845,7 +971,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
if (data_opts.scrub &&
!bch2_dev_idx_is_online(c, data_opts.read_dev)) {
bch2_trans_iter_exit(trans, &iter);
- ret = -BCH_ERR_device_offline;
+ ret = bch_err_throw(c, device_offline);
break;
}
@@ -858,7 +984,8 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
if (!bp.v->level)
ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts);
else if (!data_opts.scrub)
- ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0);
+ ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level,
+ k.k->p, data_opts.target, 0);
else
ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev);
@@ -879,45 +1006,48 @@ static int __bch2_move_data_phys(struct moving_context *ctxt,
next:
bch2_btree_iter_advance(trans, &bp_iter);
}
+
+ while (check_mismatch_done < bucket_end)
+ bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++,
+ copygc, &last_flushed);
err:
bch2_trans_iter_exit(trans, &bp_iter);
bch2_bkey_buf_exit(&sk, c);
bch2_bkey_buf_exit(&last_flushed, c);
+ bch2_dev_put(ca);
return ret;
}
-static int bch2_move_data_phys(struct bch_fs *c,
- unsigned dev,
- u64 start,
- u64 end,
- unsigned data_types,
- struct bch_ratelimit *rate,
- struct bch_move_stats *stats,
- struct write_point_specifier wp,
- bool wait_on_copygc,
- move_pred_fn pred, void *arg)
+int bch2_move_data_phys(struct bch_fs *c,
+ unsigned dev,
+ u64 start,
+ u64 end,
+ unsigned data_types,
+ struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
+ struct write_point_specifier wp,
+ bool wait_on_copygc,
+ move_pred_fn pred, void *arg)
{
struct moving_context ctxt;
bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans));
bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
- ctxt.stats->phys = true;
- ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys;
+ if (ctxt.stats) {
+ ctxt.stats->phys = true;
+ ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys;
+ }
- int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, data_types, pred, arg);
+ int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end,
+ data_types, false, pred, arg);
bch2_moving_ctxt_exit(&ctxt);
return ret;
}
-struct evacuate_bucket_arg {
- struct bpos bucket;
- int gen;
- struct data_update_opts data_opts;
-};
-
-static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, struct bkey_s_c k,
+static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg,
+ enum btree_id btree, struct bkey_s_c k,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
@@ -938,17 +1068,23 @@ static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, struct bkey_s_c k
}
int bch2_evacuate_bucket(struct moving_context *ctxt,
- struct move_bucket_in_flight *bucket_in_flight,
- struct bpos bucket, int gen,
- struct data_update_opts data_opts)
+ struct move_bucket *bucket_in_flight,
+ struct bpos bucket, int gen,
+ struct data_update_opts data_opts)
{
+ struct bch_fs *c = ctxt->trans->c;
struct evacuate_bucket_arg arg = { bucket, gen, data_opts, };
+ count_event(c, io_move_evacuate_bucket);
+ if (trace_io_move_evacuate_bucket_enabled())
+ trace_io_move_evacuate_bucket2(c, bucket, gen);
+
return __bch2_move_data_phys(ctxt, bucket_in_flight,
bucket.inode,
bucket.offset,
bucket.offset + 1,
~0,
+ true,
evacuate_bucket_pred, &arg);
}
@@ -1006,7 +1142,7 @@ retry:
if (!pred(c, arg, b, &io_opts, &data_opts))
goto next;
- ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0) ?: ret;
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
@@ -1031,7 +1167,7 @@ next:
}
static bool rereplicate_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
+ enum btree_id btree, struct bkey_s_c k,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
@@ -1040,7 +1176,7 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg,
? c->opts.metadata_replicas
: io_opts->data_replicas;
- rcu_read_lock();
+ guard(rcu)();
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
unsigned i = 0;
bkey_for_each_ptr(ptrs, ptr) {
@@ -1050,7 +1186,6 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg,
data_opts->kill_ptrs |= BIT(i);
i++;
}
- rcu_read_unlock();
if (!data_opts->kill_ptrs &&
(!nr_good || nr_good >= replicas))
@@ -1063,7 +1198,7 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg,
}
static bool migrate_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
+ enum btree_id btree, struct bkey_s_c k,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
@@ -1090,7 +1225,7 @@ static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
- return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
+ return rereplicate_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}
/*
@@ -1146,7 +1281,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
}
static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
+ enum btree_id btree, struct bkey_s_c k,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
@@ -1158,7 +1293,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
struct extent_ptr_decoded p;
unsigned i = 0;
- rcu_read_lock();
+ guard(rcu)();
bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
unsigned d = bch2_extent_ptr_durability(c, &p);
@@ -1169,7 +1304,6 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
i++;
}
- rcu_read_unlock();
return data_opts->kill_ptrs != 0;
}
@@ -1179,11 +1313,12 @@ static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
- return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
+ return drop_extra_replicas_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key),
+ io_opts, data_opts);
}
static bool scrub_pred(struct bch_fs *c, void *_arg,
- struct bkey_s_c k,
+ enum btree_id btree, struct bkey_s_c k,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
index 51e0505a8156..86b80499ac55 100644
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
@@ -72,7 +72,7 @@ do { \
break; \
} while (1)
-typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c,
+typedef bool (*move_pred_fn)(struct bch_fs *, void *, enum btree_id, struct bkey_s_c,
struct bch_io_opts *, struct data_update_opts *);
extern const char * const bch2_data_ops_strs[];
@@ -116,12 +116,18 @@ int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *,
int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
int bch2_move_extent(struct moving_context *,
- struct move_bucket_in_flight *,
+ struct move_bucket *,
struct btree_iter *,
struct bkey_s_c,
struct bch_io_opts,
struct data_update_opts);
+struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *,
+ struct per_snapshot_io_opts *, struct bpos,
+ struct btree_iter *, struct bkey_s_c);
+
+int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos,
+ move_pred_fn, void *, enum btree_id, unsigned);
int __bch2_move_data(struct moving_context *,
struct bbpos,
struct bbpos,
@@ -135,8 +141,13 @@ int bch2_move_data(struct bch_fs *,
bool,
move_pred_fn, void *);
+int bch2_move_data_phys(struct bch_fs *, unsigned, u64, u64, unsigned,
+ struct bch_ratelimit *, struct bch_move_stats *,
+ struct write_point_specifier, bool,
+ move_pred_fn, void *);
+
int bch2_evacuate_bucket(struct moving_context *,
- struct move_bucket_in_flight *,
+ struct move_bucket *,
struct bpos, int,
struct data_update_opts);
int bch2_data_job(struct bch_fs *,
diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
index 807f779f6f76..c5c62cd600de 100644
--- a/fs/bcachefs/move_types.h
+++ b/fs/bcachefs/move_types.h
@@ -36,14 +36,10 @@ struct move_bucket_key {
};
struct move_bucket {
+ struct move_bucket *next;
+ struct rhash_head hash;
struct move_bucket_key k;
unsigned sectors;
-};
-
-struct move_bucket_in_flight {
- struct move_bucket_in_flight *next;
- struct rhash_head hash;
- struct move_bucket bucket;
atomic_t count;
};
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 96873372b516..6d7b1d5f7697 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -8,6 +8,7 @@
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "backpointers.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
@@ -27,47 +28,32 @@
#include <linux/wait.h>
struct buckets_in_flight {
- struct rhashtable table;
- struct move_bucket_in_flight *first;
- struct move_bucket_in_flight *last;
- size_t nr;
- size_t sectors;
+ struct rhashtable table;
+ struct move_bucket *first;
+ struct move_bucket *last;
+ size_t nr;
+ size_t sectors;
+
+ DARRAY(struct move_bucket *) to_evacuate;
};
static const struct rhashtable_params bch_move_bucket_params = {
- .head_offset = offsetof(struct move_bucket_in_flight, hash),
- .key_offset = offsetof(struct move_bucket_in_flight, bucket.k),
+ .head_offset = offsetof(struct move_bucket, hash),
+ .key_offset = offsetof(struct move_bucket, k),
.key_len = sizeof(struct move_bucket_key),
.automatic_shrinking = true,
};
-static struct move_bucket_in_flight *
-move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b)
+static void move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket *b)
{
- struct move_bucket_in_flight *new = kzalloc(sizeof(*new), GFP_KERNEL);
- int ret;
-
- if (!new)
- return ERR_PTR(-ENOMEM);
-
- new->bucket = b;
-
- ret = rhashtable_lookup_insert_fast(&list->table, &new->hash,
- bch_move_bucket_params);
- if (ret) {
- kfree(new);
- return ERR_PTR(ret);
- }
-
if (!list->first)
- list->first = new;
+ list->first = b;
else
- list->last->next = new;
+ list->last->next = b;
- list->last = new;
+ list->last = b;
list->nr++;
- list->sectors += b.sectors;
- return new;
+ list->sectors += b->sectors;
}
static int bch2_bucket_is_movable(struct btree_trans *trans,
@@ -89,9 +75,12 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
if (!ca)
goto out;
+ if (bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b->k.bucket.offset))
+ goto out;
+
if (ca->mi.state != BCH_MEMBER_STATE_rw ||
!bch2_dev_is_online(ca))
- goto out_put;
+ goto out;
struct bch_alloc_v4 _a;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
@@ -100,19 +89,26 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
ret = lru_idx && lru_idx <= time;
-out_put:
- bch2_dev_put(ca);
out:
+ bch2_dev_put(ca);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
+static void move_bucket_free(struct buckets_in_flight *list,
+ struct move_bucket *b)
+{
+ int ret = rhashtable_remove_fast(&list->table, &b->hash,
+ bch_move_bucket_params);
+ BUG_ON(ret);
+ kfree(b);
+}
+
static void move_buckets_wait(struct moving_context *ctxt,
struct buckets_in_flight *list,
bool flush)
{
- struct move_bucket_in_flight *i;
- int ret;
+ struct move_bucket *i;
while ((i = list->first)) {
if (flush)
@@ -126,12 +122,9 @@ static void move_buckets_wait(struct moving_context *ctxt,
list->last = NULL;
list->nr--;
- list->sectors -= i->bucket.sectors;
+ list->sectors -= i->sectors;
- ret = rhashtable_remove_fast(&list->table, &i->hash,
- bch_move_bucket_params);
- BUG_ON(ret);
- kfree(i);
+ move_bucket_free(list, i);
}
bch2_trans_unlock_long(ctxt->trans);
@@ -143,11 +136,8 @@ static bool bucket_in_flight(struct buckets_in_flight *list,
return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params);
}
-typedef DARRAY(struct move_bucket) move_buckets;
-
static int bch2_copygc_get_buckets(struct moving_context *ctxt,
- struct buckets_in_flight *buckets_in_flight,
- move_buckets *buckets)
+ struct buckets_in_flight *buckets_in_flight)
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
@@ -164,8 +154,6 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
return ret;
- bch2_trans_begin(trans);
-
ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0),
lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX),
@@ -184,20 +172,34 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
else if (bucket_in_flight(buckets_in_flight, b.k))
in_flight++;
else {
- ret2 = darray_push(buckets, b);
+ struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL);
+ ret2 = b_i ? 0 : -ENOMEM;
if (ret2)
goto err;
+
+ *b_i = b;
+
+ ret2 = darray_push(&buckets_in_flight->to_evacuate, b_i);
+ if (ret2) {
+ kfree(b_i);
+ goto err;
+ }
+
+ ret2 = rhashtable_lookup_insert_fast(&buckets_in_flight->table, &b_i->hash,
+ bch_move_bucket_params);
+ BUG_ON(ret2);
+
sectors += b.sectors;
}
- ret2 = buckets->nr >= nr_to_get;
+ ret2 = buckets_in_flight->to_evacuate.nr >= nr_to_get;
err:
ret2;
}));
pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i",
buckets_in_flight->nr, buckets_in_flight->sectors,
- saw, in_flight, not_movable, buckets->nr, sectors, nr_to_get, ret);
+ saw, in_flight, not_movable, buckets_in_flight->to_evacuate.nr, sectors, nr_to_get, ret);
return ret < 0 ? ret : 0;
}
@@ -212,40 +214,30 @@ static int bch2_copygc(struct moving_context *ctxt,
struct data_update_opts data_opts = {
.btree_insert_flags = BCH_WATERMARK_copygc,
};
- move_buckets buckets = { 0 };
- struct move_bucket_in_flight *f;
u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen);
u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved);
int ret = 0;
- ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets);
+ ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight);
if (ret)
goto err;
- darray_for_each(buckets, i) {
+ darray_for_each(buckets_in_flight->to_evacuate, i) {
if (kthread_should_stop() || freezing(current))
break;
- f = move_bucket_in_flight_add(buckets_in_flight, *i);
- ret = PTR_ERR_OR_ZERO(f);
- if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */
- ret = 0;
- continue;
- }
- if (ret == -ENOMEM) { /* flush IO, continue later */
- ret = 0;
- break;
- }
+ struct move_bucket *b = *i;
+ *i = NULL;
+
+ move_bucket_in_flight_add(buckets_in_flight, b);
- ret = bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket,
- f->bucket.k.gen, data_opts);
+ ret = bch2_evacuate_bucket(ctxt, b, b->k.bucket, b->k.gen, data_opts);
if (ret)
goto err;
*did_work = true;
}
err:
-
/* no entries in LRU btree found, or got to end: */
if (bch2_err_matches(ret, ENOENT))
ret = 0;
@@ -255,12 +247,34 @@ err:
sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen;
sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved;
- trace_and_count(c, copygc, c, buckets.nr, sectors_seen, sectors_moved);
+ trace_and_count(c, copygc, c, buckets_in_flight->to_evacuate.nr, sectors_seen, sectors_moved);
- darray_exit(&buckets);
+ darray_for_each(buckets_in_flight->to_evacuate, i)
+ if (*i)
+ move_bucket_free(buckets_in_flight, *i);
+ darray_exit(&buckets_in_flight->to_evacuate);
return ret;
}
+static u64 bch2_copygc_dev_wait_amount(struct bch_dev *ca)
+{
+ struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca);
+ struct bch_dev_usage usage;
+
+ for (unsigned i = 0; i < BCH_DATA_NR; i++)
+ usage.buckets[i] = usage_full.d[i].buckets;
+
+ s64 fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) *
+ ca->mi.bucket_size) >> 1);
+ s64 fragmented = 0;
+
+ for (unsigned i = 0; i < BCH_DATA_NR; i++)
+ if (data_type_movable(i))
+ fragmented += usage_full.d[i].fragmented;
+
+ return max(0LL, fragmented_allowed - fragmented);
+}
+
/*
* Copygc runs when the amount of fragmented data is above some arbitrary
* threshold:
@@ -275,28 +289,13 @@ err:
* often and continually reduce the amount of fragmented space as the device
* fills up. So, we increase the threshold by half the current free space.
*/
-unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
+u64 bch2_copygc_wait_amount(struct bch_fs *c)
{
- s64 wait = S64_MAX, fragmented_allowed, fragmented;
-
- for_each_rw_member(c, ca) {
- struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca);
- struct bch_dev_usage usage;
-
- for (unsigned i = 0; i < BCH_DATA_NR; i++)
- usage.buckets[i] = usage_full.d[i].buckets;
-
- fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) *
- ca->mi.bucket_size) >> 1);
- fragmented = 0;
-
- for (unsigned i = 0; i < BCH_DATA_NR; i++)
- if (data_type_movable(i))
- fragmented += usage_full.d[i].fragmented;
-
- wait = min(wait, max(0LL, fragmented_allowed - fragmented));
- }
+ u64 wait = U64_MAX;
+ guard(rcu)();
+ for_each_rw_member_rcu(c, ca)
+ wait = min(wait, bch2_copygc_dev_wait_amount(ca));
return wait;
}
@@ -318,15 +317,23 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
c->copygc_wait_at) << 9);
prt_newline(out);
- prt_printf(out, "Currently calculated wait:\t");
- prt_human_readable_u64(out, bch2_copygc_wait_amount(c));
- prt_newline(out);
+ bch2_printbuf_make_room(out, 4096);
- rcu_read_lock();
- struct task_struct *t = rcu_dereference(c->copygc_thread);
- if (t)
- get_task_struct(t);
- rcu_read_unlock();
+ struct task_struct *t;
+ out->atomic++;
+ scoped_guard(rcu) {
+ prt_printf(out, "Currently calculated wait:\n");
+ for_each_rw_member_rcu(c, ca) {
+ prt_printf(out, " %s:\t", ca->name);
+ prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca));
+ prt_newline(out);
+ }
+
+ t = rcu_dereference(c->copygc_thread);
+ if (t)
+ get_task_struct(t);
+ }
+ --out->atomic;
if (t) {
bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
@@ -340,19 +347,13 @@ static int bch2_copygc_thread(void *arg)
struct moving_context ctxt;
struct bch_move_stats move_stats;
struct io_clock *clock = &c->io_clock[WRITE];
- struct buckets_in_flight *buckets;
+ struct buckets_in_flight buckets = {};
u64 last, wait;
- int ret = 0;
- buckets = kzalloc(sizeof(struct buckets_in_flight), GFP_KERNEL);
- if (!buckets)
- return -ENOMEM;
- ret = rhashtable_init(&buckets->table, &bch_move_bucket_params);
+ int ret = rhashtable_init(&buckets.table, &bch_move_bucket_params);
bch_err_msg(c, ret, "allocating copygc buckets in flight");
- if (ret) {
- kfree(buckets);
+ if (ret)
return ret;
- }
set_freezable();
@@ -360,7 +361,7 @@ static int bch2_copygc_thread(void *arg)
* Data move operations can't run until after check_snapshots has
* completed, and bch2_snapshot_is_ancestor() is available.
*/
- kthread_wait_freezable(c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots ||
+ kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots ||
kthread_should_stop());
bch2_move_stats_init(&move_stats, "copygc");
@@ -375,13 +376,13 @@ static int bch2_copygc_thread(void *arg)
cond_resched();
if (!c->opts.copygc_enabled) {
- move_buckets_wait(&ctxt, buckets, true);
+ move_buckets_wait(&ctxt, &buckets, true);
kthread_wait_freezable(c->opts.copygc_enabled ||
kthread_should_stop());
}
if (unlikely(freezing(current))) {
- move_buckets_wait(&ctxt, buckets, true);
+ move_buckets_wait(&ctxt, &buckets, true);
__refrigerator(false);
continue;
}
@@ -392,7 +393,7 @@ static int bch2_copygc_thread(void *arg)
if (wait > clock->max_slop) {
c->copygc_wait_at = last;
c->copygc_wait = last + wait;
- move_buckets_wait(&ctxt, buckets, true);
+ move_buckets_wait(&ctxt, &buckets, true);
trace_and_count(c, copygc_wait, c, wait, last + wait);
bch2_kthread_io_clock_wait(clock, last + wait,
MAX_SCHEDULE_TIMEOUT);
@@ -402,7 +403,7 @@ static int bch2_copygc_thread(void *arg)
c->copygc_wait = 0;
c->copygc_running = true;
- ret = bch2_copygc(&ctxt, buckets, &did_work);
+ ret = bch2_copygc(&ctxt, &buckets, &did_work);
c->copygc_running = false;
wake_up(&c->copygc_running_wq);
@@ -413,16 +414,14 @@ static int bch2_copygc_thread(void *arg)
if (min_member_capacity == U64_MAX)
min_member_capacity = 128 * 2048;
- move_buckets_wait(&ctxt, buckets, true);
+ move_buckets_wait(&ctxt, &buckets, true);
bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
MAX_SCHEDULE_TIMEOUT);
}
}
- move_buckets_wait(&ctxt, buckets, true);
-
- rhashtable_destroy(&buckets->table);
- kfree(buckets);
+ move_buckets_wait(&ctxt, &buckets, true);
+ rhashtable_destroy(&buckets.table);
bch2_moving_ctxt_exit(&ctxt);
bch2_move_stats_exit(&move_stats, c);
diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h
index d1885cf67a45..f615910d6f98 100644
--- a/fs/bcachefs/movinggc.h
+++ b/fs/bcachefs/movinggc.h
@@ -2,16 +2,15 @@
#ifndef _BCACHEFS_MOVINGGC_H
#define _BCACHEFS_MOVINGGC_H
-unsigned long bch2_copygc_wait_amount(struct bch_fs *);
+u64 bch2_copygc_wait_amount(struct bch_fs *);
void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *);
static inline void bch2_copygc_wakeup(struct bch_fs *c)
{
- rcu_read_lock();
+ guard(rcu)();
struct task_struct *p = rcu_dereference(c->copygc_thread);
if (p)
wake_up_process(p);
- rcu_read_unlock();
}
void bch2_copygc_stop(struct bch_fs *);
diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c
index 9136a9097789..24120037c031 100644
--- a/fs/bcachefs/namei.c
+++ b/fs/bcachefs/namei.c
@@ -11,6 +11,14 @@
#include <linux/posix_acl.h>
+static inline subvol_inum parent_inum(subvol_inum inum, struct bch_inode_unpacked *inode)
+{
+ return (subvol_inum) {
+ .subvol = inode->bi_parent_subvol ?: inum.subvol,
+ .inum = inode->bi_dir,
+ };
+}
+
static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode)
{
return S_ISDIR(inode->bi_mode) && !inode->bi_subvol;
@@ -49,7 +57,7 @@ int bch2_create_trans(struct btree_trans *trans,
if (!(flags & BCH_CREATE_SNAPSHOT)) {
/* Normal create path - allocate a new inode: */
- bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
+ bch2_inode_init_late(c, new_inode, now, uid, gid, mode, rdev, dir_u);
if (flags & BCH_CREATE_TMPFILE)
new_inode->bi_flags |= BCH_INODE_unlinked;
@@ -279,7 +287,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
}
if (deleting_subvol && !inode_u->bi_subvol) {
- ret = -BCH_ERR_ENOENT_not_subvol;
+ ret = bch_err_throw(c, ENOENT_not_subvol);
goto err;
}
@@ -404,8 +412,7 @@ int bch2_rename_trans(struct btree_trans *trans,
src_hash = bch2_hash_info_init(c, src_dir_u);
- if (dst_dir.inum != src_dir.inum ||
- dst_dir.subvol != src_dir.subvol) {
+ if (!subvol_inum_eq(dst_dir, src_dir)) {
ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir,
BTREE_ITER_intent);
if (ret)
@@ -418,8 +425,8 @@ int bch2_rename_trans(struct btree_trans *trans,
}
ret = bch2_dirent_rename(trans,
- src_dir, &src_hash, &src_dir_u->bi_size,
- dst_dir, &dst_hash, &dst_dir_u->bi_size,
+ src_dir, &src_hash,
+ dst_dir, &dst_hash,
src_name, &src_inum, &src_offset,
dst_name, &dst_inum, &dst_offset,
mode);
@@ -497,32 +504,41 @@ int bch2_rename_trans(struct btree_trans *trans,
}
}
- if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) &&
- S_ISDIR(src_inode_u->bi_mode)) {
- ret = -EXDEV;
- goto err;
- }
+ if (!subvol_inum_eq(dst_dir, src_dir)) {
+ if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) &&
+ S_ISDIR(src_inode_u->bi_mode)) {
+ ret = -EXDEV;
+ goto err;
+ }
- if (mode == BCH_RENAME_EXCHANGE &&
- bch2_reinherit_attrs(dst_inode_u, src_dir_u) &&
- S_ISDIR(dst_inode_u->bi_mode)) {
- ret = -EXDEV;
- goto err;
- }
+ if (mode == BCH_RENAME_EXCHANGE &&
+ bch2_reinherit_attrs(dst_inode_u, src_dir_u) &&
+ S_ISDIR(dst_inode_u->bi_mode)) {
+ ret = -EXDEV;
+ goto err;
+ }
- if (is_subdir_for_nlink(src_inode_u)) {
- src_dir_u->bi_nlink--;
- dst_dir_u->bi_nlink++;
- }
+ ret = bch2_maybe_propagate_has_case_insensitive(trans, src_inum, src_inode_u) ?:
+ (mode == BCH_RENAME_EXCHANGE
+ ? bch2_maybe_propagate_has_case_insensitive(trans, dst_inum, dst_inode_u)
+ : 0);
+ if (ret)
+ goto err;
- if (S_ISDIR(src_inode_u->bi_mode) &&
- !src_inode_u->bi_subvol)
- src_inode_u->bi_depth = dst_dir_u->bi_depth + 1;
+ if (is_subdir_for_nlink(src_inode_u)) {
+ src_dir_u->bi_nlink--;
+ dst_dir_u->bi_nlink++;
+ }
- if (mode == BCH_RENAME_EXCHANGE &&
- S_ISDIR(dst_inode_u->bi_mode) &&
- !dst_inode_u->bi_subvol)
- dst_inode_u->bi_depth = src_dir_u->bi_depth + 1;
+ if (S_ISDIR(src_inode_u->bi_mode) &&
+ !src_inode_u->bi_subvol)
+ src_inode_u->bi_depth = dst_dir_u->bi_depth + 1;
+
+ if (mode == BCH_RENAME_EXCHANGE &&
+ S_ISDIR(dst_inode_u->bi_mode) &&
+ !dst_inode_u->bi_subvol)
+ dst_inode_u->bi_depth = src_dir_u->bi_depth + 1;
+ }
if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) {
dst_dir_u->bi_nlink--;
@@ -593,31 +609,39 @@ static inline void reverse_bytes(void *b, size_t n)
}
}
-/* XXX: we don't yet attempt to print paths when we don't know the subvol */
-int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printbuf *path)
+static int __bch2_inum_to_path(struct btree_trans *trans,
+ u32 subvol, u64 inum, u32 snapshot,
+ struct printbuf *path)
{
unsigned orig_pos = path->pos;
int ret = 0;
- while (!(inum.subvol == BCACHEFS_ROOT_SUBVOL &&
- inum.inum == BCACHEFS_ROOT_INO)) {
+ while (true) {
+ if (!snapshot) {
+ ret = bch2_subvolume_get_snapshot(trans, subvol, &snapshot);
+ if (ret)
+ goto disconnected;
+ }
+
struct bch_inode_unpacked inode;
- ret = bch2_inode_find_by_inum_trans(trans, inum, &inode);
+ ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, &inode, 0);
if (ret)
goto disconnected;
+ if (inode.bi_subvol == BCACHEFS_ROOT_SUBVOL &&
+ inode.bi_inum == BCACHEFS_ROOT_INO)
+ break;
+
if (!inode.bi_dir && !inode.bi_dir_offset) {
- ret = -BCH_ERR_ENOENT_inode_no_backpointer;
+ ret = bch_err_throw(trans->c, ENOENT_inode_no_backpointer);
goto disconnected;
}
- inum.subvol = inode.bi_parent_subvol ?: inum.subvol;
- inum.inum = inode.bi_dir;
-
- u32 snapshot;
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- goto disconnected;
+ inum = inode.bi_dir;
+ if (inode.bi_parent_subvol) {
+ subvol = inode.bi_parent_subvol;
+ snapshot = 0;
+ }
struct btree_iter d_iter;
struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter,
@@ -654,6 +678,20 @@ disconnected:
goto out;
}
+int bch2_inum_to_path(struct btree_trans *trans,
+ subvol_inum inum,
+ struct printbuf *path)
+{
+ return __bch2_inum_to_path(trans, inum.subvol, inum.inum, 0, path);
+}
+
+int bch2_inum_snapshot_to_path(struct btree_trans *trans, u64 inum, u32 snapshot,
+ snapshot_id_list *snapshot_overwrites,
+ struct printbuf *path)
+{
+ return __bch2_inum_to_path(trans, 0, inum, snapshot, path);
+}
+
/* fsck */
static int bch2_check_dirent_inode_dirent(struct btree_trans *trans,
@@ -695,15 +733,6 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans,
return __bch2_fsck_write_inode(trans, target);
}
- if (bch2_inode_should_have_single_bp(target) &&
- !fsck_err(trans, inode_wrong_backpointer,
- "dirent points to inode that does not point back:\n%s",
- (bch2_bkey_val_to_text(&buf, c, d.s_c),
- prt_newline(&buf),
- bch2_inode_unpacked_to_text(&buf, target),
- buf.buf)))
- goto err;
-
struct bkey_s_c_dirent bp_dirent =
bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents,
SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot),
@@ -730,6 +759,7 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans,
ret = __bch2_fsck_write_inode(trans, target);
}
} else {
+ printbuf_reset(&buf);
bch2_bkey_val_to_text(&buf, c, d.s_c);
prt_newline(&buf);
bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
@@ -819,7 +849,8 @@ int __bch2_check_dirent_target(struct btree_trans *trans,
n->v.d_inum = cpu_to_le64(target->bi_inum);
}
- ret = bch2_trans_update(trans, dirent_iter, &n->k_i, 0);
+ ret = bch2_trans_update(trans, dirent_iter, &n->k_i,
+ BTREE_UPDATE_internal_snapshot_node);
if (ret)
goto err;
}
@@ -829,3 +860,149 @@ fsck_err:
bch_err_fn(c, ret);
return ret;
}
+
+/*
+ * BCH_INODE_has_case_insensitive:
+ * We have to track whether directories have any descendent directory that is
+ * casefolded - for overlayfs:
+ */
+
+static int bch2_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum)
+{
+ struct btree_iter iter = {};
+ int ret = 0;
+
+ while (true) {
+ struct bch_inode_unpacked inode;
+ ret = bch2_inode_peek(trans, &iter, &inode, inum,
+ BTREE_ITER_intent|BTREE_ITER_with_updates);
+ if (ret)
+ break;
+
+ if (inode.bi_flags & BCH_INODE_has_case_insensitive)
+ break;
+
+ inode.bi_flags |= BCH_INODE_has_case_insensitive;
+ ret = bch2_inode_write(trans, &iter, &inode);
+ if (ret)
+ break;
+
+ bch2_trans_iter_exit(trans, &iter);
+ if (subvol_inum_eq(inum, BCACHEFS_ROOT_SUBVOL_INUM))
+ break;
+
+ inum = parent_inum(inum, &inode);
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum,
+ struct bch_inode_unpacked *inode)
+{
+ if (!bch2_inode_casefold(trans->c, inode))
+ return 0;
+
+ inode->bi_flags |= BCH_INODE_has_case_insensitive;
+
+ return bch2_propagate_has_case_insensitive(trans, parent_inum(inum, inode));
+}
+
+int bch2_check_inode_has_case_insensitive(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ snapshot_id_list *snapshot_overwrites,
+ bool *do_update)
+{
+ struct printbuf buf = PRINTBUF;
+ bool repairing_parents = false;
+ int ret = 0;
+
+ if (!S_ISDIR(inode->bi_mode)) {
+ /*
+ * Old versions set bi_casefold for non dirs, but that's
+ * unnecessary and wasteful
+ */
+ if (inode->bi_casefold) {
+ inode->bi_casefold = 0;
+ *do_update = true;
+ }
+ return 0;
+ }
+
+ if (trans->c->sb.version < bcachefs_metadata_version_inode_has_case_insensitive)
+ return 0;
+
+ if (bch2_inode_casefold(trans->c, inode) &&
+ !(inode->bi_flags & BCH_INODE_has_case_insensitive)) {
+ prt_printf(&buf, "casefolded dir with has_case_insensitive not set\ninum %llu:%u ",
+ inode->bi_inum, inode->bi_snapshot);
+
+ ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, inode->bi_snapshot,
+ snapshot_overwrites, &buf);
+ if (ret)
+ goto err;
+
+ if (fsck_err(trans, inode_has_case_insensitive_not_set, "%s", buf.buf)) {
+ inode->bi_flags |= BCH_INODE_has_case_insensitive;
+ *do_update = true;
+ }
+ }
+
+ if (!(inode->bi_flags & BCH_INODE_has_case_insensitive))
+ goto out;
+
+ struct bch_inode_unpacked dir = *inode;
+ u32 snapshot = dir.bi_snapshot;
+
+ while (!(dir.bi_inum == BCACHEFS_ROOT_INO &&
+ dir.bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
+ if (dir.bi_parent_subvol) {
+ ret = bch2_subvolume_get_snapshot(trans, dir.bi_parent_subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ snapshot_overwrites = NULL;
+ }
+
+ ret = bch2_inode_find_by_inum_snapshot(trans, dir.bi_dir, snapshot, &dir, 0);
+ if (ret)
+ goto err;
+
+ if (!(dir.bi_flags & BCH_INODE_has_case_insensitive)) {
+ prt_printf(&buf, "parent of casefolded dir with has_case_insensitive not set\n");
+
+ ret = bch2_inum_snapshot_to_path(trans, dir.bi_inum, dir.bi_snapshot,
+ snapshot_overwrites, &buf);
+ if (ret)
+ goto err;
+
+ if (fsck_err(trans, inode_parent_has_case_insensitive_not_set, "%s", buf.buf)) {
+ dir.bi_flags |= BCH_INODE_has_case_insensitive;
+ ret = __bch2_fsck_write_inode(trans, &dir);
+ if (ret)
+ goto err;
+ }
+ }
+
+ /*
+ * We only need to check the first parent, unless we find an
+ * inconsistency
+ */
+ if (!repairing_parents)
+ break;
+ }
+out:
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ if (ret)
+ return ret;
+
+ if (repairing_parents) {
+ return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_nested;
+ }
+
+ return 0;
+}
diff --git a/fs/bcachefs/namei.h b/fs/bcachefs/namei.h
index 2e6f6364767f..ae6ebc2d0785 100644
--- a/fs/bcachefs/namei.h
+++ b/fs/bcachefs/namei.h
@@ -43,6 +43,8 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *,
struct bch_inode_unpacked *);
int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *);
+int bch2_inum_snapshot_to_path(struct btree_trans *, u64, u32,
+ snapshot_id_list *, struct printbuf *);
int __bch2_check_dirent_target(struct btree_trans *,
struct btree_iter *,
@@ -69,4 +71,9 @@ static inline int bch2_check_dirent_target(struct btree_trans *trans,
return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck);
}
+int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *);
+int bch2_check_inode_has_case_insensitive(struct btree_trans *, struct bch_inode_unpacked *,
+ snapshot_id_list *, bool *);
+
#endif /* _BCACHEFS_NAMEI_H */
diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c
index 3c21981a4a1c..962218fa68ec 100644
--- a/fs/bcachefs/nocow_locking.c
+++ b/fs/bcachefs/nocow_locking.c
@@ -133,12 +133,10 @@ void bch2_fs_nocow_locking_exit(struct bch_fs *c)
BUG_ON(atomic_read(&l->l[j]));
}
-int bch2_fs_nocow_locking_init(struct bch_fs *c)
+void bch2_fs_nocow_locking_init_early(struct bch_fs *c)
{
struct bucket_nocow_lock_table *t = &c->nocow_locks;
for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++)
spin_lock_init(&l->lock);
-
- return 0;
}
diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h
index f9d6a426a960..48b8a003c0d2 100644
--- a/fs/bcachefs/nocow_locking.h
+++ b/fs/bcachefs/nocow_locking.h
@@ -45,6 +45,6 @@ static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t,
void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *);
void bch2_fs_nocow_locking_exit(struct bch_fs *);
-int bch2_fs_nocow_locking_init(struct bch_fs *);
+void bch2_fs_nocow_locking_init_early(struct bch_fs *);
#endif /* _BCACHEFS_NOCOW_LOCKING_H */
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index af3258814822..b1cf88905b81 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -7,7 +7,9 @@
#include "compress.h"
#include "disk_groups.h"
#include "error.h"
+#include "movinggc.h"
#include "opts.h"
+#include "rebalance.h"
#include "recovery_passes.h"
#include "super-io.h"
#include "util.h"
@@ -19,6 +21,11 @@ const char * const bch2_error_actions[] = {
NULL
};
+const char * const bch2_degraded_actions[] = {
+ BCH_DEGRADED_ACTIONS()
+ NULL
+};
+
const char * const bch2_fsck_fix_opts[] = {
BCH_FIX_ERRORS_OPTS()
NULL
@@ -273,20 +280,20 @@ int bch2_opt_lookup(const char *name)
return -1;
}
-struct synonym {
+struct opt_synonym {
const char *s1, *s2;
};
-static const struct synonym bch_opt_synonyms[] = {
+static const struct opt_synonym bch2_opt_synonyms[] = {
{ "quota", "usrquota" },
};
static int bch2_mount_opt_lookup(const char *name)
{
- const struct synonym *i;
+ const struct opt_synonym *i;
- for (i = bch_opt_synonyms;
- i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms);
+ for (i = bch2_opt_synonyms;
+ i < bch2_opt_synonyms + ARRAY_SIZE(bch2_opt_synonyms);
i++)
if (!strcmp(name, i->s1))
name = i->s2;
@@ -294,6 +301,30 @@ static int bch2_mount_opt_lookup(const char *name)
return bch2_opt_lookup(name);
}
+struct opt_val_synonym {
+ const char *opt, *v1, *v2;
+};
+
+static const struct opt_val_synonym bch2_opt_val_synonyms[] = {
+ { "degraded", "true", "yes" },
+ { "degraded", "false", "no" },
+ { "degraded", "1", "yes" },
+ { "degraded", "0", "no" },
+};
+
+static const char *bch2_opt_val_synonym_lookup(const char *opt, const char *val)
+{
+ const struct opt_val_synonym *i;
+
+ for (i = bch2_opt_val_synonyms;
+ i < bch2_opt_val_synonyms + ARRAY_SIZE(bch2_opt_val_synonyms);
+ i++)
+ if (!strcmp(opt, i->opt) && !strcmp(val, i->v1))
+ return i->v2;
+
+ return val;
+}
+
int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
{
if (v < opt->min) {
@@ -337,21 +368,22 @@ int bch2_opt_parse(struct bch_fs *c,
{
ssize_t ret;
+ if (err)
+ printbuf_indent_add_nextline(err, 2);
+
switch (opt->type) {
case BCH_OPT_BOOL:
- if (val) {
- ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool);
- if (ret != -BCH_ERR_option_not_bool) {
- *res = ret;
- } else {
- if (err)
- prt_printf(err, "%s: must be bool", opt->attr.name);
- return ret;
- }
+ if (!val)
+ val = "1";
+
+ ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool);
+ if (ret != -BCH_ERR_option_not_bool) {
+ *res = ret;
} else {
- *res = 1;
+ if (err)
+ prt_printf(err, "%s: must be bool", opt->attr.name);
+ return ret;
}
-
break;
case BCH_OPT_UINT:
if (!val) {
@@ -360,9 +392,15 @@ int bch2_opt_parse(struct bch_fs *c,
return -EINVAL;
}
- ret = opt->flags & OPT_HUMAN_READABLE
- ? bch2_strtou64_h(val, res)
- : kstrtou64(val, 10, res);
+ if (*val != '-') {
+ ret = opt->flags & OPT_HUMAN_READABLE
+ ? bch2_strtou64_h(val, res)
+ : kstrtou64(val, 10, res);
+ } else {
+ prt_printf(err, "%s: must be a non-negative number", opt->attr.name);
+ return -BCH_ERR_option_negative;
+ }
+
if (ret < 0) {
if (err)
prt_printf(err, "%s: must be a number",
@@ -480,7 +518,7 @@ void bch2_opts_to_text(struct printbuf *out,
}
}
-int bch2_opt_check_may_set(struct bch_fs *c, struct bch_dev *ca, int id, u64 v)
+int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id id, u64 v)
{
int ret = 0;
@@ -498,15 +536,17 @@ int bch2_opt_check_may_set(struct bch_fs *c, struct bch_dev *ca, int id, u64 v)
if (v)
bch2_check_set_feature(c, BCH_FEATURE_ec);
break;
+ default:
+ break;
}
return ret;
}
-int bch2_opts_check_may_set(struct bch_fs *c)
+int bch2_opts_hooks_pre_set(struct bch_fs *c)
{
for (unsigned i = 0; i < bch2_opts_nr; i++) {
- int ret = bch2_opt_check_may_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i));
+ int ret = bch2_opt_hook_pre_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i));
if (ret)
return ret;
}
@@ -514,6 +554,61 @@ int bch2_opts_check_may_set(struct bch_fs *c)
return 0;
}
+void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum,
+ struct bch_opts *new_opts, enum bch_opt_id id)
+{
+ switch (id) {
+ case Opt_foreground_target:
+ if (new_opts->foreground_target &&
+ !new_opts->background_target)
+ bch2_set_rebalance_needs_scan(c, inum);
+ break;
+ case Opt_compression:
+ if (new_opts->compression &&
+ !new_opts->background_compression)
+ bch2_set_rebalance_needs_scan(c, inum);
+ break;
+ case Opt_background_target:
+ if (new_opts->background_target)
+ bch2_set_rebalance_needs_scan(c, inum);
+ break;
+ case Opt_background_compression:
+ if (new_opts->background_compression)
+ bch2_set_rebalance_needs_scan(c, inum);
+ break;
+ case Opt_rebalance_enabled:
+ bch2_rebalance_wakeup(c);
+ break;
+ case Opt_copygc_enabled:
+ bch2_copygc_wakeup(c);
+ break;
+ case Opt_discard:
+ if (!ca) {
+ mutex_lock(&c->sb_lock);
+ for_each_member_device(c, ca) {
+ struct bch_member *m =
+ bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx);
+ SET_BCH_MEMBER_DISCARD(m, c->opts.discard);
+ }
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
+ break;
+ case Opt_version_upgrade:
+ /*
+ * XXX: in the future we'll likely want to do compatible
+ * upgrades at runtime as well, but right now there's nothing
+ * that does that:
+ */
+ if (new_opts->version_upgrade == BCH_VERSION_UPGRADE_incompatible)
+ bch2_sb_upgrade_incompat(c);
+ break;
+ default:
+ break;
+ }
+}
+
int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
struct printbuf *parse_later,
const char *name, const char *val)
@@ -536,6 +631,12 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
if (id < 0)
return 0;
+ /* must have a value for synonym lookup - but OPT_FN is weird */
+ if (!val && bch2_opt_table[id].type != BCH_OPT_FN)
+ val = "1";
+
+ val = bch2_opt_val_synonym_lookup(name, val);
+
if (!(bch2_opt_table[id].flags & OPT_MOUNT))
goto bad_opt;
@@ -667,9 +768,11 @@ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
return 0;
}
-void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx,
+bool __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx,
const struct bch_option *opt, u64 v)
{
+ bool changed = false;
+
if (opt->flags & OPT_SB_FIELD_SECTORS)
v >>= 9;
@@ -679,26 +782,35 @@ void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx,
if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
v++;
- if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0)
+ if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0) {
+ changed = v != opt->get_sb(sb);
+
opt->set_sb(sb, v);
+ }
if ((opt->flags & OPT_DEVICE) && opt->set_member && dev_idx >= 0) {
if (WARN(!bch2_member_exists(sb, dev_idx),
"tried to set device option %s on nonexistent device %i",
opt->attr.name, dev_idx))
- return;
+ return false;
- opt->set_member(bch2_members_v2_get_mut(sb, dev_idx), v);
+ struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx);
+ changed = v != opt->get_member(m);
+ opt->set_member(m, v);
}
+
+ return changed;
}
-void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca,
+bool bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca,
const struct bch_option *opt, u64 v)
{
mutex_lock(&c->sb_lock);
- __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v);
- bch2_write_super(c);
+ bool changed = __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v);
+ if (changed)
+ bch2_write_super(c);
mutex_unlock(&c->sb_lock);
+ return changed;
}
/* io opts: */
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index dfb14810124c..2a02606254b3 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -11,6 +11,7 @@
struct bch_fs;
extern const char * const bch2_error_actions[];
+extern const char * const bch2_degraded_actions[];
extern const char * const bch2_fsck_fix_opts[];
extern const char * const bch2_version_upgrade_opts[];
extern const char * const bch2_sb_features[];
@@ -307,14 +308,9 @@ enum fsck_err_opts {
NULL, "Enable project quotas") \
x(degraded, u8, \
OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
+ OPT_STR(bch2_degraded_actions), \
+ BCH_SB_DEGRADED_ACTION, BCH_DEGRADED_ask, \
NULL, "Allow mounting in degraded mode") \
- x(very_degraded, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Allow mounting in when data will be missing") \
x(no_splitbrain_check, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
@@ -454,7 +450,7 @@ enum fsck_err_opts {
BCH2_NO_SB_OPT, false, \
NULL, "Reconstruct alloc btree") \
x(version_upgrade, u8, \
- OPT_FS|OPT_MOUNT, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_STR(bch2_version_upgrade_opts), \
BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \
NULL, "Set superblock to latest version,\n" \
@@ -494,6 +490,17 @@ enum fsck_err_opts {
BCH2_NO_SB_OPT, true, \
NULL, "Enable rebalance: disable for debugging, or to\n"\
"quiet the system when doing performance testing\n")\
+ x(rebalance_on_ac_only, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_REBALANCE_AC_ONLY, false, \
+ NULL, "Enable rebalance while on mains power only\n") \
+ x(auto_snapshot_deletion, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Enable automatic snapshot deletion: disable for debugging, or to\n"\
+ "quiet the system when doing performance testing\n")\
x(no_data_io, u8, \
OPT_MOUNT, \
OPT_BOOL(), \
@@ -522,7 +529,7 @@ enum fsck_err_opts {
BCH_MEMBER_DATA_ALLOWED, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\
"types", "Allowed data types for this device: journal, btree, and/or user")\
x(discard, u8, \
- OPT_MOUNT|OPT_DEVICE|OPT_RUNTIME, \
+ OPT_MOUNT|OPT_FS|OPT_DEVICE|OPT_RUNTIME, \
OPT_BOOL(), \
BCH_MEMBER_DISCARD, true, \
NULL, "Enable discard/TRIM support") \
@@ -530,7 +537,7 @@ enum fsck_err_opts {
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, true, \
- NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\
+ NULL, "BTREE_ITER_prefetch causes btree nodes to be\n"\
" prefetched sequentially")
struct bch_opts {
@@ -616,10 +623,10 @@ void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id, int);
int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
-void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64);
+bool __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64);
struct bch_dev;
-void bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64);
+bool bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64);
int bch2_opt_lookup(const char *);
int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *);
@@ -636,8 +643,11 @@ void bch2_opts_to_text(struct printbuf *,
struct bch_fs *, struct bch_sb *,
unsigned, unsigned, unsigned);
-int bch2_opt_check_may_set(struct bch_fs *, struct bch_dev *, int, u64);
-int bch2_opts_check_may_set(struct bch_fs *);
+int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, enum bch_opt_id, u64);
+int bch2_opts_hooks_pre_set(struct bch_fs *);
+void bch2_opt_hook_post_set(struct bch_fs *, struct bch_dev *, u64,
+ struct bch_opts *, enum bch_opt_id);
+
int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *,
struct printbuf *, const char *, const char *);
int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *,
diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h
index 1ca476adbf6f..8f4e28d440ac 100644
--- a/fs/bcachefs/printbuf.h
+++ b/fs/bcachefs/printbuf.h
@@ -140,6 +140,14 @@ void bch2_prt_bitflags_vector(struct printbuf *, const char * const[],
.size = _size, \
})
+static inline struct printbuf bch2_printbuf_init(void)
+{
+ return PRINTBUF;
+}
+
+DEFINE_CLASS(printbuf, struct printbuf,
+ bch2_printbuf_exit(&_T), bch2_printbuf_init(), void)
+
/*
* Returns size remaining of output buffer:
*/
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index 3d4755d73af7..f241efb1fb50 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -527,7 +527,7 @@ int bch2_fs_quota_read(struct bch_fs *c)
struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
if (!sb_quota) {
mutex_unlock(&c->sb_lock);
- return -BCH_ERR_ENOSPC_sb_quota;
+ return bch_err_throw(c, ENOSPC_sb_quota);
}
bch2_sb_quota_read(c);
@@ -572,7 +572,7 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
mutex_lock(&c->sb_lock);
sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
if (!sb_quota) {
- ret = -BCH_ERR_ENOSPC_sb_quota;
+ ret = bch_err_throw(c, ENOSPC_sb_quota);
goto unlock;
}
@@ -726,7 +726,7 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
mutex_lock(&c->sb_lock);
sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
if (!sb_quota) {
- ret = -BCH_ERR_ENOSPC_sb_quota;
+ ret = bch_err_throw(c, ENOSPC_sb_quota);
goto unlock;
}
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 623273556aa9..1c345b86b1c0 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -80,6 +80,7 @@ static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
unsigned ptr_bit = 1;
unsigned rewrite_ptrs = 0;
+ guard(rcu)();
bkey_for_each_ptr(ptrs, ptr) {
if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target))
rewrite_ptrs |= ptr_bit;
@@ -95,6 +96,9 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
+ return 0;
+
return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) |
bch2_bkey_ptrs_need_move(c, opts, ptrs);
}
@@ -107,6 +111,9 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
if (!opts)
return 0;
+ if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
+ return 0;
+
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
u64 sectors = 0;
@@ -126,10 +133,13 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
}
}
incompressible:
- if (opts->background_target)
+ if (opts->background_target) {
+ guard(rcu)();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
+ if (!p.ptr.cached &&
+ !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
sectors += p.crc.compressed_size;
+ }
return sectors;
}
@@ -433,7 +443,7 @@ static int do_rebalance_extent(struct moving_context *ctxt,
if (bch2_err_matches(ret, ENOMEM)) {
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(ctxt);
- ret = -BCH_ERR_transaction_restart_nested;
+ ret = bch_err_throw(c, transaction_restart_nested);
}
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -447,22 +457,11 @@ out:
return ret;
}
-static bool rebalance_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
- data_opts->target = io_opts->background_target;
- data_opts->write_flags |= BCH_WRITE_only_specified_devs;
- return data_opts->rewrite_ptrs != 0;
-}
-
static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
{
struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
struct bch_fs_rebalance *r = &trans->c->rebalance;
- int ret;
bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
ctxt->stats = &r->scan_stats;
@@ -477,11 +476,34 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
r->state = BCH_REBALANCE_scanning;
- ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
- commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_clear_rebalance_needs_scan(trans, inum, cookie));
+ struct per_snapshot_io_opts snapshot_io_opts;
+ per_snapshot_io_opts_init(&snapshot_io_opts, c);
+
+ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
+ r->scan_start.pos, r->scan_end.pos,
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_not_extents|
+ BTREE_ITER_prefetch, k, ({
+ ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
+ struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans,
+ &snapshot_io_opts, iter.pos, &iter, k);
+ PTR_ERR_OR_ZERO(io_opts);
+ })) ?:
+ commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_clear_rebalance_needs_scan(trans, inum, cookie));
+
+ per_snapshot_io_opts_exit(&snapshot_io_opts);
bch2_move_stats_exit(&r->scan_stats, trans->c);
+
+ /*
+ * Ensure that the rebalance_work entries we created are seen by the
+ * next iteration of do_rebalance(), so we don't end up stuck in
+ * rebalance_wait():
+ */
+ atomic64_inc(&r->scan_stats.sectors_seen);
+ bch2_btree_write_buffer_flush_sync(trans);
+
return ret;
}
@@ -503,7 +525,14 @@ static void rebalance_wait(struct bch_fs *c)
r->state = BCH_REBALANCE_waiting;
}
- bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
+ bch2_kthread_io_clock_wait_once(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
+}
+
+static bool bch2_rebalance_enabled(struct bch_fs *c)
+{
+ return c->opts.rebalance_enabled &&
+ !(c->opts.rebalance_on_ac_only &&
+ c->rebalance.on_battery);
}
static int do_rebalance(struct moving_context *ctxt)
@@ -513,6 +542,7 @@ static int do_rebalance(struct moving_context *ctxt)
struct bch_fs_rebalance *r = &c->rebalance;
struct btree_iter rebalance_work_iter, extent_iter = {};
struct bkey_s_c k;
+ u32 kick = r->kick;
int ret = 0;
bch2_trans_begin(trans);
@@ -525,9 +555,9 @@ static int do_rebalance(struct moving_context *ctxt)
BTREE_ITER_all_snapshots);
while (!bch2_move_ratelimit(ctxt)) {
- if (!c->opts.rebalance_enabled) {
+ if (!bch2_rebalance_enabled(c)) {
bch2_moving_ctxt_flush_all(ctxt);
- kthread_wait_freezable(c->opts.rebalance_enabled ||
+ kthread_wait_freezable(bch2_rebalance_enabled(c) ||
kthread_should_stop());
}
@@ -562,7 +592,8 @@ static int do_rebalance(struct moving_context *ctxt)
if (!ret &&
!kthread_should_stop() &&
!atomic64_read(&r->work_stats.sectors_seen) &&
- !atomic64_read(&r->scan_stats.sectors_seen)) {
+ !atomic64_read(&r->scan_stats.sectors_seen) &&
+ kick == r->kick) {
bch2_moving_ctxt_flush_all(ctxt);
bch2_trans_unlock_long(trans);
rebalance_wait(c);
@@ -585,7 +616,7 @@ static int bch2_rebalance_thread(void *arg)
* Data move operations can't run until after check_snapshots has
* completed, and bch2_snapshot_is_ancestor() is available.
*/
- kthread_wait_freezable(c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots ||
+ kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots ||
kthread_should_stop());
bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
@@ -646,11 +677,12 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
}
prt_newline(out);
- rcu_read_lock();
- struct task_struct *t = rcu_dereference(c->rebalance.thread);
- if (t)
- get_task_struct(t);
- rcu_read_unlock();
+ struct task_struct *t;
+ scoped_guard(rcu) {
+ t = rcu_dereference(c->rebalance.thread);
+ if (t)
+ get_task_struct(t);
+ }
if (t) {
bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
@@ -702,7 +734,156 @@ int bch2_rebalance_start(struct bch_fs *c)
return 0;
}
-void bch2_fs_rebalance_init(struct bch_fs *c)
+#ifdef CONFIG_POWER_SUPPLY
+#include <linux/power_supply.h>
+
+static int bch2_rebalance_power_notifier(struct notifier_block *nb,
+ unsigned long event, void *data)
+{
+ struct bch_fs *c = container_of(nb, struct bch_fs, rebalance.power_notifier);
+
+ c->rebalance.on_battery = !power_supply_is_system_supplied();
+ bch2_rebalance_wakeup(c);
+ return NOTIFY_OK;
+}
+#endif
+
+void bch2_fs_rebalance_exit(struct bch_fs *c)
+{
+#ifdef CONFIG_POWER_SUPPLY
+ power_supply_unreg_notifier(&c->rebalance.power_notifier);
+#endif
+}
+
+int bch2_fs_rebalance_init(struct bch_fs *c)
+{
+ struct bch_fs_rebalance *r = &c->rebalance;
+
+ bch2_pd_controller_init(&r->pd);
+
+#ifdef CONFIG_POWER_SUPPLY
+ r->power_notifier.notifier_call = bch2_rebalance_power_notifier;
+ int ret = power_supply_reg_notifier(&r->power_notifier);
+ if (ret)
+ return ret;
+
+ r->on_battery = !power_supply_is_system_supplied();
+#endif
+ return 0;
+}
+
+static int check_rebalance_work_one(struct btree_trans *trans,
+ struct btree_iter *extent_iter,
+ struct btree_iter *rebalance_iter,
+ struct bkey_buf *last_flushed)
{
- bch2_pd_controller_init(&c->rebalance.pd);
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c extent_k, rebalance_k;
+ struct printbuf buf = PRINTBUF;
+
+ int ret = bkey_err(extent_k = bch2_btree_iter_peek(trans, extent_iter)) ?:
+ bkey_err(rebalance_k = bch2_btree_iter_peek(trans, rebalance_iter));
+ if (ret)
+ return ret;
+
+ if (!extent_k.k &&
+ extent_iter->btree_id == BTREE_ID_reflink &&
+ (!rebalance_k.k ||
+ rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) {
+ bch2_trans_iter_exit(trans, extent_iter);
+ bch2_trans_iter_init(trans, extent_iter,
+ BTREE_ID_extents, POS_MIN,
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots);
+ return bch_err_throw(c, transaction_restart_nested);
+ }
+
+ if (!extent_k.k && !rebalance_k.k)
+ return 1;
+
+ int cmp = bpos_cmp(extent_k.k ? extent_k.k->p : SPOS_MAX,
+ rebalance_k.k ? rebalance_k.k->p : SPOS_MAX);
+
+ struct bkey deleted;
+ bkey_init(&deleted);
+
+ if (cmp < 0) {
+ deleted.p = extent_k.k->p;
+ rebalance_k.k = &deleted;
+ } else if (cmp > 0) {
+ deleted.p = rebalance_k.k->p;
+ extent_k.k = &deleted;
+ }
+
+ bool should_have_rebalance =
+ bch2_bkey_sectors_need_rebalance(c, extent_k) != 0;
+ bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set;
+
+ if (should_have_rebalance != have_rebalance) {
+ ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed);
+ if (ret)
+ return ret;
+
+ bch2_bkey_val_to_text(&buf, c, extent_k);
+ }
+
+ if (fsck_err_on(!should_have_rebalance && have_rebalance,
+ trans, rebalance_work_incorrectly_set,
+ "rebalance work incorrectly set\n%s", buf.buf)) {
+ ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
+ extent_k.k->p, false);
+ if (ret)
+ goto err;
+ }
+
+ if (fsck_err_on(should_have_rebalance && !have_rebalance,
+ trans, rebalance_work_incorrectly_unset,
+ "rebalance work incorrectly unset\n%s", buf.buf)) {
+ ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
+ extent_k.k->p, true);
+ if (ret)
+ goto err;
+ }
+
+ if (cmp <= 0)
+ bch2_btree_iter_advance(trans, extent_iter);
+ if (cmp >= 0)
+ bch2_btree_iter_advance(trans, rebalance_iter);
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_check_rebalance_work(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter rebalance_iter, extent_iter;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &extent_iter,
+ BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_prefetch);
+ bch2_trans_iter_init(trans, &rebalance_iter,
+ BTREE_ID_rebalance_work, POS_MIN,
+ BTREE_ITER_prefetch);
+
+ struct bkey_buf last_flushed;
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
+ while (!ret) {
+ bch2_trans_begin(trans);
+
+ ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ ret = 0;
+ }
+
+ bch2_bkey_buf_exit(&last_flushed, c);
+ bch2_trans_iter_exit(trans, &extent_iter);
+ bch2_trans_iter_exit(trans, &rebalance_iter);
+ bch2_trans_put(trans);
+ return ret < 0 ? ret : 0;
}
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
index e5e8eb4a2dd1..7a565ea7dbfc 100644
--- a/fs/bcachefs/rebalance.h
+++ b/fs/bcachefs/rebalance.h
@@ -39,19 +39,21 @@ int bch2_set_fs_needs_rebalance(struct bch_fs *);
static inline void bch2_rebalance_wakeup(struct bch_fs *c)
{
- struct task_struct *p;
-
- rcu_read_lock();
- p = rcu_dereference(c->rebalance.thread);
+ c->rebalance.kick++;
+ guard(rcu)();
+ struct task_struct *p = rcu_dereference(c->rebalance.thread);
if (p)
wake_up_process(p);
- rcu_read_unlock();
}
void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *);
void bch2_rebalance_stop(struct bch_fs *);
int bch2_rebalance_start(struct bch_fs *);
-void bch2_fs_rebalance_init(struct bch_fs *);
+
+void bch2_fs_rebalance_exit(struct bch_fs *);
+int bch2_fs_rebalance_init(struct bch_fs *);
+
+int bch2_check_rebalance_work(struct bch_fs *);
#endif /* _BCACHEFS_REBALANCE_H */
diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h
index fe5098c17dfc..c659da149fa3 100644
--- a/fs/bcachefs/rebalance_types.h
+++ b/fs/bcachefs/rebalance_types.h
@@ -18,6 +18,7 @@ enum bch_rebalance_states {
struct bch_fs_rebalance {
struct task_struct __rcu *thread;
+ u32 kick;
struct bch_pd_controller pd;
enum bch_rebalance_states state;
@@ -30,6 +31,11 @@ struct bch_fs_rebalance {
struct bbpos scan_start;
struct bbpos scan_end;
struct bch_move_stats scan_stats;
+
+ bool on_battery;
+#ifdef CONFIG_POWER_SUPPLY
+ struct notifier_block power_notifier;
+#endif
};
#endif /* _BCACHEFS_REBALANCE_TYPES_H */
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index d6c4ef819d40..1e68e61f08e8 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -33,8 +33,9 @@
#include <linux/sort.h>
#include <linux/stat.h>
-
-int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
+int bch2_btree_lost_data(struct bch_fs *c,
+ struct printbuf *msg,
+ enum btree_id btree)
{
u64 b = BIT_ULL(btree);
int ret = 0;
@@ -43,32 +44,32 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
if (!(c->sb.btrees_lost_data & b)) {
- struct printbuf buf = PRINTBUF;
- bch2_btree_id_to_text(&buf, btree);
- bch_err(c, "flagging btree %s lost data", buf.buf);
- printbuf_exit(&buf);
+ prt_printf(msg, "flagging btree ");
+ bch2_btree_id_to_text(msg, btree);
+ prt_printf(msg, " lost data\n");
+
ext->btrees_lost_data |= cpu_to_le64(b);
}
/* Once we have runtime self healing for topology errors we won't need this: */
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
/* Btree node accounting will be off: */
__set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret;
#ifdef CONFIG_BCACHEFS_DEBUG
/*
* These are much more minor, and don't need to be corrected right away,
* but in debug mode we want the next fsck run to be clean:
*/
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_lrus) ?: ret;
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0) ?: ret;
#endif
switch (btree) {
case BTREE_ID_alloc:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
__set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
@@ -78,26 +79,30 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
goto out;
case BTREE_ID_backpointers:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret;
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers, 0) ?: ret;
goto out;
case BTREE_ID_need_discard:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
goto out;
case BTREE_ID_freespace:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
goto out;
case BTREE_ID_bucket_gens:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
goto out;
case BTREE_ID_lru:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
goto out;
case BTREE_ID_accounting:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret;
+ goto out;
+ case BTREE_ID_snapshots:
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret;
goto out;
default:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret;
goto out;
}
out:
@@ -114,11 +119,8 @@ static void kill_btree(struct bch_fs *c, enum btree_id btree)
}
/* for -o reconstruct_alloc: */
-static void bch2_reconstruct_alloc(struct bch_fs *c)
+void bch2_reconstruct_alloc(struct bch_fs *c)
{
- bch2_journal_log_msg(c, "dropping alloc info");
- bch_info(c, "dropping and reconstructing all alloc info");
-
mutex_lock(&c->sb_lock);
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
@@ -160,6 +162,8 @@ static void bch2_reconstruct_alloc(struct bch_fs *c)
c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info));
+
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@@ -282,7 +286,12 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
goto out;
if (k->k->k.type == KEY_TYPE_accounting) {
- ret = bch2_trans_update_buffered(trans, BTREE_ID_accounting, k->k);
+ struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, k->k->k.u64s);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto out;
+
+ bkey_copy(n, k->k);
goto out;
}
@@ -430,7 +439,7 @@ int bch2_journal_replay(struct bch_fs *c)
trans = NULL;
if (!c->opts.retain_recovery_info &&
- c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay)
+ c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay)
bch2_journal_keys_put_initial(c);
replay_now_at(j, j->replay_journal_seq_end);
@@ -585,9 +594,6 @@ static int read_btree_roots(struct bch_fs *c)
buf.buf, bch2_err_str(ret))) {
if (btree_id_is_alloc(i))
r->error = 0;
-
- ret = bch2_btree_lost_data(c, i);
- BUG_ON(ret);
}
}
@@ -667,7 +673,7 @@ static bool check_version_upgrade(struct bch_fs *c)
bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
}
- bch_info(c, "%s", buf.buf);
+ bch_notice(c, "%s", buf.buf);
printbuf_exit(&buf);
ret = true;
@@ -683,7 +689,7 @@ static bool check_version_upgrade(struct bch_fs *c)
bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
prt_newline(&buf);
- bch_info(c, "%s", buf.buf);
+ bch_notice(c, "%s", buf.buf);
printbuf_exit(&buf);
ret = true;
@@ -790,11 +796,11 @@ int bch2_fs_recovery(struct bch_fs *c)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- if (c->opts.fsck)
- set_bit(BCH_FS_fsck_running, &c->flags);
if (c->sb.clean)
set_bit(BCH_FS_clean_recovery, &c->flags);
- set_bit(BCH_FS_recovery_running, &c->flags);
+ if (c->opts.fsck)
+ set_bit(BCH_FS_in_fsck, &c->flags);
+ set_bit(BCH_FS_in_recovery, &c->flags);
ret = bch2_blacklist_table_initialize(c);
if (ret) {
@@ -873,7 +879,7 @@ int bch2_fs_recovery(struct bch_fs *c)
use_clean:
if (!clean) {
bch_err(c, "no superblock clean section found");
- ret = -BCH_ERR_fsck_repair_impossible;
+ ret = bch_err_throw(c, fsck_repair_impossible);
goto err;
}
@@ -889,8 +895,37 @@ use_clean:
if (ret)
goto err;
- if (c->opts.reconstruct_alloc)
+ ret = bch2_fs_resize_on_mount(c);
+ if (ret) {
+ up_write(&c->state_lock);
+ goto err;
+ }
+
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
+ bch_info(c, "filesystem is an unresized image file, mounting ro");
+ c->opts.read_only = true;
+ }
+
+ if (!c->opts.read_only &&
+ (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) {
+ bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate");
+
+ bch2_reconstruct_alloc(c);
+ } else if (c->opts.reconstruct_alloc) {
+ bch2_journal_log_msg(c, "dropping alloc info");
+ bch_info(c, "dropping and reconstructing all alloc info");
+
bch2_reconstruct_alloc(c);
+ }
+
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) {
+ /* We can't go RW to fix errors without alloc info */
+ if (c->opts.fix_errors == FSCK_FIX_yes ||
+ c->opts.fix_errors == FSCK_FIX_ask)
+ c->opts.fix_errors = FSCK_FIX_no;
+ if (c->opts.errors == BCH_ON_ERROR_fix_safe)
+ c->opts.errors = BCH_ON_ERROR_continue;
+ }
/*
* After an unclean shutdown, skip then next few journal sequence
@@ -933,8 +968,10 @@ use_clean:
set_bit(BCH_FS_btree_running, &c->flags);
ret = bch2_sb_set_upgrade_extra(c);
+ if (ret)
+ goto err;
- ret = bch2_run_recovery_passes(c);
+ ret = bch2_run_recovery_passes(c, 0);
if (ret)
goto err;
@@ -945,8 +982,7 @@ use_clean:
* multithreaded use:
*/
set_bit(BCH_FS_may_go_rw, &c->flags);
- clear_bit(BCH_FS_fsck_running, &c->flags);
- clear_bit(BCH_FS_recovery_running, &c->flags);
+ clear_bit(BCH_FS_in_fsck, &c->flags);
/* in case we don't run journal replay, i.e. norecovery mode */
set_bit(BCH_FS_accounting_replay_done, &c->flags);
@@ -969,9 +1005,8 @@ use_clean:
bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
clear_bit(BCH_FS_errors_fixed, &c->flags);
- c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
-
- ret = bch2_run_recovery_passes(c);
+ ret = bch2_run_recovery_passes(c,
+ BCH_RECOVERY_PASS_check_alloc_info);
if (ret)
goto err;
@@ -1015,7 +1050,7 @@ use_clean:
if (c->opts.fsck &&
!test_bit(BCH_FS_error, &c->flags) &&
- c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 &&
+ c->recovery.pass_done == BCH_RECOVERY_PASS_NR - 1 &&
ext->btrees_lost_data) {
ext->btrees_lost_data = 0;
write_sb = true;
@@ -1058,10 +1093,6 @@ use_clean:
out:
bch2_flush_fsck_errs(c);
- if (!c->opts.retain_recovery_info) {
- bch2_journal_keys_put_initial(c);
- bch2_find_btree_nodes_exit(&c->found_btree_nodes);
- }
if (!IS_ERR(clean))
kfree(clean);
@@ -1076,8 +1107,17 @@ out:
return ret;
err:
fsck_err:
- bch2_fs_emergency_read_only(c);
- goto out;
+ {
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+
+ prt_printf(&buf, "error in recovery: %s", bch2_err_str(ret));
+ bch2_fs_emergency_read_only2(c, &buf);
+
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ }
+ return ret;
}
int bch2_fs_initialize(struct bch_fs *c)
@@ -1193,7 +1233,7 @@ int bch2_fs_initialize(struct bch_fs *c)
if (ret)
goto err;
- c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1;
+ c->recovery.pass_done = BCH_RECOVERY_PASS_NR - 1;
bch2_copygc_wakeup(c);
bch2_rebalance_wakeup(c);
@@ -1216,7 +1256,7 @@ int bch2_fs_initialize(struct bch_fs *c)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- c->curr_recovery_pass = BCH_RECOVERY_PASS_NR;
+ c->recovery.curr_pass = BCH_RECOVERY_PASS_NR;
return 0;
err:
bch_err_fn(c, ret);
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index b0d55754b21b..c023f52fc2d6 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -2,7 +2,8 @@
#ifndef _BCACHEFS_RECOVERY_H
#define _BCACHEFS_RECOVERY_H
-int bch2_btree_lost_data(struct bch_fs *, enum btree_id);
+int bch2_btree_lost_data(struct bch_fs *, struct printbuf *, enum btree_id);
+void bch2_reconstruct_alloc(struct bch_fs *);
int bch2_journal_replay(struct bch_fs *);
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
index 22f72bb5b853..605588e33fb3 100644
--- a/fs/bcachefs/recovery_passes.c
+++ b/fs/bcachefs/recovery_passes.c
@@ -28,6 +28,176 @@ const char * const bch2_recovery_passes[] = {
NULL
};
+static const u8 passes_to_stable_map[] = {
+#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+static const u8 passes_from_stable_map[] = {
+#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass)
+{
+ return passes_to_stable_map[pass];
+}
+
+u64 bch2_recovery_passes_to_stable(u64 v)
+{
+ u64 ret = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++)
+ if (v & BIT_ULL(i))
+ ret |= BIT_ULL(passes_to_stable_map[i]);
+ return ret;
+}
+
+static enum bch_recovery_pass bch2_recovery_pass_from_stable(enum bch_recovery_pass_stable pass)
+{
+ return pass < ARRAY_SIZE(passes_from_stable_map)
+ ? passes_from_stable_map[pass]
+ : 0;
+}
+
+u64 bch2_recovery_passes_from_stable(u64 v)
+{
+ u64 ret = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(passes_from_stable_map); i++)
+ if (v & BIT_ULL(i))
+ ret |= BIT_ULL(passes_from_stable_map[i]);
+ return ret;
+}
+
+static int bch2_sb_recovery_passes_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
+{
+ return 0;
+}
+
+static void bch2_sb_recovery_passes_to_text(struct printbuf *out,
+ struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_recovery_passes *r =
+ field_to_type(f, recovery_passes);
+ unsigned nr = recovery_passes_nr_entries(r);
+
+ if (out->nr_tabstops < 1)
+ printbuf_tabstop_push(out, 32);
+ if (out->nr_tabstops < 2)
+ printbuf_tabstop_push(out, 16);
+
+ prt_printf(out, "Pass\tLast run\tLast runtime\n");
+
+ for (struct recovery_pass_entry *i = r->start; i < r->start + nr; i++) {
+ if (!i->last_run)
+ continue;
+
+ unsigned idx = i - r->start;
+
+ prt_printf(out, "%s\t", bch2_recovery_passes[bch2_recovery_pass_from_stable(idx)]);
+
+ bch2_prt_datetime(out, le64_to_cpu(i->last_run));
+ prt_tab(out);
+
+ bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC);
+
+ if (BCH_RECOVERY_PASS_NO_RATELIMIT(i))
+ prt_str(out, " (no ratelimit)");
+
+ prt_newline(out);
+ }
+}
+
+static struct recovery_pass_entry *bch2_sb_recovery_pass_entry(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass);
+
+ lockdep_assert_held(&c->sb_lock);
+
+ struct bch_sb_field_recovery_passes *r =
+ bch2_sb_field_get(c->disk_sb.sb, recovery_passes);
+
+ if (stable >= recovery_passes_nr_entries(r)) {
+ unsigned u64s = struct_size(r, start, stable + 1) / sizeof(u64);
+
+ r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s);
+ if (!r) {
+ bch_err(c, "error creating recovery_passes sb section");
+ return NULL;
+ }
+ }
+
+ return r->start + stable;
+}
+
+static void bch2_sb_recovery_pass_complete(struct bch_fs *c,
+ enum bch_recovery_pass pass,
+ s64 start_time)
+{
+ guard(mutex)(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ __clear_bit_le64(bch2_recovery_pass_to_stable(pass),
+ ext->recovery_passes_required);
+
+ struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass);
+ if (e) {
+ s64 end_time = ktime_get_real_seconds();
+ e->last_run = cpu_to_le64(end_time);
+ e->last_runtime = cpu_to_le32(max(0, end_time - start_time));
+ SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false);
+ }
+
+ bch2_write_super(c);
+}
+
+void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ guard(mutex)(&c->sb_lock);
+
+ struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass);
+ if (e && !BCH_RECOVERY_PASS_NO_RATELIMIT(e)) {
+ SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false);
+ bch2_write_super(c);
+ }
+}
+
+static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass)
+{
+ enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass);
+ bool ret = false;
+
+ lockdep_assert_held(&c->sb_lock);
+
+ struct bch_sb_field_recovery_passes *r =
+ bch2_sb_field_get(c->disk_sb.sb, recovery_passes);
+
+ if (stable < recovery_passes_nr_entries(r)) {
+ struct recovery_pass_entry *i = r->start + stable;
+
+ /*
+ * Ratelimit if the last runtime was more than 1% of the time
+ * since we last ran
+ */
+ ret = (u64) le32_to_cpu(i->last_runtime) * 100 >
+ ktime_get_real_seconds() - le64_to_cpu(i->last_run);
+
+ if (BCH_RECOVERY_PASS_NO_RATELIMIT(i))
+ ret = false;
+ }
+
+ return ret;
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes = {
+ .validate = bch2_sb_recovery_passes_validate,
+ .to_text = bch2_sb_recovery_passes_to_text
+};
+
/* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */
static int bch2_recovery_pass_empty(struct bch_fs *c)
{
@@ -47,11 +217,36 @@ static int bch2_set_may_go_rw(struct bch_fs *c)
set_bit(BCH_FS_may_go_rw, &c->flags);
- if (keys->nr || !c->opts.read_only || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes)
+ if (keys->nr ||
+ !c->opts.read_only ||
+ !c->sb.clean ||
+ c->opts.recovery_passes ||
+ (c->opts.fsck && !(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))) {
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) {
+ bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate");
+ bch2_reconstruct_alloc(c);
+ }
+
return bch2_fs_read_write_early(c);
+ }
return 0;
}
+/*
+ * Make sure root inode is readable while we're still in recovery and can rewind
+ * for repair:
+ */
+static int bch2_lookup_root_inode(struct bch_fs *c)
+{
+ subvol_inum inum = BCACHEFS_ROOT_SUBVOL_INUM;
+ struct bch_inode_unpacked inode_u;
+ struct bch_subvolume subvol;
+
+ return bch2_trans_do(c,
+ bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
+ bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
+}
+
struct recovery_pass_fn {
int (*fn)(struct bch_fs *);
unsigned when;
@@ -63,252 +258,382 @@ static struct recovery_pass_fn recovery_pass_fns[] = {
#undef x
};
-static const u8 passes_to_stable_map[] = {
-#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
- BCH_RECOVERY_PASSES()
-#undef x
-};
+static u64 bch2_recovery_passes_match(unsigned flags)
+{
+ u64 ret = 0;
-static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass)
+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
+ if (recovery_pass_fns[i].when & flags)
+ ret |= BIT_ULL(i);
+ return ret;
+}
+
+u64 bch2_fsck_recovery_passes(void)
{
- return passes_to_stable_map[pass];
+ return bch2_recovery_passes_match(PASS_FSCK);
}
-u64 bch2_recovery_passes_to_stable(u64 v)
+static void bch2_run_async_recovery_passes(struct bch_fs *c)
{
- u64 ret = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++)
- if (v & BIT_ULL(i))
- ret |= BIT_ULL(passes_to_stable_map[i]);
- return ret;
+ if (!down_trylock(&c->recovery.run_lock))
+ return;
+
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_async_recovery_passes))
+ goto unlock;
+
+ if (queue_work(system_long_wq, &c->recovery.work))
+ return;
+
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes);
+unlock:
+ up(&c->recovery.run_lock);
}
-u64 bch2_recovery_passes_from_stable(u64 v)
+static bool recovery_pass_needs_set(struct bch_fs *c,
+ enum bch_recovery_pass pass,
+ enum bch_run_recovery_pass_flags *flags)
{
- static const u8 map[] = {
-#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
- BCH_RECOVERY_PASSES()
-#undef x
- };
+ struct bch_fs_recovery *r = &c->recovery;
+ bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
+ bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent);
- u64 ret = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
- if (v & BIT_ULL(i))
- ret |= BIT_ULL(map[i]);
- return ret;
+ if ((*flags & RUN_RECOVERY_PASS_ratelimit) &&
+ !bch2_recovery_pass_want_ratelimit(c, pass))
+ *flags &= ~RUN_RECOVERY_PASS_ratelimit;
+
+ /*
+ * If RUN_RECOVERY_PASS_nopersistent is set, we don't want to do
+ * anything if the pass has already run: these mean we need a prior pass
+ * to run before we continue to repair, we don't expect that pass to fix
+ * the damage we encountered.
+ *
+ * Otherwise, we run run_explicit_recovery_pass when we find damage, so
+ * it should run again even if it's already run:
+ */
+
+ if (persistent
+ ? !(c->sb.recovery_passes_required & BIT_ULL(pass))
+ : !((r->passes_to_run|r->passes_complete) & BIT_ULL(pass)))
+ return true;
+
+ if (!(*flags & RUN_RECOVERY_PASS_ratelimit) &&
+ (r->passes_ratelimiting & BIT_ULL(pass)))
+ return true;
+
+ return false;
}
/*
* For when we need to rewind recovery passes and run a pass we skipped:
*/
-static int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
- enum bch_recovery_pass pass)
+int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
+ struct printbuf *out,
+ enum bch_recovery_pass pass,
+ enum bch_run_recovery_pass_flags flags)
{
- if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns))
- return -BCH_ERR_not_in_recovery;
+ struct bch_fs_recovery *r = &c->recovery;
+ int ret = 0;
- if (c->recovery_passes_complete & BIT_ULL(pass))
- return 0;
+ lockdep_assert_held(&c->sb_lock);
- bool print = !(c->opts.recovery_passes & BIT_ULL(pass));
+ bch2_printbuf_make_room(out, 1024);
+ out->atomic++;
+
+ unsigned long lockflags;
+ spin_lock_irqsave(&r->lock, lockflags);
+
+ if (!recovery_pass_needs_set(c, pass, &flags))
+ goto out;
+
+ bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
+ bool rewind = in_recovery &&
+ r->curr_pass > pass &&
+ !(r->passes_complete & BIT_ULL(pass));
+ bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit;
+
+ if (!(in_recovery && (flags & RUN_RECOVERY_PASS_nopersistent))) {
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required);
+ }
if (pass < BCH_RECOVERY_PASS_set_may_go_rw &&
- c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) {
- if (print)
- bch_info(c, "need recovery pass %s (%u), but already rw",
- bch2_recovery_passes[pass], pass);
- return -BCH_ERR_cannot_rewind_recovery;
+ (!in_recovery || r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw)) {
+ prt_printf(out, "need recovery pass %s (%u), but already rw\n",
+ bch2_recovery_passes[pass], pass);
+ ret = bch_err_throw(c, cannot_rewind_recovery);
+ goto out;
}
- if (print)
- bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
- bch2_recovery_passes[pass], pass,
- bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
+ if (ratelimit)
+ r->passes_ratelimiting |= BIT_ULL(pass);
+ else
+ r->passes_ratelimiting &= ~BIT_ULL(pass);
+
+ if (in_recovery && !ratelimit) {
+ prt_printf(out, "running recovery pass %s (%u), currently at %s (%u)%s\n",
+ bch2_recovery_passes[pass], pass,
+ bch2_recovery_passes[r->curr_pass], r->curr_pass,
+ rewind ? " - rewinding" : "");
- c->opts.recovery_passes |= BIT_ULL(pass);
+ r->passes_to_run |= BIT_ULL(pass);
- if (c->curr_recovery_pass > pass) {
- c->next_recovery_pass = pass;
- c->recovery_passes_complete &= (1ULL << pass) >> 1;
- return -BCH_ERR_restart_recovery;
+ if (rewind) {
+ r->next_pass = pass;
+ r->passes_complete &= (1ULL << pass) >> 1;
+ ret = bch_err_throw(c, restart_recovery);
+ }
} else {
- return 0;
+ prt_printf(out, "scheduling recovery pass %s (%u)%s\n",
+ bch2_recovery_passes[pass], pass,
+ ratelimit ? " - ratelimiting" : "");
+
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
+ if (p->when & PASS_ONLINE)
+ bch2_run_async_recovery_passes(c);
}
+out:
+ spin_unlock_irqrestore(&r->lock, lockflags);
+ --out->atomic;
+ return ret;
}
int bch2_run_explicit_recovery_pass(struct bch_fs *c,
- enum bch_recovery_pass pass)
+ struct printbuf *out,
+ enum bch_recovery_pass pass,
+ enum bch_run_recovery_pass_flags flags)
{
- unsigned long flags;
- spin_lock_irqsave(&c->recovery_pass_lock, flags);
- int ret = __bch2_run_explicit_recovery_pass(c, pass);
- spin_unlock_irqrestore(&c->recovery_pass_lock, flags);
- return ret;
-}
+ int ret = 0;
-int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *c,
- enum bch_recovery_pass pass)
-{
- lockdep_assert_held(&c->sb_lock);
+ scoped_guard(mutex, &c->sb_lock) {
+ if (!recovery_pass_needs_set(c, pass, &flags))
+ return 0;
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required);
+ ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags);
+ bch2_write_super(c);
+ }
- return bch2_run_explicit_recovery_pass(c, pass);
+ return ret;
}
-int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c,
- enum bch_recovery_pass pass)
+/*
+ * Returns 0 if @pass has run recently, otherwise one of
+ * -BCH_ERR_restart_recovery
+ * -BCH_ERR_recovery_pass_will_run
+ */
+int bch2_require_recovery_pass(struct bch_fs *c,
+ struct printbuf *out,
+ enum bch_recovery_pass pass)
{
- enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
+ if (test_bit(BCH_FS_in_recovery, &c->flags) &&
+ c->recovery.passes_complete & BIT_ULL(pass))
+ return 0;
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ guard(mutex)(&c->sb_lock);
- if (!test_bit_le64(s, ext->recovery_passes_required)) {
- __set_bit_le64(s, ext->recovery_passes_required);
+ if (bch2_recovery_pass_want_ratelimit(c, pass))
+ return 0;
+
+ enum bch_run_recovery_pass_flags flags = 0;
+ int ret = 0;
+
+ if (recovery_pass_needs_set(c, pass, &flags)) {
+ ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags);
bch2_write_super(c);
}
- mutex_unlock(&c->sb_lock);
- return bch2_run_explicit_recovery_pass(c, pass);
+ return ret ?: bch_err_throw(c, recovery_pass_will_run);
}
-static void bch2_clear_recovery_pass_required(struct bch_fs *c,
- enum bch_recovery_pass pass)
+int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
{
- enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
+ enum bch_run_recovery_pass_flags flags = RUN_RECOVERY_PASS_nopersistent;
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ if (!recovery_pass_needs_set(c, pass, &flags))
+ return 0;
- if (test_bit_le64(s, ext->recovery_passes_required)) {
- __clear_bit_le64(s, ext->recovery_passes_required);
- bch2_write_super(c);
- }
- mutex_unlock(&c->sb_lock);
-}
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
-u64 bch2_fsck_recovery_passes(void)
-{
- u64 ret = 0;
+ mutex_lock(&c->sb_lock);
+ int ret = __bch2_run_explicit_recovery_pass(c, &buf, pass,
+ RUN_RECOVERY_PASS_nopersistent);
+ mutex_unlock(&c->sb_lock);
- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
- if (recovery_pass_fns[i].when & PASS_FSCK)
- ret |= BIT_ULL(i);
+ bch2_print_str(c, KERN_NOTICE, buf.buf);
+ printbuf_exit(&buf);
return ret;
}
-static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
-{
- struct recovery_pass_fn *p = recovery_pass_fns + pass;
-
- if (c->opts.recovery_passes_exclude & BIT_ULL(pass))
- return false;
- if (c->opts.recovery_passes & BIT_ULL(pass))
- return true;
- if ((p->when & PASS_FSCK) && c->opts.fsck)
- return true;
- if ((p->when & PASS_UNCLEAN) && !c->sb.clean)
- return true;
- if (p->when & PASS_ALWAYS)
- return true;
- return false;
-}
-
static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
{
+ struct bch_fs_recovery *r = &c->recovery;
struct recovery_pass_fn *p = recovery_pass_fns + pass;
- int ret;
if (!(p->when & PASS_SILENT))
bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
bch2_recovery_passes[pass]);
- ret = p->fn(c);
- if (ret)
+
+ s64 start_time = ktime_get_real_seconds();
+ int ret = p->fn(c);
+
+ r->passes_to_run &= ~BIT_ULL(pass);
+
+ if (ret) {
+ r->passes_failing |= BIT_ULL(pass);
return ret;
+ }
+
+ r->passes_failing = 0;
+
+ if (!test_bit(BCH_FS_error, &c->flags))
+ bch2_sb_recovery_pass_complete(c, pass, start_time);
+
if (!(p->when & PASS_SILENT))
bch2_print(c, KERN_CONT " done\n");
return 0;
}
-int bch2_run_online_recovery_passes(struct bch_fs *c)
+static int __bch2_run_recovery_passes(struct bch_fs *c, u64 orig_passes_to_run,
+ bool online)
{
- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
- struct recovery_pass_fn *p = recovery_pass_fns + i;
-
- if (!(p->when & PASS_ONLINE))
- continue;
+ struct bch_fs_recovery *r = &c->recovery;
+ int ret = 0;
- int ret = bch2_run_recovery_pass(c, i);
- if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) {
- i = c->curr_recovery_pass;
- continue;
- }
- if (ret)
- return ret;
- }
+ spin_lock_irq(&r->lock);
- return 0;
-}
+ if (online)
+ orig_passes_to_run &= bch2_recovery_passes_match(PASS_ONLINE);
-int bch2_run_recovery_passes(struct bch_fs *c)
-{
- int ret = 0;
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))
+ orig_passes_to_run &= ~bch2_recovery_passes_match(PASS_ALLOC);
/*
- * We can't allow set_may_go_rw to be excluded; that would cause us to
- * use the journal replay keys for updates where it's not expected.
+ * A failed recovery pass will be retried after another pass succeeds -
+ * but not this iteration.
+ *
+ * This is because some passes depend on repair done by other passes: we
+ * may want to retry, but we don't want to loop on failing passes.
*/
- c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw;
- spin_lock_irq(&c->recovery_pass_lock);
+ orig_passes_to_run &= ~r->passes_failing;
- while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) {
- unsigned prev_done = c->recovery_pass_done;
- unsigned pass = c->curr_recovery_pass;
+ r->passes_to_run = orig_passes_to_run;
- c->next_recovery_pass = pass + 1;
+ while (r->passes_to_run) {
+ unsigned prev_done = r->pass_done;
+ unsigned pass = __ffs64(r->passes_to_run);
+ r->curr_pass = pass;
+ r->next_pass = r->curr_pass + 1;
+ r->passes_to_run &= ~BIT_ULL(pass);
- if (c->opts.recovery_pass_last &&
- c->curr_recovery_pass > c->opts.recovery_pass_last)
- break;
+ spin_unlock_irq(&r->lock);
+
+ int ret2 = bch2_run_recovery_pass(c, pass) ?:
+ bch2_journal_flush(&c->journal);
- if (should_run_recovery_pass(c, pass)) {
- spin_unlock_irq(&c->recovery_pass_lock);
- ret = bch2_run_recovery_pass(c, pass) ?:
- bch2_journal_flush(&c->journal);
-
- if (!ret && !test_bit(BCH_FS_error, &c->flags))
- bch2_clear_recovery_pass_required(c, pass);
- spin_lock_irq(&c->recovery_pass_lock);
-
- if (c->next_recovery_pass < c->curr_recovery_pass) {
- /*
- * bch2_run_explicit_recovery_pass() was called: we
- * can't always catch -BCH_ERR_restart_recovery because
- * it may have been called from another thread (btree
- * node read completion)
- */
- ret = 0;
- c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass);
- } else {
- c->recovery_passes_complete |= BIT_ULL(pass);
- c->recovery_pass_done = max(c->recovery_pass_done, pass);
- }
+ spin_lock_irq(&r->lock);
+
+ if (r->next_pass < r->curr_pass) {
+ /* Rewind: */
+ r->passes_to_run |= orig_passes_to_run & (~0ULL << r->next_pass);
+ } else if (!ret2) {
+ r->pass_done = max(r->pass_done, pass);
+ r->passes_complete |= BIT_ULL(pass);
+ } else {
+ ret = ret2;
}
- c->curr_recovery_pass = c->next_recovery_pass;
+ if (ret && !online)
+ break;
if (prev_done <= BCH_RECOVERY_PASS_check_snapshots &&
- c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots) {
+ r->pass_done > BCH_RECOVERY_PASS_check_snapshots) {
bch2_copygc_wakeup(c);
bch2_rebalance_wakeup(c);
}
}
- spin_unlock_irq(&c->recovery_pass_lock);
+ clear_bit(BCH_FS_in_recovery, &c->flags);
+ spin_unlock_irq(&r->lock);
return ret;
}
+
+static void bch2_async_recovery_passes_work(struct work_struct *work)
+{
+ struct bch_fs *c = container_of(work, struct bch_fs, recovery.work);
+ struct bch_fs_recovery *r = &c->recovery;
+
+ __bch2_run_recovery_passes(c,
+ c->sb.recovery_passes_required & ~r->passes_ratelimiting,
+ true);
+
+ up(&r->run_lock);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes);
+}
+
+int bch2_run_online_recovery_passes(struct bch_fs *c, u64 passes)
+{
+ return __bch2_run_recovery_passes(c, c->sb.recovery_passes_required|passes, true);
+}
+
+int bch2_run_recovery_passes(struct bch_fs *c, enum bch_recovery_pass from)
+{
+ u64 passes =
+ bch2_recovery_passes_match(PASS_ALWAYS) |
+ (!c->sb.clean ? bch2_recovery_passes_match(PASS_UNCLEAN) : 0) |
+ (c->opts.fsck ? bch2_recovery_passes_match(PASS_FSCK) : 0) |
+ c->opts.recovery_passes |
+ c->sb.recovery_passes_required;
+
+ if (c->opts.recovery_pass_last)
+ passes &= BIT_ULL(c->opts.recovery_pass_last + 1) - 1;
+
+ /*
+ * We can't allow set_may_go_rw to be excluded; that would cause us to
+ * use the journal replay keys for updates where it's not expected.
+ */
+ c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw;
+ passes &= ~c->opts.recovery_passes_exclude;
+
+ passes &= ~(BIT_ULL(from) - 1);
+
+ down(&c->recovery.run_lock);
+ int ret = __bch2_run_recovery_passes(c, passes, false);
+ up(&c->recovery.run_lock);
+
+ return ret;
+}
+
+static void prt_passes(struct printbuf *out, const char *msg, u64 passes)
+{
+ prt_printf(out, "%s:\t", msg);
+ prt_bitflags(out, bch2_recovery_passes, passes);
+ prt_newline(out);
+}
+
+void bch2_recovery_pass_status_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct bch_fs_recovery *r = &c->recovery;
+
+ printbuf_tabstop_push(out, 32);
+ prt_passes(out, "Scheduled passes", c->sb.recovery_passes_required);
+ prt_passes(out, "Scheduled online passes", c->sb.recovery_passes_required &
+ bch2_recovery_passes_match(PASS_ONLINE));
+ prt_passes(out, "Complete passes", r->passes_complete);
+ prt_passes(out, "Failing passes", r->passes_failing);
+
+ if (r->curr_pass) {
+ prt_printf(out, "Current pass:\t%s\n", bch2_recovery_passes[r->curr_pass]);
+ prt_passes(out, "Current passes", r->passes_to_run);
+ }
+}
+
+void bch2_fs_recovery_passes_init(struct bch_fs *c)
+{
+ spin_lock_init(&c->recovery.lock);
+ sema_init(&c->recovery.run_lock, 1);
+
+ INIT_WORK(&c->recovery.work, bch2_async_recovery_passes_work);
+}
diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h
index 7d7339c8fa29..260571c7105e 100644
--- a/fs/bcachefs/recovery_passes.h
+++ b/fs/bcachefs/recovery_passes.h
@@ -3,16 +3,37 @@
extern const char * const bch2_recovery_passes[];
+extern const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes;
+
u64 bch2_recovery_passes_to_stable(u64 v);
u64 bch2_recovery_passes_from_stable(u64 v);
u64 bch2_fsck_recovery_passes(void);
-int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
-int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *, enum bch_recovery_pass);
-int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass);
+void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *, enum bch_recovery_pass);
+
+enum bch_run_recovery_pass_flags {
+ RUN_RECOVERY_PASS_nopersistent = BIT(0),
+ RUN_RECOVERY_PASS_ratelimit = BIT(1),
+};
+
+int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
+
+int __bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *,
+ enum bch_recovery_pass,
+ enum bch_run_recovery_pass_flags);
+int bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *,
+ enum bch_recovery_pass,
+ enum bch_run_recovery_pass_flags);
+
+int bch2_require_recovery_pass(struct bch_fs *, struct printbuf *,
+ enum bch_recovery_pass);
+
+int bch2_run_online_recovery_passes(struct bch_fs *, u64);
+int bch2_run_recovery_passes(struct bch_fs *, enum bch_recovery_pass);
+
+void bch2_recovery_pass_status_to_text(struct printbuf *, struct bch_fs *);
-int bch2_run_online_recovery_passes(struct bch_fs *);
-int bch2_run_recovery_passes(struct bch_fs *);
+void bch2_fs_recovery_passes_init(struct bch_fs *);
#endif /* _BCACHEFS_RECOVERY_PASSES_H */
diff --git a/fs/bcachefs/recovery_passes_format.h b/fs/bcachefs/recovery_passes_format.h
new file mode 100644
index 000000000000..b63c20558d3d
--- /dev/null
+++ b/fs/bcachefs/recovery_passes_format.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_RECOVERY_PASSES_FORMAT_H
+#define _BCACHEFS_RECOVERY_PASSES_FORMAT_H
+
+#define PASS_SILENT BIT(0)
+#define PASS_FSCK BIT(1)
+#define PASS_UNCLEAN BIT(2)
+#define PASS_ALWAYS BIT(3)
+#define PASS_ONLINE BIT(4)
+#define PASS_ALLOC BIT(5)
+#define PASS_FSCK_ALLOC (PASS_FSCK|PASS_ALLOC)
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define PASS_FSCK_DEBUG BIT(1)
+#else
+#define PASS_FSCK_DEBUG 0
+#endif
+
+/*
+ * Passes may be reordered, but the second field is a persistent identifier and
+ * must never change:
+ */
+#define BCH_RECOVERY_PASSES() \
+ x(recovery_pass_empty, 41, PASS_SILENT) \
+ x(scan_for_btree_nodes, 37, 0) \
+ x(check_topology, 4, 0) \
+ x(accounting_read, 39, PASS_ALWAYS) \
+ x(alloc_read, 0, PASS_ALWAYS) \
+ x(stripes_read, 1, 0) \
+ x(initialize_subvolumes, 2, 0) \
+ x(snapshots_read, 3, PASS_ALWAYS) \
+ x(check_allocations, 5, PASS_FSCK_ALLOC) \
+ x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \
+ x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \
+ x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \
+ x(journal_replay, 9, PASS_ALWAYS) \
+ x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK_ALLOC) \
+ x(check_lrus, 11, PASS_ONLINE|PASS_FSCK_ALLOC) \
+ x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK_ALLOC) \
+ x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \
+ x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK_ALLOC) \
+ x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK_ALLOC) \
+ x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
+ x(bucket_gens_init, 17, 0) \
+ x(reconstruct_snapshots, 38, 0) \
+ x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
+ x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
+ x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
+ x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \
+ x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \
+ x(fs_upgrade_for_subvolumes, 22, 0) \
+ x(check_inodes, 24, PASS_FSCK) \
+ x(check_extents, 25, PASS_FSCK) \
+ x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \
+ x(check_dirents, 27, PASS_FSCK) \
+ x(check_xattrs, 28, PASS_FSCK) \
+ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
+ x(check_unreachable_inodes, 40, PASS_FSCK) \
+ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \
+ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
+ x(check_nlinks, 31, PASS_FSCK) \
+ x(check_rebalance_work, 43, PASS_ONLINE|PASS_FSCK) \
+ x(resume_logged_ops, 23, PASS_ALWAYS) \
+ x(delete_dead_inodes, 32, PASS_ALWAYS) \
+ x(fix_reflink_p, 33, 0) \
+ x(set_fs_needs_rebalance, 34, 0) \
+ x(lookup_root_inode, 42, PASS_ALWAYS|PASS_SILENT)
+
+/* We normally enumerate recovery passes in the order we run them: */
+enum bch_recovery_pass {
+#define x(n, id, when) BCH_RECOVERY_PASS_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+ BCH_RECOVERY_PASS_NR
+};
+
+/* But we also need stable identifiers that can be used in the superblock */
+enum bch_recovery_pass_stable {
+#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id,
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+struct recovery_pass_entry {
+ __le64 last_run;
+ __le32 last_runtime;
+ __le32 flags;
+};
+
+LE32_BITMASK(BCH_RECOVERY_PASS_NO_RATELIMIT, struct recovery_pass_entry, flags, 0, 1)
+
+struct bch_sb_field_recovery_passes {
+ struct bch_sb_field field;
+ struct recovery_pass_entry start[];
+};
+
+static inline unsigned
+recovery_passes_nr_entries(struct bch_sb_field_recovery_passes *r)
+{
+ return r
+ ? ((vstruct_end(&r->field) - (void *) &r->start[0]) /
+ sizeof(struct recovery_pass_entry))
+ : 0;
+}
+
+#endif /* _BCACHEFS_RECOVERY_PASSES_FORMAT_H */
diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h
index e89b9c783285..aa9526938cc3 100644
--- a/fs/bcachefs/recovery_passes_types.h
+++ b/fs/bcachefs/recovery_passes_types.h
@@ -2,79 +2,26 @@
#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H
#define _BCACHEFS_RECOVERY_PASSES_TYPES_H
-#define PASS_SILENT BIT(0)
-#define PASS_FSCK BIT(1)
-#define PASS_UNCLEAN BIT(2)
-#define PASS_ALWAYS BIT(3)
-#define PASS_ONLINE BIT(4)
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define PASS_FSCK_DEBUG BIT(1)
-#else
-#define PASS_FSCK_DEBUG 0
-#endif
-
-/*
- * Passes may be reordered, but the second field is a persistent identifier and
- * must never change:
- */
-#define BCH_RECOVERY_PASSES() \
- x(recovery_pass_empty, 41, PASS_SILENT) \
- x(scan_for_btree_nodes, 37, 0) \
- x(check_topology, 4, 0) \
- x(accounting_read, 39, PASS_ALWAYS) \
- x(alloc_read, 0, PASS_ALWAYS) \
- x(stripes_read, 1, 0) \
- x(initialize_subvolumes, 2, 0) \
- x(snapshots_read, 3, PASS_ALWAYS) \
- x(check_allocations, 5, PASS_FSCK) \
- x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \
- x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \
- x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \
- x(journal_replay, 9, PASS_ALWAYS) \
- x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \
- x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \
- x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \
- x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \
- x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \
- x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \
- x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
- x(bucket_gens_init, 17, 0) \
- x(reconstruct_snapshots, 38, 0) \
- x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
- x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
- x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
- x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \
- x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \
- x(fs_upgrade_for_subvolumes, 22, 0) \
- x(check_inodes, 24, PASS_FSCK) \
- x(check_extents, 25, PASS_FSCK) \
- x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \
- x(check_dirents, 27, PASS_FSCK) \
- x(check_xattrs, 28, PASS_FSCK) \
- x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
- x(check_unreachable_inodes, 40, PASS_FSCK) \
- x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \
- x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
- x(check_nlinks, 31, PASS_FSCK) \
- x(resume_logged_ops, 23, PASS_ALWAYS) \
- x(delete_dead_inodes, 32, PASS_ALWAYS) \
- x(fix_reflink_p, 33, 0) \
- x(set_fs_needs_rebalance, 34, 0)
-
-/* We normally enumerate recovery passes in the order we run them: */
-enum bch_recovery_pass {
-#define x(n, id, when) BCH_RECOVERY_PASS_##n,
- BCH_RECOVERY_PASSES()
-#undef x
- BCH_RECOVERY_PASS_NR
-};
-
-/* But we also need stable identifiers that can be used in the superblock */
-enum bch_recovery_pass_stable {
-#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id,
- BCH_RECOVERY_PASSES()
-#undef x
+struct bch_fs_recovery {
+ /*
+ * Two different uses:
+ * "Has this fsck pass?" - i.e. should this type of error be an
+ * emergency read-only
+ * And, in certain situations fsck will rewind to an earlier pass: used
+ * for signaling to the toplevel code which pass we want to run now.
+ */
+ enum bch_recovery_pass curr_pass;
+ enum bch_recovery_pass next_pass;
+ /* never rewinds version of curr_pass */
+ enum bch_recovery_pass pass_done;
+ u64 passes_to_run;
+ /* bitmask of recovery passes that we actually ran */
+ u64 passes_complete;
+ u64 passes_failing;
+ u64 passes_ratelimiting;
+ spinlock_t lock;
+ struct semaphore run_lock;
+ struct work_struct work;
};
#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index 710178e3da4c..a535abd44df3 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -3,6 +3,7 @@
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
@@ -311,7 +312,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
if (!bkey_refcount_c(k)) {
if (!(flags & BTREE_TRIGGER_overwrite))
- ret = -BCH_ERR_missing_indirect_extent;
+ ret = bch_err_throw(c, missing_indirect_extent);
goto next;
}
@@ -610,8 +611,8 @@ s64 bch2_remap_range(struct bch_fs *c,
!bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
int ret = 0, ret2 = 0;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink))
- return -BCH_ERR_erofs_no_writes;
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_reflink))
+ return bch_err_throw(c, erofs_no_writes);
bch2_check_set_feature(c, BCH_FEATURE_reflink);
@@ -710,7 +711,8 @@ s64 bch2_remap_range(struct bch_fs *c,
SET_REFLINK_P_IDX(&dst_p->v, offset);
if (reflink_p_may_update_opts_field &&
- may_change_src_io_path_opts)
+ may_change_src_io_path_opts &&
+ REFLINK_P_MAY_UPDATE_OPTIONS(src_p.v))
SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true);
} else {
BUG();
@@ -761,7 +763,7 @@ err:
bch2_bkey_buf_exit(&new_src, c);
bch2_bkey_buf_exit(&new_dst, c);
- bch2_write_ref_put(c, BCH_WRITE_REF_reflink);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_reflink);
return dst_done ?: ret ?: ret2;
}
@@ -846,7 +848,7 @@ int bch2_gc_reflink_start(struct bch_fs *c)
struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table,
c->reflink_gc_nr++, GFP_KERNEL);
if (!r) {
- ret = -BCH_ERR_ENOMEM_gc_reflink_start;
+ ret = bch_err_throw(c, ENOMEM_gc_reflink_start);
break;
}
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 477ef0997949..8383bd7fdb3f 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -119,7 +119,7 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
return 0;
bad:
bch2_replicas_entry_to_text(err, r);
- return -BCH_ERR_invalid_replicas_entry;
+ return bch_err_throw(c, invalid_replicas_entry);
}
void bch2_cpu_replicas_to_text(struct printbuf *out,
@@ -311,7 +311,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
!__replicas_has_entry(&c->replicas_gc, new_entry)) {
new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
if (!new_gc.entries) {
- ret = -BCH_ERR_ENOMEM_cpu_replicas;
+ ret = bch_err_throw(c, ENOMEM_cpu_replicas);
goto err;
}
}
@@ -319,7 +319,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
if (!__replicas_has_entry(&c->replicas, new_entry)) {
new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
if (!new_r.entries) {
- ret = -BCH_ERR_ENOMEM_cpu_replicas;
+ ret = bch_err_throw(c, ENOMEM_cpu_replicas);
goto err;
}
@@ -422,7 +422,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
if (!c->replicas_gc.entries) {
mutex_unlock(&c->sb_lock);
bch_err(c, "error allocating c->replicas_gc");
- return -BCH_ERR_ENOMEM_replicas_gc;
+ return bch_err_throw(c, ENOMEM_replicas_gc);
}
for_each_cpu_replicas_entry(&c->replicas, e)
@@ -458,7 +458,7 @@ retry:
new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL);
if (!new.entries) {
bch_err(c, "error allocating c->replicas_gc");
- return -BCH_ERR_ENOMEM_replicas_gc;
+ return bch_err_throw(c, ENOMEM_replicas_gc);
}
mutex_lock(&c->sb_lock);
@@ -622,7 +622,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
DIV_ROUND_UP(bytes, sizeof(u64)));
if (!sb_r)
- return -BCH_ERR_ENOSPC_sb_replicas;
+ return bch_err_throw(c, ENOSPC_sb_replicas);
bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);
@@ -667,7 +667,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
DIV_ROUND_UP(bytes, sizeof(u64)));
if (!sb_r)
- return -BCH_ERR_ENOSPC_sb_replicas;
+ return bch_err_throw(c, ENOSPC_sb_replicas);
bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);
@@ -819,19 +819,18 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
if (e->data_type == BCH_DATA_cached)
continue;
- rcu_read_lock();
- for (unsigned i = 0; i < e->nr_devs; i++) {
- if (e->devs[i] == BCH_SB_MEMBER_INVALID) {
- nr_failed++;
- continue;
- }
+ scoped_guard(rcu)
+ for (unsigned i = 0; i < e->nr_devs; i++) {
+ if (e->devs[i] == BCH_SB_MEMBER_INVALID) {
+ nr_failed++;
+ continue;
+ }
- nr_online += test_bit(e->devs[i], devs.d);
+ nr_online += test_bit(e->devs[i], devs.d);
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
- nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
- }
- rcu_read_unlock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
+ nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
+ }
if (nr_online + nr_failed == e->nr_devs)
continue;
diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h
index fa27ec59a647..b868702a431a 100644
--- a/fs/bcachefs/sb-counters_format.h
+++ b/fs/bcachefs/sb-counters_format.h
@@ -16,6 +16,7 @@ enum counters_flags {
x(io_read_split, 33, TYPE_COUNTER) \
x(io_read_reuse_race, 34, TYPE_COUNTER) \
x(io_read_retry, 32, TYPE_COUNTER) \
+ x(io_read_fail_and_poison, 82, TYPE_COUNTER) \
x(io_write, 1, TYPE_SECTORS) \
x(io_move, 2, TYPE_SECTORS) \
x(io_move_read, 35, TYPE_SECTORS) \
@@ -24,6 +25,8 @@ enum counters_flags {
x(io_move_fail, 38, TYPE_COUNTER) \
x(io_move_write_fail, 82, TYPE_COUNTER) \
x(io_move_start_fail, 39, TYPE_COUNTER) \
+ x(io_move_created_rebalance, 83, TYPE_COUNTER) \
+ x(io_move_evacuate_bucket, 84, TYPE_COUNTER) \
x(bucket_invalidate, 3, TYPE_COUNTER) \
x(bucket_discard, 4, TYPE_COUNTER) \
x(bucket_discard_fast, 79, TYPE_COUNTER) \
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index badd0e17ada5..b61f88450a6d 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -100,7 +100,11 @@
BCH_FSCK_ERR_ptr_to_missing_backpointer) \
x(stripe_backpointers, \
BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
- BCH_FSCK_ERR_ptr_to_missing_backpointer)
+ BCH_FSCK_ERR_ptr_to_missing_backpointer) \
+ x(inode_has_case_insensitive, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
+ BCH_FSCK_ERR_inode_has_case_insensitive_not_set, \
+ BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set)
#define DOWNGRADE_TABLE() \
x(bucket_stripe_sectors, \
@@ -374,6 +378,9 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version)))
continue;
+ if (src->version < c->sb.version_incompat)
+ continue;
+
struct bch_sb_field_downgrade_entry *dst;
unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * src->nr_errors;
@@ -410,7 +417,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
d = bch2_sb_field_resize(&c->disk_sb, downgrade, sb_u64s);
if (!d) {
- ret = -BCH_ERR_ENOSPC_sb_downgrade;
+ ret = bch_err_throw(c, ENOSPC_sb_downgrade);
goto out;
}
diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c
index 013a96883b4e..48853efdc105 100644
--- a/fs/bcachefs/sb-errors.c
+++ b/fs/bcachefs/sb-errors.c
@@ -78,6 +78,28 @@ const struct bch_sb_field_ops bch_sb_field_ops_errors = {
.to_text = bch2_sb_errors_to_text,
};
+void bch2_fs_errors_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ if (out->nr_tabstops < 1)
+ printbuf_tabstop_push(out, 48);
+ if (out->nr_tabstops < 2)
+ printbuf_tabstop_push(out, 8);
+ if (out->nr_tabstops < 3)
+ printbuf_tabstop_push(out, 16);
+
+ guard(mutex)(&c->fsck_error_counts_lock);
+
+ bch_sb_errors_cpu *e = &c->fsck_error_counts;
+ darray_for_each(*e, i) {
+ bch2_sb_error_id_to_text(out, i->id);
+ prt_tab(out);
+ prt_u64(out, i->nr);
+ prt_tab(out);
+ bch2_prt_datetime(out, i->last_error_time);
+ prt_newline(out);
+ }
+}
+
void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err)
{
bch_sb_errors_cpu *e = &c->fsck_error_counts;
diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h
index b2357b8e6107..e86267264692 100644
--- a/fs/bcachefs/sb-errors.h
+++ b/fs/bcachefs/sb-errors.h
@@ -7,6 +7,7 @@
extern const char * const bch2_sb_error_strs[];
void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id);
+void bch2_fs_errors_to_text(struct printbuf *, struct bch_fs *);
extern const struct bch_sb_field_ops bch_sb_field_ops_errors;
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index 4036a20c6adc..6fdbf265e4c0 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -209,7 +209,7 @@ enum bch_fsck_flags {
x(subvol_to_missing_root, 188, 0) \
x(subvol_root_wrong_bi_subvol, 189, FSCK_AUTOFIX) \
x(bkey_in_missing_snapshot, 190, 0) \
- x(bkey_in_deleted_snapshot, 315, 0) \
+ x(bkey_in_deleted_snapshot, 315, FSCK_AUTOFIX) \
x(inode_pos_inode_nonzero, 191, 0) \
x(inode_pos_blockdev_range, 192, 0) \
x(inode_alloc_cursor_inode_bad, 301, 0) \
@@ -232,6 +232,7 @@ enum bch_fsck_flags {
x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \
x(inode_dir_missing_backpointer, 284, FSCK_AUTOFIX) \
x(inode_dir_unlinked_but_not_empty, 286, FSCK_AUTOFIX) \
+ x(inode_dir_has_nonzero_i_size, 319, FSCK_AUTOFIX) \
x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \
x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \
x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \
@@ -243,6 +244,7 @@ enum bch_fsck_flags {
x(inode_parent_has_case_insensitive_not_set, 317, FSCK_AUTOFIX) \
x(vfs_inode_i_blocks_underflow, 311, FSCK_AUTOFIX) \
x(vfs_inode_i_blocks_not_zero_at_truncate, 313, FSCK_AUTOFIX) \
+ x(vfs_bad_inode_rm, 320, 0) \
x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \
x(deleted_inode_missing, 212, FSCK_AUTOFIX) \
x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \
@@ -328,7 +330,7 @@ enum bch_fsck_flags {
x(dirent_stray_data_after_cf_name, 305, 0) \
x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \
x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \
- x(MAX, 319, 0)
+ x(MAX, 321, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index 72779912939b..363eb0c6eb7c 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -5,11 +5,31 @@
#include "disk_groups.h"
#include "error.h"
#include "opts.h"
+#include "recovery_passes.h"
#include "replicas.h"
#include "sb-members.h"
#include "super-io.h"
-void bch2_dev_missing(struct bch_fs *c, unsigned dev)
+int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev)
+{
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+
+ prt_printf(&buf, "pointer to nonexistent device %u in key\n", dev);
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bool print = bch2_count_fsck_err(c, ptr_to_invalid_device, &buf);
+
+ int ret = bch2_run_explicit_recovery_pass(c, &buf,
+ BCH_RECOVERY_PASS_check_allocations, 0);
+
+ if (print)
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+void bch2_dev_missing_atomic(struct bch_fs *c, unsigned dev)
{
if (dev != BCH_SB_MEMBER_INVALID)
bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev);
@@ -81,7 +101,7 @@ static int sb_members_v2_resize_entries(struct bch_fs *c)
mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
if (!mi)
- return -BCH_ERR_ENOSPC_sb_members_v2;
+ return bch_err_throw(c, ENOSPC_sb_members_v2);
for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) {
void *dst = (void *) mi->_members + (i * sizeof(struct bch_member));
@@ -119,6 +139,11 @@ int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb)
struct bch_sb_field_members_v1 *mi1;
struct bch_sb_field_members_v2 *mi2;
+ if (BCH_SB_VERSION_INCOMPAT(disk_sb->sb) > bcachefs_metadata_version_extent_flags) {
+ bch2_sb_field_resize(disk_sb, members_v1, 0);
+ return 0;
+ }
+
mi1 = bch2_sb_field_resize(disk_sb, members_v1,
DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES *
disk_sb->sb->nr_devices, sizeof(u64)));
@@ -170,6 +195,12 @@ static int validate_member(struct printbuf *err,
return -BCH_ERR_invalid_sb_members;
}
+ if (BCH_MEMBER_FREESPACE_INITIALIZED(&m) &&
+ sb->features[0] & cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info))) {
+ prt_printf(err, "device %u: freespace initialized but fs has no alloc info", i);
+ return -BCH_ERR_invalid_sb_members;
+ }
+
return 0;
}
@@ -191,17 +222,11 @@ static void member_to_text(struct printbuf *out,
printbuf_indent_add(out, 2);
prt_printf(out, "Label:\t");
- if (BCH_MEMBER_GROUP(&m)) {
- unsigned idx = BCH_MEMBER_GROUP(&m) - 1;
-
- if (idx < disk_groups_nr(gi))
- prt_printf(out, "%s (%u)",
- gi->entries[idx].label, idx);
- else
- prt_printf(out, "(bad disk labels section)");
- } else {
+ if (BCH_MEMBER_GROUP(&m))
+ bch2_disk_path_to_text_sb(out, sb,
+ BCH_MEMBER_GROUP(&m) - 1);
+ else
prt_printf(out, "(none)");
- }
prt_newline(out);
prt_printf(out, "UUID:\t");
@@ -268,6 +293,7 @@ static void member_to_text(struct printbuf *out,
prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m));
prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m));
+ prt_printf(out, "Resize on mount:\t%llu\n", BCH_MEMBER_RESIZE_ON_MOUNT(&m));
printbuf_indent_sub(out, 2);
}
@@ -352,14 +378,13 @@ void bch2_sb_members_from_cpu(struct bch_fs *c)
{
struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- rcu_read_lock();
+ guard(rcu)();
for_each_member_device_rcu(c, ca, NULL) {
struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx);
for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++)
m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e]));
}
- rcu_read_unlock();
}
void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca)
@@ -417,20 +442,14 @@ void bch2_dev_errors_reset(struct bch_dev *ca)
bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k)
{
- bool ret = true;
- rcu_read_lock();
+ guard(rcu)();
bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (!ca)
- continue;
-
- if (!bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) {
- ret = false;
- break;
- }
+ if (ca &&
+ !bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c)))
+ return false;
}
- rcu_read_unlock();
- return ret;
+ return true;
}
static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev,
@@ -493,6 +512,7 @@ int bch2_sb_member_alloc(struct bch_fs *c)
unsigned u64s;
int best = -1;
u64 best_last_mount = 0;
+ unsigned nr_deleted = 0;
if (dev_idx < BCH_SB_MEMBERS_MAX)
goto have_slot;
@@ -503,7 +523,10 @@ int bch2_sb_member_alloc(struct bch_fs *c)
continue;
struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
- if (bch2_member_alive(&m))
+
+ nr_deleted += uuid_equal(&m.uuid, &BCH_SB_MEMBER_DELETED_UUID);
+
+ if (!bch2_is_zero(&m.uuid, sizeof(m.uuid)))
continue;
u64 last_mount = le64_to_cpu(m.last_mount);
@@ -517,6 +540,10 @@ int bch2_sb_member_alloc(struct bch_fs *c)
goto have_slot;
}
+ if (nr_deleted)
+ bch_err(c, "unable to allocate new member, but have %u deleted: run fsck",
+ nr_deleted);
+
return -BCH_ERR_ENOSPC_sb_members;
have_slot:
nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
@@ -532,3 +559,22 @@ have_slot:
c->disk_sb.sb->nr_devices = nr_devices;
return dev_idx;
}
+
+void bch2_sb_members_clean_deleted(struct bch_fs *c)
+{
+ mutex_lock(&c->sb_lock);
+ bool write_sb = false;
+
+ for (unsigned i = 0; i < c->sb.nr_devices; i++) {
+ struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, i);
+
+ if (uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID)) {
+ memset(&m->uuid, 0, sizeof(m->uuid));
+ write_sb = true;
+ }
+ }
+
+ if (write_sb)
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+}
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index 42786657522c..8d8a8a857648 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -4,6 +4,7 @@
#include "darray.h"
#include "bkey_types.h"
+#include "enumerated_ref.h"
extern char * const bch2_member_error_strs[];
@@ -20,19 +21,16 @@ struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i);
static inline bool bch2_dev_is_online(struct bch_dev *ca)
{
- return !percpu_ref_is_zero(&ca->io_ref[READ]);
+ return !enumerated_ref_is_zero(&ca->io_ref[READ]);
}
static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned);
static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev)
{
- rcu_read_lock();
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu(c, dev);
- bool ret = ca && bch2_dev_is_online(ca);
- rcu_read_unlock();
-
- return ret;
+ return ca && bch2_dev_is_online(ca);
}
static inline bool bch2_dev_is_healthy(struct bch_dev *ca)
@@ -104,6 +102,12 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev *
for (struct bch_dev *_ca = NULL; \
(_ca = __bch2_next_dev((_c), _ca, (_mask)));)
+#define for_each_online_member_rcu(_c, _ca) \
+ for_each_member_device_rcu(_c, _ca, &(_c)->online_devs)
+
+#define for_each_rw_member_rcu(_c, _ca) \
+ for_each_member_device_rcu(_c, _ca, &(_c)->rw_devs[BCH_DATA_free])
+
static inline void bch2_dev_get(struct bch_dev *ca)
{
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -135,12 +139,10 @@ static inline void bch2_dev_put(struct bch_dev *ca)
static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca)
{
- rcu_read_lock();
+ guard(rcu)();
bch2_dev_put(ca);
if ((ca = __bch2_next_dev(c, ca, NULL)))
bch2_dev_get(ca);
- rcu_read_unlock();
-
return ca;
}
@@ -157,33 +159,32 @@ static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev
static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
struct bch_dev *ca,
unsigned state_mask,
- int rw)
+ int rw, unsigned ref_idx)
{
- rcu_read_lock();
+ guard(rcu)();
if (ca)
- percpu_ref_put(&ca->io_ref[rw]);
+ enumerated_ref_put(&ca->io_ref[rw], ref_idx);
while ((ca = __bch2_next_dev(c, ca, NULL)) &&
(!((1 << ca->mi.state) & state_mask) ||
- !percpu_ref_tryget(&ca->io_ref[rw])))
+ !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx)))
;
- rcu_read_unlock();
return ca;
}
-#define __for_each_online_member(_c, _ca, state_mask, rw) \
+#define __for_each_online_member(_c, _ca, state_mask, rw, ref_idx) \
for (struct bch_dev *_ca = NULL; \
- (_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw));)
+ (_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw, ref_idx));)
-#define for_each_online_member(c, ca) \
- __for_each_online_member(c, ca, ~0, READ)
+#define for_each_online_member(c, ca, ref_idx) \
+ __for_each_online_member(c, ca, ~0, READ, ref_idx)
-#define for_each_rw_member(c, ca) \
- __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE)
+#define for_each_rw_member(c, ca, ref_idx) \
+ __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE, ref_idx)
-#define for_each_readable_member(c, ca) \
- __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ)
+#define for_each_readable_member(c, ca, ref_idx) \
+ __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ, ref_idx)
static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev)
{
@@ -218,23 +219,24 @@ static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned de
: NULL;
}
-void bch2_dev_missing(struct bch_fs *, unsigned);
+int bch2_dev_missing_bkey(struct bch_fs *, struct bkey_s_c, unsigned);
+
+void bch2_dev_missing_atomic(struct bch_fs *, unsigned);
static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev)
{
struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
if (unlikely(!ca))
- bch2_dev_missing(c, dev);
+ bch2_dev_missing_atomic(c, dev);
return ca;
}
static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev)
{
- rcu_read_lock();
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
if (ca)
bch2_dev_get(ca);
- rcu_read_unlock();
return ca;
}
@@ -242,7 +244,7 @@ static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev)
{
struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev);
if (unlikely(!ca))
- bch2_dev_missing(c, dev);
+ bch2_dev_missing_atomic(c, dev);
return ca;
}
@@ -285,43 +287,31 @@ static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev
return bch2_dev_tryget(c, dev_idx);
}
-static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw)
+static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev,
+ int rw, unsigned ref_idx)
{
might_sleep();
- rcu_read_lock();
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu(c, dev);
- if (ca && !percpu_ref_tryget(&ca->io_ref[rw]))
- ca = NULL;
- rcu_read_unlock();
+ if (!ca || !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx))
+ return NULL;
- if (ca &&
- (ca->mi.state == BCH_MEMBER_STATE_rw ||
- (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)))
+ if (ca->mi.state == BCH_MEMBER_STATE_rw ||
+ (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
return ca;
- if (ca)
- percpu_ref_put(&ca->io_ref[rw]);
+ enumerated_ref_put(&ca->io_ref[rw], ref_idx);
return NULL;
}
-/* XXX kill, move to struct bch_fs */
-static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
-{
- struct bch_devs_mask devs;
-
- memset(&devs, 0, sizeof(devs));
- for_each_online_member(c, ca)
- __set_bit(ca->dev_idx, devs.d);
- return devs;
-}
-
extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1;
extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2;
static inline bool bch2_member_alive(struct bch_member *m)
{
- return !bch2_is_zero(&m->uuid, sizeof(m->uuid));
+ return !bch2_is_zero(&m->uuid, sizeof(m->uuid)) &&
+ !uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID);
}
static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev)
@@ -351,6 +341,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
? BCH_MEMBER_DURABILITY(mi) - 1
: 1,
.freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
+ .resize_on_mount = BCH_MEMBER_RESIZE_ON_MOUNT(mi),
.valid = bch2_member_alive(mi),
.btree_bitmap_shift = mi->btree_bitmap_shift,
.btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap),
@@ -381,5 +372,6 @@ bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c);
void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c);
int bch2_sb_member_alloc(struct bch_fs *);
+void bch2_sb_members_clean_deleted(struct bch_fs *);
#endif /* _BCACHEFS_SB_MEMBERS_H */
diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h
index 3affec823b3f..fb72ad730518 100644
--- a/fs/bcachefs/sb-members_format.h
+++ b/fs/bcachefs/sb-members_format.h
@@ -13,6 +13,10 @@
*/
#define BCH_SB_MEMBER_INVALID 255
+#define BCH_SB_MEMBER_DELETED_UUID \
+ UUID_INIT(0xffffffff, 0xffff, 0xffff, \
+ 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
+
#define BCH_MIN_NR_NBUCKETS (1 << 6)
#define BCH_IOPS_MEASUREMENTS() \
@@ -88,6 +92,8 @@ LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28)
LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30)
LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
struct bch_member, flags, 30, 31)
+LE64_BITMASK(BCH_MEMBER_RESIZE_ON_MOUNT,
+ struct bch_member, flags, 31, 32)
#if 0
LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h
index c0eda888fe39..d6443e186872 100644
--- a/fs/bcachefs/sb-members_types.h
+++ b/fs/bcachefs/sb-members_types.h
@@ -13,6 +13,7 @@ struct bch_member_cpu {
u8 data_allowed;
u8 durability;
u8 freespace_initialized;
+ u8 resize_on_mount;
u8 valid;
u8 btree_bitmap_shift;
u64 btree_allocated_bitmap;
diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c
index 7c403427fbdb..538c324f4765 100644
--- a/fs/bcachefs/six.c
+++ b/fs/bcachefs/six.c
@@ -339,12 +339,9 @@ static inline bool six_owner_running(struct six_lock *lock)
* acquiring the lock and setting the owner field. If we're an RT task
* that will live-lock because we won't let the owner complete.
*/
- rcu_read_lock();
+ guard(rcu)();
struct task_struct *owner = READ_ONCE(lock->owner);
- bool ret = owner ? owner_on_cpu(owner) : !rt_or_dl_task(current);
- rcu_read_unlock();
-
- return ret;
+ return owner ? owner_on_cpu(owner) : !rt_or_dl_task(current);
}
static inline bool six_optimistic_spin(struct six_lock *lock,
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index fec569c7deb1..23a332d76b32 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -1,11 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "buckets.h"
+#include "enumerated_ref.h"
#include "errcode.h"
#include "error.h"
#include "fs.h"
@@ -52,7 +54,7 @@ int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
BTREE_ITER_with_updates, snapshot_tree, s);
if (bch2_err_matches(ret, ENOENT))
- ret = -BCH_ERR_ENOENT_snapshot_tree;
+ ret = bch_err_throw(trans->c, ENOENT_snapshot_tree);
return ret;
}
@@ -65,7 +67,7 @@ __bch2_snapshot_tree_create(struct btree_trans *trans)
struct bkey_i_snapshot_tree *s_t;
if (ret == -BCH_ERR_ENOSPC_btree_slot)
- ret = -BCH_ERR_ENOSPC_snapshot_tree;
+ ret = bch_err_throw(trans->c, ENOSPC_snapshot_tree);
if (ret)
return ERR_PTR(ret);
@@ -103,11 +105,8 @@ static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id,
static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
{
- rcu_read_lock();
- bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
- rcu_read_unlock();
-
- return ret;
+ guard(rcu)();
+ return __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
}
static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
@@ -138,13 +137,11 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
bool ret;
- rcu_read_lock();
+ guard(rcu)();
struct snapshot_table *t = rcu_dereference(c->snapshots);
- if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
- ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
- goto out;
- }
+ if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots))
+ return __bch2_snapshot_is_ancestor_early(t, id, ancestor);
if (likely(ancestor >= IS_ANCESTOR_BITMAP))
while (id && id < ancestor - IS_ANCESTOR_BITMAP)
@@ -155,9 +152,6 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
: id == ancestor;
EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
-out:
- rcu_read_unlock();
-
return ret;
}
@@ -209,9 +203,14 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
- prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
- BCH_SNAPSHOT_SUBVOL(s.v),
- BCH_SNAPSHOT_DELETED(s.v),
+ if (BCH_SNAPSHOT_SUBVOL(s.v))
+ prt_str(out, "subvol ");
+ if (BCH_SNAPSHOT_WILL_DELETE(s.v))
+ prt_str(out, "will_delete ");
+ if (BCH_SNAPSHOT_DELETED(s.v))
+ prt_str(out, "deleted ");
+
+ prt_printf(out, "parent %10u children %10u %10u subvol %u tree %u",
le32_to_cpu(s.v->parent),
le32_to_cpu(s.v->children[0]),
le32_to_cpu(s.v->children[1]),
@@ -281,6 +280,16 @@ fsck_err:
return ret;
}
+static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id)
+{
+ mutex_lock(&c->snapshot_table_lock);
+ int ret = snapshot_t_mut(c, id)
+ ? 0
+ : bch_err_throw(c, ENOMEM_mark_snapshot);
+ mutex_unlock(&c->snapshot_table_lock);
+ return ret;
+}
+
static int __bch2_mark_snapshot(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
@@ -295,14 +304,16 @@ static int __bch2_mark_snapshot(struct btree_trans *trans,
t = snapshot_t_mut(c, id);
if (!t) {
- ret = -BCH_ERR_ENOMEM_mark_snapshot;
+ ret = bch_err_throw(c, ENOMEM_mark_snapshot);
goto err;
}
if (new.k->type == KEY_TYPE_snapshot) {
struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
- t->live = true;
+ t->state = !BCH_SNAPSHOT_DELETED(s.v)
+ ? SNAPSHOT_ID_live
+ : SNAPSHOT_ID_deleted;
t->parent = le32_to_cpu(s.v->parent);
t->children[0] = le32_to_cpu(s.v->children[0]);
t->children[1] = le32_to_cpu(s.v->children[1]);
@@ -327,9 +338,9 @@ static int __bch2_mark_snapshot(struct btree_trans *trans,
parent - id - 1 < IS_ANCESTOR_BITMAP)
__set_bit(parent - id - 1, t->is_ancestor);
- if (BCH_SNAPSHOT_DELETED(s.v)) {
+ if (BCH_SNAPSHOT_WILL_DELETE(s.v)) {
set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
+ if (c->recovery.pass_done > BCH_RECOVERY_PASS_delete_dead_snapshots)
bch2_delete_dead_snapshots_async(c);
}
} else {
@@ -390,21 +401,29 @@ static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
return 0;
}
-u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
+u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root,
+ snapshot_id_list *skip)
{
- u32 id = snapshot_root;
- u32 subvol = 0, s;
-
- rcu_read_lock();
+ guard(rcu)();
+ u32 id, subvol = 0, s;
+retry:
+ id = snapshot_root;
while (id && bch2_snapshot_exists(c, id)) {
- s = snapshot_t(c, id)->subvol;
-
- if (s && (!subvol || s < subvol))
- subvol = s;
+ if (!(skip && snapshot_list_has_id(skip, id))) {
+ s = snapshot_t(c, id)->subvol;
+ if (s && (!subvol || s < subvol))
+ subvol = s;
+ }
id = bch2_snapshot_tree_next(c, id);
+ if (id == snapshot_root)
+ break;
+ }
+
+ if (!subvol && skip) {
+ skip = NULL;
+ goto retry;
}
- rcu_read_unlock();
return subvol;
}
@@ -437,7 +456,7 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
if (!ret && !found) {
struct bkey_i_subvolume *u;
- *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);
+ *subvol_id = bch2_snapshot_oldest_subvol(c, snapshot_root, NULL);
u = bch2_bkey_get_mut_typed(trans, &iter,
BTREE_ID_subvolumes, POS(0, *subvol_id),
@@ -589,18 +608,14 @@ static int snapshot_tree_ptr_good(struct btree_trans *trans,
u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
{
- const struct snapshot_t *s;
-
if (!id)
return 0;
- rcu_read_lock();
- s = snapshot_t(c, id);
- if (s->parent)
- id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
- rcu_read_unlock();
-
- return id;
+ guard(rcu)();
+ const struct snapshot_t *s = snapshot_t(c, id);
+ return s->parent
+ ? bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth))
+ : id;
}
static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
@@ -654,7 +669,7 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans,
u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
ret = PTR_ERR_OR_ZERO(u) ?:
bch2_snapshot_tree_create(trans, root_id,
- bch2_snapshot_tree_oldest_subvol(c, root_id),
+ bch2_snapshot_oldest_subvol(c, root_id, NULL),
&tree_id);
if (ret)
goto err;
@@ -699,6 +714,9 @@ static int check_snapshot(struct btree_trans *trans,
memset(&s, 0, sizeof(s));
memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));
+ if (BCH_SNAPSHOT_DELETED(&s))
+ return 0;
+
id = le32_to_cpu(s.parent);
if (id) {
ret = bch2_snapshot_lookup(trans, id, &v);
@@ -736,7 +754,7 @@ static int check_snapshot(struct btree_trans *trans,
}
bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
- !BCH_SNAPSHOT_DELETED(&s);
+ !BCH_SNAPSHOT_WILL_DELETE(&s);
if (should_have_subvol) {
id = le32_to_cpu(s.subvol);
@@ -887,9 +905,8 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
}
bch2_trans_iter_exit(trans, &iter);
- return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
- bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
- bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0);
+ return bch2_snapshot_table_make_room(c, id) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0);
}
/* Figure out which snapshot nodes belong in the same tree: */
@@ -917,10 +934,7 @@ static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpo
static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
{
- darray_for_each(*l, i)
- if (snapshot_list_has_id(r, *i))
- return true;
- return false;
+ return darray_find_p(*l, i, snapshot_list_has_id(r, *i)) != NULL;
}
static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
@@ -987,12 +1001,12 @@ int bch2_reconstruct_snapshots(struct bch_fs *c)
snapshot_id_list_to_text(&buf, t);
darray_for_each(*t, id) {
- if (fsck_err_on(!bch2_snapshot_exists(c, *id),
+ if (fsck_err_on(bch2_snapshot_id_state(c, *id) == SNAPSHOT_ID_empty,
trans, snapshot_node_missing,
"snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
if (t->nr > 1) {
bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
- ret = -BCH_ERR_fsck_repair_unimplemented;
+ ret = bch_err_throw(c, fsck_repair_unimplemented);
goto err;
}
@@ -1012,27 +1026,92 @@ err:
return ret;
}
-int bch2_check_key_has_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
+int __bch2_check_key_has_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
int ret = 0;
+ enum snapshot_id_state state = bch2_snapshot_id_state(c, k.k->p.snapshot);
- if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot),
- trans, bkey_in_missing_snapshot,
- "key in missing snapshot %s, delete?",
+ /* Snapshot was definitively deleted, this error is marked autofix */
+ if (fsck_err_on(state == SNAPSHOT_ID_deleted,
+ trans, bkey_in_deleted_snapshot,
+ "key in deleted snapshot %s, delete?",
(bch2_btree_id_to_text(&buf, iter->btree_id),
prt_char(&buf, ' '),
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_internal_snapshot_node) ?: 1;
+ BTREE_UPDATE_internal_snapshot_node) ?: 1;
+
+ if (state == SNAPSHOT_ID_empty) {
+ /*
+ * Snapshot missing: we should have caught this with btree_lost_data and
+ * kicked off reconstruct_snapshots, so if we end up here we have no
+ * idea what happened.
+ *
+ * Do not delete unless we know that subvolumes and snapshots
+ * are consistent:
+ *
+ * XXX:
+ *
+ * We could be smarter here, and instead of using the generic
+ * recovery pass ratelimiting, track if there have been any
+ * changes to the snapshots or inodes btrees since those passes
+ * last ran.
+ */
+ ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_snapshots) ?: ret;
+ ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_subvols) ?: ret;
+
+ if (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))
+ ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret;
+
+ unsigned repair_flags = FSCK_CAN_IGNORE | (!ret ? FSCK_CAN_FIX : 0);
+
+ if (__fsck_err(trans, repair_flags, bkey_in_missing_snapshot,
+ "key in missing snapshot %s, delete?",
+ (bch2_btree_id_to_text(&buf, iter->btree_id),
+ prt_char(&buf, ' '),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_internal_snapshot_node) ?: 1;
+ }
+ }
fsck_err:
printbuf_exit(&buf);
return ret;
}
+int __bch2_get_snapshot_overwrites(struct btree_trans *trans,
+ enum btree_id btree, struct bpos pos,
+ snapshot_id_list *s)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key_reverse_norestart(trans, iter, btree, bpos_predecessor(pos),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (!bkey_eq(k.k->p, pos))
+ break;
+
+ if (!bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) ||
+ snapshot_list_has_ancestor(c, s, k.k->p.snapshot))
+ continue;
+
+ ret = snapshot_list_add(c, s, k.k->p.snapshot);
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ if (ret)
+ darray_exit(s);
+
+ return ret;
+}
+
/*
* Mark a snapshot as deleted, for future cleanup:
*/
@@ -1051,10 +1130,10 @@ int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
}
/* already deleted? */
- if (BCH_SNAPSHOT_DELETED(&s->v))
+ if (BCH_SNAPSHOT_WILL_DELETE(&s->v))
goto err;
- SET_BCH_SNAPSHOT_DELETED(&s->v, true);
+ SET_BCH_SNAPSHOT_WILL_DELETE(&s->v, true);
SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
s->v.subvol = 0;
err:
@@ -1074,24 +1153,25 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
struct btree_iter iter, p_iter = {};
struct btree_iter c_iter = {};
struct btree_iter tree_iter = {};
- struct bkey_s_c_snapshot s;
u32 parent_id, child_id;
unsigned i;
int ret = 0;
- s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
- BTREE_ITER_intent, snapshot);
- ret = bkey_err(s);
+ struct bkey_i_snapshot *s =
+ bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
+ BTREE_ITER_intent, snapshot);
+ ret = PTR_ERR_OR_ZERO(s);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
"missing snapshot %u", id);
if (ret)
goto err;
- BUG_ON(s.v->children[1]);
+ BUG_ON(BCH_SNAPSHOT_DELETED(&s->v));
+ BUG_ON(s->v.children[1]);
- parent_id = le32_to_cpu(s.v->parent);
- child_id = le32_to_cpu(s.v->children[0]);
+ parent_id = le32_to_cpu(s->v.parent);
+ child_id = le32_to_cpu(s->v.children[0]);
if (parent_id) {
struct bkey_i_snapshot *parent;
@@ -1149,24 +1229,38 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
*/
struct bkey_i_snapshot_tree *s_t;
- BUG_ON(s.v->children[1]);
+ BUG_ON(s->v.children[1]);
s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
- BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
+ BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s->v.tree)),
0, snapshot_tree);
ret = PTR_ERR_OR_ZERO(s_t);
if (ret)
goto err;
- if (s.v->children[0]) {
- s_t->v.root_snapshot = s.v->children[0];
+ if (s->v.children[0]) {
+ s_t->v.root_snapshot = s->v.children[0];
} else {
s_t->k.type = KEY_TYPE_deleted;
set_bkey_val_u64s(&s_t->k, 0);
}
}
- ret = bch2_btree_delete_at(trans, &iter, 0);
+ if (!bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)) {
+ SET_BCH_SNAPSHOT_DELETED(&s->v, true);
+ s->v.parent = 0;
+ s->v.children[0] = 0;
+ s->v.children[1] = 0;
+ s->v.subvol = 0;
+ s->v.tree = 0;
+ s->v.depth = 0;
+ s->v.skip[0] = 0;
+ s->v.skip[1] = 0;
+ s->v.skip[2] = 0;
+ } else {
+ s->k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&s->k, 0);
+ }
err:
bch2_trans_iter_exit(trans, &tree_iter);
bch2_trans_iter_exit(trans, &p_iter);
@@ -1202,7 +1296,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
goto err;
if (!k.k || !k.k->p.offset) {
- ret = -BCH_ERR_ENOSPC_snapshot_create;
+ ret = bch_err_throw(c, ENOSPC_snapshot_create);
goto err;
}
@@ -1336,18 +1430,10 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
* that key to snapshot leaf nodes, where we can mutate it
*/
-struct snapshot_interior_delete {
- u32 id;
- u32 live_child;
-};
-typedef DARRAY(struct snapshot_interior_delete) interior_delete_list;
-
static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
{
- darray_for_each(*l, i)
- if (i->id == id)
- return i->live_child;
- return 0;
+ struct snapshot_interior_delete *i = darray_find_p(*l, i, i->id == id);
+ return i ? i->live_child : 0;
}
static unsigned __live_child(struct snapshot_table *t, u32 id,
@@ -1375,28 +1461,32 @@ static unsigned __live_child(struct snapshot_table *t, u32 id,
return 0;
}
-static unsigned live_child(struct bch_fs *c, u32 id,
- snapshot_id_list *delete_leaves,
- interior_delete_list *delete_interior)
+static unsigned live_child(struct bch_fs *c, u32 id)
{
- rcu_read_lock();
- u32 ret = __live_child(rcu_dereference(c->snapshots), id,
- delete_leaves, delete_interior);
- rcu_read_unlock();
- return ret;
+ struct snapshot_delete *d = &c->snapshot_delete;
+
+ guard(rcu)();
+ return __live_child(rcu_dereference(c->snapshots), id,
+ &d->delete_leaves, &d->delete_interior);
+}
+
+static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id)
+{
+ return snapshot_list_has_id(&d->delete_leaves, id) ||
+ interior_delete_has_id(&d->delete_interior, id) != 0;
}
static int delete_dead_snapshots_process_key(struct btree_trans *trans,
struct btree_iter *iter,
- struct bkey_s_c k,
- snapshot_id_list *delete_leaves,
- interior_delete_list *delete_interior)
+ struct bkey_s_c k)
{
- if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot))
+ struct snapshot_delete *d = &trans->c->snapshot_delete;
+
+ if (snapshot_list_has_id(&d->delete_leaves, k.k->p.snapshot))
return bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_internal_snapshot_node);
- u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot);
+ u32 live_child = interior_delete_has_id(&d->delete_interior, k.k->p.snapshot);
if (live_child) {
struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
int ret = PTR_ERR_OR_ZERO(new);
@@ -1427,55 +1517,214 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans,
return 0;
}
+static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter, u64 *prev_inum)
+{
+ struct bch_fs *c = trans->c;
+ struct snapshot_delete *d = &c->snapshot_delete;
+
+ u64 inum = iter->btree_id != BTREE_ID_inodes
+ ? iter->pos.inode
+ : iter->pos.offset;
+
+ if (*prev_inum == inum)
+ return false;
+
+ *prev_inum = inum;
+
+ bool ret = !snapshot_list_has_id(&d->deleting_from_trees,
+ bch2_snapshot_tree(c, iter->pos.snapshot));
+ if (unlikely(ret)) {
+ struct bpos pos = iter->pos;
+ pos.snapshot = 0;
+ if (iter->btree_id != BTREE_ID_inodes)
+ pos.offset = U64_MAX;
+ bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(pos));
+ }
+
+ return ret;
+}
+
+static int delete_dead_snapshot_keys_v1(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct snapshot_delete *d = &c->snapshot_delete;
+
+ for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) {
+ struct disk_reservation res = { 0 };
+ u64 prev_inum = 0;
+
+ d->pos.pos = POS_MIN;
+
+ if (!btree_type_has_snapshots(d->pos.btree))
+ continue;
+
+ int ret = for_each_btree_key_commit(trans, iter,
+ d->pos.btree, POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
+ &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ d->pos.pos = iter.pos;
+
+ if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
+ continue;
+
+ delete_dead_snapshots_process_key(trans, &iter, k);
+ }));
+
+ bch2_disk_reservation_put(c, &res);
+
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int delete_dead_snapshot_keys_range(struct btree_trans *trans, enum btree_id btree,
+ struct bpos start, struct bpos end)
+{
+ struct bch_fs *c = trans->c;
+ struct snapshot_delete *d = &c->snapshot_delete;
+ struct disk_reservation res = { 0 };
+
+ d->pos.btree = btree;
+ d->pos.pos = POS_MIN;
+
+ int ret = for_each_btree_key_max_commit(trans, iter,
+ btree, start, end,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
+ &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ d->pos.pos = iter.pos;
+ delete_dead_snapshots_process_key(trans, &iter, k);
+ }));
+
+ bch2_disk_reservation_put(c, &res);
+ return ret;
+}
+
+static int delete_dead_snapshot_keys_v2(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct snapshot_delete *d = &c->snapshot_delete;
+ struct disk_reservation res = { 0 };
+ u64 prev_inum = 0;
+ int ret = 0;
+
+ struct btree_iter iter;
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
+
+ while (1) {
+ struct bkey_s_c k;
+ ret = lockrestart_do(trans,
+ bkey_err(k = bch2_btree_iter_peek(trans, &iter)));
+ if (ret)
+ break;
+
+ if (!k.k)
+ break;
+
+ d->pos.btree = iter.btree_id;
+ d->pos.pos = iter.pos;
+
+ if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
+ continue;
+
+ if (snapshot_id_dying(d, k.k->p.snapshot)) {
+ struct bpos start = POS(k.k->p.offset, 0);
+ struct bpos end = POS(k.k->p.offset, U64_MAX);
+
+ ret = delete_dead_snapshot_keys_range(trans, BTREE_ID_extents, start, end) ?:
+ delete_dead_snapshot_keys_range(trans, BTREE_ID_dirents, start, end) ?:
+ delete_dead_snapshot_keys_range(trans, BTREE_ID_xattrs, start, end);
+ if (ret)
+ break;
+
+ bch2_btree_iter_set_pos(trans, &iter, POS(0, k.k->p.offset + 1));
+ } else {
+ bch2_btree_iter_advance(trans, &iter);
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ goto err;
+
+ prev_inum = 0;
+ ret = for_each_btree_key_commit(trans, iter,
+ BTREE_ID_inodes, POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
+ &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ d->pos.btree = iter.btree_id;
+ d->pos.pos = iter.pos;
+
+ if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
+ continue;
+
+ delete_dead_snapshots_process_key(trans, &iter, k);
+ }));
+err:
+ bch2_disk_reservation_put(c, &res);
+ return ret;
+}
+
/*
* For a given snapshot, if it doesn't have a subvolume that points to it, and
* it doesn't have child snapshot nodes - it's now redundant and we can mark it
* as deleted.
*/
-static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k,
- snapshot_id_list *delete_leaves,
- interior_delete_list *delete_interior)
+static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k)
{
if (k.k->type != KEY_TYPE_snapshot)
return 0;
struct bch_fs *c = trans->c;
+ struct snapshot_delete *d = &c->snapshot_delete;
struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
unsigned live_children = 0;
+ int ret = 0;
if (BCH_SNAPSHOT_SUBVOL(s.v))
return 0;
+ if (BCH_SNAPSHOT_DELETED(s.v))
+ return 0;
+
+ mutex_lock(&d->progress_lock);
for (unsigned i = 0; i < 2; i++) {
u32 child = le32_to_cpu(s.v->children[i]);
live_children += child &&
- !snapshot_list_has_id(delete_leaves, child);
+ !snapshot_list_has_id(&d->delete_leaves, child);
}
+ u32 tree = bch2_snapshot_tree(c, s.k->p.offset);
+
if (live_children == 0) {
- return snapshot_list_add(c, delete_leaves, s.k->p.offset);
+ ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
+ snapshot_list_add(c, &d->delete_leaves, s.k->p.offset);
} else if (live_children == 1) {
- struct snapshot_interior_delete d = {
+ struct snapshot_interior_delete n = {
.id = s.k->p.offset,
- .live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior),
+ .live_child = live_child(c, s.k->p.offset),
};
- if (!d.live_child) {
- bch_err(c, "error finding live child of snapshot %u", d.id);
- return -EINVAL;
+ if (!n.live_child) {
+ bch_err(c, "error finding live child of snapshot %u", n.id);
+ ret = -EINVAL;
+ } else {
+ ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
+ darray_push(&d->delete_interior, n);
}
-
- return darray_push(delete_interior, d);
- } else {
- return 0;
}
+ mutex_unlock(&d->progress_lock);
+
+ return ret;
}
static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
interior_delete_list *skip)
{
- rcu_read_lock();
+ guard(rcu)();
while (interior_delete_has_id(skip, id))
id = __bch2_snapshot_parent(c, id);
@@ -1484,7 +1733,6 @@ static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
id = __bch2_snapshot_parent(c, id);
} while (interior_delete_has_id(skip, id));
}
- rcu_read_unlock();
return id;
}
@@ -1498,6 +1746,9 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
struct bkey_i_snapshot *s;
int ret;
+ if (!bch2_snapshot_exists(c, k.k->p.offset))
+ return 0;
+
if (k.k->type != KEY_TYPE_snapshot)
return 0;
@@ -1545,39 +1796,56 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
return bch2_trans_update(trans, iter, &s->k_i, 0);
}
-int bch2_delete_dead_snapshots(struct bch_fs *c)
+static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snapshot_delete *d)
{
- if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
+ prt_printf(out, "deleting from trees");
+ darray_for_each(d->deleting_from_trees, i)
+ prt_printf(out, " %u", *i);
+
+ prt_printf(out, "deleting leaves");
+ darray_for_each(d->delete_leaves, i)
+ prt_printf(out, " %u", *i);
+ prt_newline(out);
+
+ prt_printf(out, "interior");
+ darray_for_each(d->delete_interior, i)
+ prt_printf(out, " %u->%u", i->id, i->live_child);
+ prt_newline(out);
+}
+
+int __bch2_delete_dead_snapshots(struct bch_fs *c)
+{
+ struct snapshot_delete *d = &c->snapshot_delete;
+ int ret = 0;
+
+ if (!mutex_trylock(&d->lock))
return 0;
+ if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
+ goto out_unlock;
+
struct btree_trans *trans = bch2_trans_get(c);
- snapshot_id_list delete_leaves = {};
- interior_delete_list delete_interior = {};
- int ret = 0;
/*
* For every snapshot node: If we have no live children and it's not
* pointed to by a subvolume, delete it:
*/
+ d->running = true;
+ d->pos = BBPOS_MIN;
+
ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
- check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior));
+ check_should_delete_snapshot(trans, k));
if (!bch2_err_matches(ret, EROFS))
bch_err_msg(c, ret, "walking snapshots");
if (ret)
goto err;
- if (!delete_leaves.nr && !delete_interior.nr)
+ if (!d->delete_leaves.nr && !d->delete_interior.nr)
goto err;
{
struct printbuf buf = PRINTBUF;
- prt_printf(&buf, "deleting leaves");
- darray_for_each(delete_leaves, i)
- prt_printf(&buf, " %u", *i);
-
- prt_printf(&buf, " interior");
- darray_for_each(delete_interior, i)
- prt_printf(&buf, " %u->%u", i->id, i->live_child);
+ bch2_snapshot_delete_nodes_to_text(&buf, d);
ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf));
printbuf_exit(&buf);
@@ -1585,29 +1853,15 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
goto err;
}
- for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
- struct disk_reservation res = { 0 };
-
- if (!btree_type_has_snapshots(btree))
- continue;
-
- ret = for_each_btree_key_commit(trans, iter,
- btree, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- &res, NULL, BCH_TRANS_COMMIT_no_enospc,
- delete_dead_snapshots_process_key(trans, &iter, k,
- &delete_leaves,
- &delete_interior));
-
- bch2_disk_reservation_put(c, &res);
-
- if (!bch2_err_matches(ret, EROFS))
- bch_err_msg(c, ret, "deleting keys from dying snapshots");
- if (ret)
- goto err;
- }
+ ret = !bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)
+ ? delete_dead_snapshot_keys_v2(trans)
+ : delete_dead_snapshot_keys_v1(trans);
+ if (!bch2_err_matches(ret, EROFS))
+ bch_err_msg(c, ret, "deleting keys from dying snapshots");
+ if (ret)
+ goto err;
- darray_for_each(delete_leaves, i) {
+ darray_for_each(d->delete_leaves, i) {
ret = commit_do(trans, NULL, NULL, 0,
bch2_snapshot_node_delete(trans, *i));
if (!bch2_err_matches(ret, EROFS))
@@ -1624,11 +1878,11 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
BTREE_ITER_intent, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior));
+ bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &d->delete_interior));
if (ret)
goto err;
- darray_for_each(delete_interior, i) {
+ darray_for_each(d->delete_interior, i) {
ret = commit_do(trans, NULL, NULL, 0,
bch2_snapshot_node_delete(trans, i->id));
if (!bch2_err_matches(ret, EROFS))
@@ -1637,33 +1891,68 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
goto err;
}
err:
- darray_exit(&delete_interior);
- darray_exit(&delete_leaves);
+ mutex_lock(&d->progress_lock);
+ darray_exit(&d->deleting_from_trees);
+ darray_exit(&d->delete_interior);
+ darray_exit(&d->delete_leaves);
+ d->running = false;
+ mutex_unlock(&d->progress_lock);
bch2_trans_put(trans);
+
+ bch2_recovery_pass_set_no_ratelimit(c, BCH_RECOVERY_PASS_check_snapshots);
+out_unlock:
+ mutex_unlock(&d->lock);
if (!bch2_err_matches(ret, EROFS))
bch_err_fn(c, ret);
return ret;
}
+int bch2_delete_dead_snapshots(struct bch_fs *c)
+{
+ if (!c->opts.auto_snapshot_deletion)
+ return 0;
+
+ return __bch2_delete_dead_snapshots(c);
+}
+
void bch2_delete_dead_snapshots_work(struct work_struct *work)
{
- struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
+ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete.work);
set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);
bch2_delete_dead_snapshots(c);
- bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
}
void bch2_delete_dead_snapshots_async(struct bch_fs *c)
{
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots))
+ if (!c->opts.auto_snapshot_deletion)
+ return;
+
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_delete_dead_snapshots))
return;
BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
- if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
+ if (!queue_work(system_long_wq, &c->snapshot_delete.work))
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
+}
+
+void bch2_snapshot_delete_status_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct snapshot_delete *d = &c->snapshot_delete;
+
+ if (!d->running) {
+ prt_str(out, "(not running)");
+ return;
+ }
+
+ mutex_lock(&d->progress_lock);
+ bch2_snapshot_delete_nodes_to_text(out, d);
+
+ bch2_bbpos_to_text(out, d->pos);
+ mutex_unlock(&d->progress_lock);
}
int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
@@ -1704,7 +1993,7 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct
return 0;
struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
- if (BCH_SNAPSHOT_DELETED(snap.v) ||
+ if (BCH_SNAPSHOT_WILL_DELETE(snap.v) ||
interior_snapshot_needs_delete(snap))
set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags);
@@ -1733,10 +2022,6 @@ int bch2_snapshots_read(struct bch_fs *c)
BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
test_bit(BCH_FS_may_go_rw, &c->flags));
- if (bch2_err_matches(ret, EIO) ||
- (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
- ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);
-
return ret;
}
@@ -1744,3 +2029,11 @@ void bch2_fs_snapshots_exit(struct bch_fs *c)
{
kvfree(rcu_dereference_protected(c->snapshots, true));
}
+
+void bch2_fs_snapshots_init_early(struct bch_fs *c)
+{
+ INIT_WORK(&c->snapshot_delete.work, bch2_delete_dead_snapshots_work);
+ mutex_init(&c->snapshot_delete.lock);
+ mutex_init(&c->snapshot_delete.progress_lock);
+ mutex_init(&c->snapshots_unlinked_lock);
+}
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index 81180181d7c9..6766bf673ed9 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -46,12 +46,9 @@ static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
{
- rcu_read_lock();
+ guard(rcu)();
const struct snapshot_t *s = snapshot_t(c, id);
- id = s ? s->tree : 0;
- rcu_read_unlock();
-
- return id;
+ return s ? s->tree : 0;
}
static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
@@ -62,11 +59,8 @@ static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
{
- rcu_read_lock();
- id = __bch2_snapshot_parent_early(c, id);
- rcu_read_unlock();
-
- return id;
+ guard(rcu)();
+ return __bch2_snapshot_parent_early(c, id);
}
static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
@@ -88,61 +82,53 @@ static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
{
- rcu_read_lock();
- id = __bch2_snapshot_parent(c, id);
- rcu_read_unlock();
-
- return id;
+ guard(rcu)();
+ return __bch2_snapshot_parent(c, id);
}
static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n)
{
- rcu_read_lock();
+ guard(rcu)();
while (n--)
id = __bch2_snapshot_parent(c, id);
- rcu_read_unlock();
-
return id;
}
-u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *, u32);
+u32 bch2_snapshot_oldest_subvol(struct bch_fs *, u32, snapshot_id_list *);
u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32);
static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
{
- u32 parent;
+ guard(rcu)();
- rcu_read_lock();
+ u32 parent;
while ((parent = __bch2_snapshot_parent(c, id)))
id = parent;
- rcu_read_unlock();
-
return id;
}
-static inline bool __bch2_snapshot_exists(struct bch_fs *c, u32 id)
+static inline enum snapshot_id_state __bch2_snapshot_id_state(struct bch_fs *c, u32 id)
{
const struct snapshot_t *s = snapshot_t(c, id);
- return s ? s->live : 0;
+ return s ? s->state : SNAPSHOT_ID_empty;
}
-static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id)
+static inline enum snapshot_id_state bch2_snapshot_id_state(struct bch_fs *c, u32 id)
{
- rcu_read_lock();
- bool ret = __bch2_snapshot_exists(c, id);
- rcu_read_unlock();
+ guard(rcu)();
+ return __bch2_snapshot_id_state(c, id);
+}
- return ret;
+static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id)
+{
+ return bch2_snapshot_id_state(c, id) == SNAPSHOT_ID_live;
}
static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
{
- rcu_read_lock();
+ guard(rcu)();
const struct snapshot_t *s = snapshot_t(c, id);
- int ret = s ? s->children[0] : -BCH_ERR_invalid_snapshot_node;
- rcu_read_unlock();
-
- return ret;
+ return s ? s->children[0] : -BCH_ERR_invalid_snapshot_node;
}
static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
@@ -155,13 +141,8 @@ static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
{
- u32 depth;
-
- rcu_read_lock();
- depth = parent ? snapshot_t(c, parent)->depth + 1 : 0;
- rcu_read_unlock();
-
- return depth;
+ guard(rcu)();
+ return parent ? snapshot_t(c, parent)->depth + 1 : 0;
}
bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32);
@@ -175,20 +156,14 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances
static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
{
- rcu_read_lock();
+ guard(rcu)();
const struct snapshot_t *t = snapshot_t(c, id);
- bool ret = t && (t->children[0]|t->children[1]) != 0;
- rcu_read_unlock();
-
- return ret;
+ return t && (t->children[0]|t->children[1]) != 0;
}
static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
{
- darray_for_each(*s, i)
- if (*i == id)
- return true;
- return false;
+ return darray_find(*s, id) != NULL;
}
static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id)
@@ -241,10 +216,38 @@ int bch2_snapshot_node_create(struct btree_trans *, u32,
int bch2_check_snapshot_trees(struct bch_fs *);
int bch2_check_snapshots(struct bch_fs *);
int bch2_reconstruct_snapshots(struct bch_fs *);
-int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c);
+
+int __bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c);
+
+static inline int bch2_check_key_has_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ return likely(bch2_snapshot_exists(trans->c, k.k->p.snapshot))
+ ? 0
+ : __bch2_check_key_has_snapshot(trans, iter, k);
+}
+
+int __bch2_get_snapshot_overwrites(struct btree_trans *,
+ enum btree_id, struct bpos,
+ snapshot_id_list *);
+
+/*
+ * Get a list of snapshot IDs that have overwritten a given key:
+ */
+static inline int bch2_get_snapshot_overwrites(struct btree_trans *trans,
+ enum btree_id btree, struct bpos pos,
+ snapshot_id_list *s)
+{
+ darray_init(s);
+
+ return bch2_snapshot_has_children(trans->c, pos.snapshot)
+ ? __bch2_get_snapshot_overwrites(trans, btree, pos, s)
+ : 0;
+
+}
int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
-void bch2_delete_dead_snapshots_work(struct work_struct *);
int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos);
@@ -259,7 +262,14 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
return __bch2_key_has_snapshot_overwrites(trans, id, pos);
}
+int __bch2_delete_dead_snapshots(struct bch_fs *);
+int bch2_delete_dead_snapshots(struct bch_fs *);
+void bch2_delete_dead_snapshots_work(struct work_struct *);
+void bch2_delete_dead_snapshots_async(struct bch_fs *);
+void bch2_snapshot_delete_status_to_text(struct printbuf *, struct bch_fs *);
+
int bch2_snapshots_read(struct bch_fs *);
void bch2_fs_snapshots_exit(struct bch_fs *);
+void bch2_fs_snapshots_init_early(struct bch_fs *);
#endif /* _BCACHEFS_SNAPSHOT_H */
diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h
index aabcd3a74cd9..9bccae1f3590 100644
--- a/fs/bcachefs/snapshot_format.h
+++ b/fs/bcachefs/snapshot_format.h
@@ -15,10 +15,10 @@ struct bch_snapshot {
bch_le128 btime;
};
-LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
-
+LE32_BITMASK(BCH_SNAPSHOT_WILL_DELETE, struct bch_snapshot, flags, 0, 1)
/* True if a subvolume points to this snapshot node: */
LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
+LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 2, 3)
/*
* Snapshot trees:
diff --git a/fs/bcachefs/snapshot_types.h b/fs/bcachefs/snapshot_types.h
new file mode 100644
index 000000000000..0ab698f13e5c
--- /dev/null
+++ b/fs/bcachefs/snapshot_types.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SNAPSHOT_TYPES_H
+#define _BCACHEFS_SNAPSHOT_TYPES_H
+
+#include "bbpos_types.h"
+#include "darray.h"
+#include "subvolume_types.h"
+
+typedef DARRAY(u32) snapshot_id_list;
+
+#define IS_ANCESTOR_BITMAP 128
+
+struct snapshot_t {
+ enum snapshot_id_state {
+ SNAPSHOT_ID_empty,
+ SNAPSHOT_ID_live,
+ SNAPSHOT_ID_deleted,
+ } state;
+ u32 parent;
+ u32 skip[3];
+ u32 depth;
+ u32 children[2];
+ u32 subvol; /* Nonzero only if a subvolume points to this node: */
+ u32 tree;
+ unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)];
+};
+
+struct snapshot_table {
+ struct rcu_head rcu;
+ size_t nr;
+#ifndef RUST_BINDGEN
+ DECLARE_FLEX_ARRAY(struct snapshot_t, s);
+#else
+ struct snapshot_t s[0];
+#endif
+};
+
+struct snapshot_interior_delete {
+ u32 id;
+ u32 live_child;
+};
+typedef DARRAY(struct snapshot_interior_delete) interior_delete_list;
+
+struct snapshot_delete {
+ struct mutex lock;
+ struct work_struct work;
+
+ struct mutex progress_lock;
+ snapshot_id_list deleting_from_trees;
+ snapshot_id_list delete_leaves;
+ interior_delete_list delete_interior;
+
+ bool running;
+ struct bbpos pos;
+};
+
+#endif /* _BCACHEFS_SNAPSHOT_TYPES_H */
diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c
index a90bf7b8a2b4..71b735a85026 100644
--- a/fs/bcachefs/str_hash.c
+++ b/fs/bcachefs/str_hash.c
@@ -31,14 +31,15 @@ static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dir
}
}
-static noinline int fsck_rename_dirent(struct btree_trans *trans,
- struct snapshots_seen *s,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct bkey_s_c_dirent old)
+static int bch2_fsck_rename_dirent(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct bkey_s_c_dirent old,
+ bool *updated_before_k_pos)
{
struct qstr old_name = bch2_dirent_get_name(old);
- struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32);
+ struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64));
int ret = PTR_ERR_OR_ZERO(new);
if (ret)
return ret;
@@ -47,28 +48,39 @@ static noinline int fsck_rename_dirent(struct btree_trans *trans,
dirent_copy_target(new, old);
new->k.p = old.k->p;
+ char *renamed_buf = bch2_trans_kmalloc(trans, old_name.len + 20);
+ ret = PTR_ERR_OR_ZERO(renamed_buf);
+ if (ret)
+ return ret;
+
for (unsigned i = 0; i < 1000; i++) {
- unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u",
- old_name.len, old_name.name, i);
- unsigned u64s = BKEY_U64s + dirent_val_u64s(len, 0);
+ new->k.u64s = BKEY_U64s_MAX;
- if (u64s > U8_MAX)
- return -EINVAL;
+ struct qstr renamed_name = (struct qstr) QSTR_INIT(renamed_buf,
+ sprintf(renamed_buf, "%.*s.fsck_renamed-%u",
+ old_name.len, old_name.name, i));
- new->k.u64s = u64s;
+ ret = bch2_dirent_init_name(new, hash_info, &renamed_name, NULL);
+ if (ret)
+ return ret;
ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
(subvol_inum) { 0, old.k->p.inode },
old.k->p.snapshot, &new->k_i,
- BTREE_UPDATE_internal_snapshot_node);
- if (!bch2_err_matches(ret, EEXIST))
+ BTREE_UPDATE_internal_snapshot_node|
+ STR_HASH_must_create);
+ if (ret && !bch2_err_matches(ret, EEXIST))
+ break;
+ if (!ret) {
+ if (bpos_lt(new->k.p, old.k->p))
+ *updated_before_k_pos = true;
break;
+ }
}
- if (ret)
- return ret;
-
- return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i);
+ ret = ret ?: bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i);
+ bch_err_fn(trans->c, ret);
+ return ret;
}
static noinline int hash_pick_winner(struct btree_trans *trans,
@@ -101,17 +113,25 @@ static noinline int hash_pick_winner(struct btree_trans *trans,
}
}
-static int repair_inode_hash_info(struct btree_trans *trans,
- struct bch_inode_unpacked *snapshot_root)
+/*
+ * str_hash lookups across snapshots break in wild ways if hash_info in
+ * different snapshot versions doesn't match - so if we find one mismatch, check
+ * them all
+ */
+int bch2_repair_inode_hash_info(struct btree_trans *trans,
+ struct bch_inode_unpacked *snapshot_root)
{
+ struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+ bool need_commit = false;
int ret = 0;
- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
- SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot - 1),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != snapshot_root->bi_inum)
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes,
+ POS(0, snapshot_root->bi_inum),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (bpos_ge(k.k->p, SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot)))
break;
if (!bkey_is_inode(k.k))
continue;
@@ -121,19 +141,72 @@ static int repair_inode_hash_info(struct btree_trans *trans,
if (ret)
break;
- if (fsck_err_on(inode.bi_hash_seed != snapshot_root->bi_hash_seed ||
- INODE_STR_HASH(&inode) != INODE_STR_HASH(snapshot_root),
- trans, inode_snapshot_mismatch,
- "inode hash info in different snapshots don't match")) {
+ if (inode.bi_hash_seed == snapshot_root->bi_hash_seed &&
+ INODE_STR_HASH(&inode) == INODE_STR_HASH(snapshot_root)) {
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct bch_hash_info hash1 = bch2_hash_info_init(c, snapshot_root);
+ struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode);
+
+ BUG_ON(hash1.type != hash2.type ||
+ memcmp(&hash1.siphash_key,
+ &hash2.siphash_key,
+ sizeof(hash1.siphash_key)));
+#endif
+ continue;
+ }
+
+ printbuf_reset(&buf);
+ prt_printf(&buf, "inode %llu hash info in snapshots %u %u don't match\n",
+ snapshot_root->bi_inum,
+ inode.bi_snapshot,
+ snapshot_root->bi_snapshot);
+
+ bch2_prt_str_hash_type(&buf, INODE_STR_HASH(&inode));
+ prt_printf(&buf, " %llx\n", inode.bi_hash_seed);
+
+ bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root));
+ prt_printf(&buf, " %llx", snapshot_root->bi_hash_seed);
+
+ if (fsck_err(trans, inode_snapshot_mismatch, "%s", buf.buf)) {
inode.bi_hash_seed = snapshot_root->bi_hash_seed;
SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root));
- ret = __bch2_fsck_write_inode(trans, &inode) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_nested;
- break;
+
+ ret = __bch2_fsck_write_inode(trans, &inode);
+ if (ret)
+ break;
+ need_commit = true;
}
}
+
+ if (ret)
+ goto err;
+
+ if (!need_commit) {
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+
+ prt_printf(&buf, "inode %llu hash info mismatch with root, but mismatch not found\n",
+ snapshot_root->bi_inum);
+
+ prt_printf(&buf, "root snapshot %u ", snapshot_root->bi_snapshot);
+ bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root));
+ prt_printf(&buf, " %llx\n", snapshot_root->bi_hash_seed);
+#if 0
+ prt_printf(&buf, "vs snapshot %u ", hash_info->inum_snapshot);
+ bch2_prt_str_hash_type(&buf, hash_info->type);
+ prt_printf(&buf, " %llx %llx", hash_info->siphash_key.k0, hash_info->siphash_key.k1);
+#endif
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ ret = bch_err_throw(c, fsck_repair_unimplemented);
+ goto err;
+ }
+
+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_nested;
+err:
fsck_err:
+ printbuf_exit(&buf);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -145,46 +218,121 @@ fsck_err:
static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum,
struct bch_hash_info *hash_info)
{
+ struct bch_inode_unpacked snapshot_root;
+ int ret = bch2_inode_find_snapshot_root(trans, inum, &snapshot_root);
+ if (ret)
+ return ret;
+
+ struct bch_hash_info hash_root = bch2_hash_info_init(trans->c, &snapshot_root);
+ if (hash_info->type != hash_root.type ||
+ memcmp(&hash_info->siphash_key,
+ &hash_root.siphash_key,
+ sizeof(hash_root.siphash_key)))
+ ret = bch2_repair_inode_hash_info(trans, &snapshot_root);
+
+ return ret;
+}
+
+/* Put a str_hash key in its proper location, checking for duplicates */
+int bch2_str_hash_repair_key(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc *desc,
+ struct bch_hash_info *hash_info,
+ struct btree_iter *k_iter, struct bkey_s_c k,
+ struct btree_iter *dup_iter, struct bkey_s_c dup_k,
+ bool *updated_before_k_pos)
+{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+ bool free_snapshots_seen = false;
int ret = 0;
- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, U32_MAX),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inum)
- break;
- if (bkey_is_inode(k.k))
- goto found;
+ if (!s) {
+ s = bch2_trans_kmalloc(trans, sizeof(*s));
+ ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ goto out;
+
+ s->pos = k_iter->pos;
+ darray_init(&s->ids);
+
+ ret = bch2_get_snapshot_overwrites(trans, desc->btree_id, k_iter->pos, &s->ids);
+ if (ret)
+ goto out;
+
+ free_snapshots_seen = true;
}
- bch_err(c, "%s(): inum %llu not found", __func__, inum);
- ret = -BCH_ERR_fsck_repair_unimplemented;
- goto err;
-found:;
- struct bch_inode_unpacked inode;
- ret = bch2_inode_unpack(k, &inode);
- if (ret)
- goto err;
- struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode);
- if (hash_info->type != hash2.type ||
- memcmp(&hash_info->siphash_key, &hash2.siphash_key, sizeof(hash2.siphash_key))) {
- ret = repair_inode_hash_info(trans, &inode);
- if (!ret) {
- bch_err(c, "inode hash info mismatch with root, but mismatch not found\n"
- "%u %llx %llx\n"
- "%u %llx %llx",
- hash_info->type,
- hash_info->siphash_key.k0,
- hash_info->siphash_key.k1,
- hash2.type,
- hash2.siphash_key.k0,
- hash2.siphash_key.k1);
- ret = -BCH_ERR_fsck_repair_unimplemented;
+ if (!dup_k.k) {
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto out;
+
+ dup_k = bch2_hash_set_or_get_in_snapshot(trans, dup_iter, *desc, hash_info,
+ (subvol_inum) { 0, new->k.p.inode },
+ new->k.p.snapshot, new,
+ STR_HASH_must_create|
+ BTREE_ITER_with_updates|
+ BTREE_UPDATE_internal_snapshot_node);
+ ret = bkey_err(dup_k);
+ if (ret)
+ goto out;
+ if (dup_k.k)
+ goto duplicate_entries;
+
+ if (bpos_lt(new->k.p, k.k->p))
+ *updated_before_k_pos = true;
+
+ ret = bch2_insert_snapshot_whiteouts(trans, desc->btree_id,
+ k_iter->pos, new->k.p) ?:
+ bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
+ BTREE_ITER_with_updates|
+ BTREE_UPDATE_internal_snapshot_node) ?:
+ bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_commit;
+ } else {
+duplicate_entries:
+ ret = hash_pick_winner(trans, *desc, hash_info, k, dup_k);
+ if (ret < 0)
+ goto out;
+
+ if (!fsck_err(trans, hash_table_key_duplicate,
+ "duplicate hash table keys%s:\n%s",
+ ret != 2 ? "" : ", both point to valid inodes",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
+ prt_newline(&buf),
+ bch2_bkey_val_to_text(&buf, c, dup_k),
+ buf.buf)))
+ goto out;
+
+ switch (ret) {
+ case 0:
+ ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
+ break;
+ case 1:
+ ret = bch2_hash_delete_at(trans, *desc, hash_info, dup_iter, 0);
+ break;
+ case 2:
+ ret = bch2_fsck_rename_dirent(trans, s, *desc, hash_info,
+ bkey_s_c_to_dirent(k),
+ updated_before_k_pos) ?:
+ bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
+ BTREE_ITER_with_updates);
+ goto out;
}
+
+ ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
+ -BCH_ERR_transaction_restart_commit;
}
-err:
- bch2_trans_iter_exit(trans, &iter);
+out:
+fsck_err:
+ bch2_trans_iter_exit(trans, dup_iter);
+ printbuf_exit(&buf);
+ if (free_snapshots_seen)
+ darray_exit(&s->ids);
return ret;
}
@@ -192,7 +340,8 @@ int __bch2_str_hash_check_key(struct btree_trans *trans,
struct snapshots_seen *s,
const struct bch_hash_desc *desc,
struct bch_hash_info *hash_info,
- struct btree_iter *k_iter, struct bkey_s_c hash_k)
+ struct btree_iter *k_iter, struct bkey_s_c hash_k,
+ bool *updated_before_k_pos)
{
struct bch_fs *c = trans->c;
struct btree_iter iter = {};
@@ -206,24 +355,31 @@ int __bch2_str_hash_check_key(struct btree_trans *trans,
for_each_btree_key_norestart(trans, iter, desc->btree_id,
SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
- BTREE_ITER_slots, k, ret) {
+ BTREE_ITER_slots|
+ BTREE_ITER_with_updates, k, ret) {
if (bkey_eq(k.k->p, hash_k.k->p))
break;
if (k.k->type == desc->key_type &&
- !desc->cmp_bkey(k, hash_k))
- goto duplicate_entries;
+ !desc->cmp_bkey(k, hash_k)) {
+ ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode,
+ hash_info) ?:
+ bch2_str_hash_repair_key(trans, s, desc, hash_info,
+ k_iter, hash_k,
+ &iter, k, updated_before_k_pos);
+ break;
+ }
- if (bkey_deleted(k.k)) {
- bch2_trans_iter_exit(trans, &iter);
+ if (bkey_deleted(k.k))
goto bad_hash;
- }
}
-out:
bch2_trans_iter_exit(trans, &iter);
+out:
+fsck_err:
printbuf_exit(&buf);
return ret;
bad_hash:
+ bch2_trans_iter_exit(trans, &iter);
/*
* Before doing any repair, check hash_info itself:
*/
@@ -232,64 +388,12 @@ bad_hash:
goto out;
if (fsck_err(trans, hash_table_key_wrong_offset,
- "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
- bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k);
- if (IS_ERR(new))
- return PTR_ERR(new);
-
- k = bch2_hash_set_or_get_in_snapshot(trans, &iter, *desc, hash_info,
- (subvol_inum) { 0, hash_k.k->p.inode },
- hash_k.k->p.snapshot, new,
- STR_HASH_must_create|
- BTREE_ITER_with_updates|
- BTREE_UPDATE_internal_snapshot_node);
- ret = bkey_err(k);
- if (ret)
- goto out;
- if (k.k)
- goto duplicate_entries;
-
- ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
- BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_nested;
- goto out;
- }
-fsck_err:
- goto out;
-duplicate_entries:
- ret = hash_pick_winner(trans, *desc, hash_info, hash_k, k);
- if (ret < 0)
- goto out;
-
- if (!fsck_err(trans, hash_table_key_duplicate,
- "duplicate hash table keys%s:\n%s",
- ret != 2 ? "" : ", both point to valid inodes",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, hash_k),
- prt_newline(&buf),
- bch2_bkey_val_to_text(&buf, c, k),
- buf.buf)))
- goto out;
-
- switch (ret) {
- case 0:
- ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
- break;
- case 1:
- ret = bch2_hash_delete_at(trans, *desc, hash_info, &iter, 0);
- break;
- case 2:
- ret = fsck_rename_dirent(trans, s, *desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?:
- bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
- goto out;
- }
-
- ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
- -BCH_ERR_transaction_restart_nested;
+ "hash table key at wrong offset: should be at %llu\n%s",
+ hash,
+ (bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf)))
+ ret = bch2_str_hash_repair_key(trans, s, desc, hash_info,
+ k_iter, hash_k,
+ &iter, bkey_s_c_null,
+ updated_before_k_pos);
goto out;
}
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 0c1a00539bd1..79d51aef70aa 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -32,6 +32,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
}
struct bch_hash_info {
+ u32 inum_snapshot;
u8 type;
struct unicode_map *cf_encoding;
/*
@@ -45,11 +46,12 @@ static inline struct bch_hash_info
bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
{
struct bch_hash_info info = {
- .type = INODE_STR_HASH(bi),
+ .inum_snapshot = bi->bi_snapshot,
+ .type = INODE_STR_HASH(bi),
#ifdef CONFIG_UNICODE
- .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL,
+ .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL,
#endif
- .siphash_key = { .k0 = bi->bi_hash_seed }
+ .siphash_key = { .k0 = bi->bi_hash_seed }
};
if (unlikely(info.type == BCH_STR_HASH_siphash_old)) {
@@ -259,6 +261,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans,
struct bkey_i *insert,
enum btree_iter_update_trigger_flags flags)
{
+ struct bch_fs *c = trans->c;
struct btree_iter slot = {};
struct bkey_s_c k;
bool found = false;
@@ -286,7 +289,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans,
}
if (!ret)
- ret = -BCH_ERR_ENOSPC_str_hash_create;
+ ret = bch_err_throw(c, ENOSPC_str_hash_create);
out:
bch2_trans_iter_exit(trans, &slot);
bch2_trans_iter_exit(trans, iter);
@@ -298,7 +301,7 @@ not_found:
bch2_trans_iter_exit(trans, &slot);
return k;
} else if (!found && (flags & STR_HASH_must_replace)) {
- ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
+ ret = bch_err_throw(c, ENOENT_str_hash_set_must_replace);
} else {
if (!found && slot.path)
swap(*iter, slot);
@@ -326,7 +329,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans,
return ret;
if (k.k) {
bch2_trans_iter_exit(trans, &iter);
- return -BCH_ERR_EEXIST_str_hash_set;
+ return bch_err_throw(trans->c, EEXIST_str_hash_set);
}
return 0;
@@ -392,18 +395,30 @@ int bch2_hash_delete(struct btree_trans *trans,
return ret;
}
+int bch2_repair_inode_hash_info(struct btree_trans *, struct bch_inode_unpacked *);
+
struct snapshots_seen;
+int bch2_str_hash_repair_key(struct btree_trans *,
+ struct snapshots_seen *,
+ const struct bch_hash_desc *,
+ struct bch_hash_info *,
+ struct btree_iter *, struct bkey_s_c,
+ struct btree_iter *, struct bkey_s_c,
+ bool *);
+
int __bch2_str_hash_check_key(struct btree_trans *,
struct snapshots_seen *,
const struct bch_hash_desc *,
struct bch_hash_info *,
- struct btree_iter *, struct bkey_s_c);
+ struct btree_iter *, struct bkey_s_c,
+ bool *);
static inline int bch2_str_hash_check_key(struct btree_trans *trans,
struct snapshots_seen *s,
const struct bch_hash_desc *desc,
struct bch_hash_info *hash_info,
- struct btree_iter *k_iter, struct bkey_s_c hash_k)
+ struct btree_iter *k_iter, struct bkey_s_c hash_k,
+ bool *updated_before_k_pos)
{
if (hash_k.k->type != desc->key_type)
return 0;
@@ -411,7 +426,8 @@ static inline int bch2_str_hash_check_key(struct btree_trans *trans,
if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset))
return 0;
- return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k);
+ return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k,
+ updated_before_k_pos);
}
#endif /* _BCACHEFS_STR_HASH_H */
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index d0209f7658bb..020587449123 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -3,6 +3,7 @@
#include "bcachefs.h"
#include "btree_key_cache.h"
#include "btree_update.h"
+#include "enumerated_ref.h"
#include "errcode.h"
#include "error.h"
#include "fs.h"
@@ -14,6 +15,22 @@
static int bch2_subvolume_delete(struct btree_trans *, u32);
+static int bch2_subvolume_missing(struct bch_fs *c, u32 subvolid)
+{
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+
+ prt_printf(&buf, "missing subvolume %u", subvolid);
+ bool print = bch2_count_fsck_err(c, subvol_missing, &buf);
+
+ int ret = bch2_run_explicit_recovery_pass(c, &buf,
+ BCH_RECOVERY_PASS_check_inodes, 0);
+ if (print)
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ return ret;
+}
+
static struct bpos subvolume_children_pos(struct bkey_s_c k)
{
if (k.k->type != KEY_TYPE_subvolume)
@@ -45,7 +62,7 @@ static int check_subvol(struct btree_trans *trans,
ret = bch2_snapshot_lookup(trans, snapid, &snapshot);
if (bch2_err_matches(ret, ENOENT))
- return bch2_run_explicit_recovery_pass(c,
+ return bch2_run_print_explicit_recovery_pass(c,
BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret;
if (ret)
return ret;
@@ -113,10 +130,20 @@ static int check_subvol(struct btree_trans *trans,
"subvolume %llu points to missing subvolume root %llu:%u",
k.k->p.offset, le64_to_cpu(subvol.v->inode),
le32_to_cpu(subvol.v->snapshot))) {
- ret = bch2_subvolume_delete(trans, iter->pos.offset);
- bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
- ret = ret ?: -BCH_ERR_transaction_restart_nested;
- goto err;
+ /*
+ * Recreate - any contents that are still disconnected
+ * will then get reattached under lost+found
+ */
+ bch2_inode_init_early(c, &inode);
+ bch2_inode_init_late(c, &inode, bch2_current_time(c),
+ 0, 0, S_IFDIR|0700, 0, NULL);
+ inode.bi_inum = le64_to_cpu(subvol.v->inode);
+ inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot);
+ inode.bi_subvol = k.k->p.offset;
+ inode.bi_parent_subvol = le32_to_cpu(subvol.v->fs_path_parent);
+ ret = __bch2_fsck_write_inode(trans, &inode);
+ if (ret)
+ goto err;
}
} else {
goto err;
@@ -124,13 +151,9 @@ static int check_subvol(struct btree_trans *trans,
if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
- u32 snapshot_tree;
- struct bch_snapshot_tree st;
-
- rcu_read_lock();
- snapshot_tree = snapshot_t(c, snapshot_root)->tree;
- rcu_read_unlock();
+ u32 snapshot_tree = bch2_snapshot_tree(c, snapshot_root);
+ struct bch_snapshot_tree st;
ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
@@ -242,6 +265,13 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent));
prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent));
}
+
+ if (BCH_SUBVOLUME_RO(s.v))
+ prt_printf(out, " ro");
+ if (BCH_SUBVOLUME_SNAP(s.v))
+ prt_printf(out, " snapshot");
+ if (BCH_SUBVOLUME_UNLINKED(s.v))
+ prt_printf(out, " unlinked");
}
static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set)
@@ -292,9 +322,8 @@ bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol,
int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol),
BTREE_ITER_cached|
BTREE_ITER_with_updates, subvolume, s);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) &&
- inconsistent_if_not_found,
- trans->c, "missing subvolume %u", subvol);
+ if (bch2_err_matches(ret, ENOENT) && inconsistent_if_not_found)
+ ret = bch2_subvolume_missing(trans->c, subvol) ?: ret;
return ret;
}
@@ -344,8 +373,8 @@ int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
subvolume);
ret = bkey_err(subvol);
- bch2_fs_inconsistent_on(warn && bch2_err_matches(ret, ENOENT), trans->c,
- "missing subvolume %u", subvolid);
+ if (bch2_err_matches(ret, ENOENT))
+ ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret;
if (likely(!ret))
*snapid = le32_to_cpu(subvol.v->snapshot);
@@ -418,8 +447,8 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
BTREE_ITER_cached|BTREE_ITER_intent,
subvolume);
int ret = bkey_err(subvol);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
- "missing subvolume %u", subvolid);
+ if (bch2_err_matches(ret, ENOENT))
+ ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret;
if (ret)
goto err;
@@ -470,22 +499,23 @@ err:
static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
{
- return bch2_subvolumes_reparent(trans, subvolid) ?:
+ int ret = bch2_subvolumes_reparent(trans, subvolid) ?:
commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_subvolume_delete(trans, subvolid));
+
+ bch2_recovery_pass_set_no_ratelimit(trans->c, BCH_RECOVERY_PASS_check_subvols);
+ return ret;
}
static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs,
snapshot_wait_for_pagecache_and_delete_work);
- snapshot_id_list s;
- u32 *id;
int ret = 0;
while (!ret) {
mutex_lock(&c->snapshots_unlinked_lock);
- s = c->snapshots_unlinked;
+ snapshot_id_list s = c->snapshots_unlinked;
darray_init(&c->snapshots_unlinked);
mutex_unlock(&c->snapshots_unlinked_lock);
@@ -494,7 +524,7 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor
bch2_evict_subvolume_inodes(c, &s);
- for (id = s.data; id < s.data + s.nr; id++) {
+ darray_for_each(s, id) {
ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id));
bch_err_msg(c, ret, "deleting subvolume %u", *id);
if (ret)
@@ -504,7 +534,7 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor
darray_exit(&s);
}
- bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache);
}
struct subvolume_unlink_hook {
@@ -527,11 +557,11 @@ static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans
if (ret)
return ret;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache))
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache))
return -EROFS;
if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache);
return 0;
}
@@ -555,11 +585,10 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
BTREE_ID_subvolumes, POS(0, subvolid),
BTREE_ITER_cached, subvolume);
ret = PTR_ERR_OR_ZERO(n);
- if (unlikely(ret)) {
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
- "missing subvolume %u", subvolid);
+ if (bch2_err_matches(ret, ENOENT))
+ ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret;
+ if (unlikely(ret))
return ret;
- }
SET_BCH_SUBVOLUME_UNLINKED(&n->v, true);
n->v.fs_path_parent = 0;
@@ -584,7 +613,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
ret = bch2_bkey_get_empty_slot(trans, &dst_iter,
BTREE_ID_subvolumes, POS(0, U32_MAX));
if (ret == -BCH_ERR_ENOSPC_btree_slot)
- ret = -BCH_ERR_ENOSPC_subvolume_create;
+ ret = bch_err_throw(c, ENOSPC_subvolume_create);
if (ret)
return ret;
@@ -598,11 +627,10 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
BTREE_ID_subvolumes, POS(0, src_subvolid),
BTREE_ITER_cached, subvolume);
ret = PTR_ERR_OR_ZERO(src_subvol);
- if (unlikely(ret)) {
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "subvolume %u not found", src_subvolid);
+ if (bch2_err_matches(ret, ENOENT))
+ ret = bch2_subvolume_missing(trans->c, src_subvolid) ?: ret;
+ if (unlikely(ret))
goto err;
- }
parent = le32_to_cpu(src_subvol->v.snapshot);
}
@@ -691,8 +719,9 @@ static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
return ret;
if (!bkey_is_inode(k.k)) {
- bch_err(trans->c, "root inode not found");
- ret = -BCH_ERR_ENOENT_inode;
+ struct bch_fs *c = trans->c;
+ bch_err(c, "root inode not found");
+ ret = bch_err_throw(c, ENOENT_inode);
goto err;
}
@@ -716,11 +745,8 @@ int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
return ret;
}
-int bch2_fs_subvolumes_init(struct bch_fs *c)
+void bch2_fs_subvolumes_init_early(struct bch_fs *c)
{
- INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
bch2_subvolume_wait_for_pagecache_and_delete);
- mutex_init(&c->snapshots_unlinked_lock);
- return 0;
}
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
index f640c1e3d639..075f55e25c70 100644
--- a/fs/bcachefs/subvolume.h
+++ b/fs/bcachefs/subvolume.h
@@ -77,15 +77,12 @@ bch2_btree_iter_peek_in_subvolume_max_type(struct btree_trans *trans, struct btr
_end, _subvolid, _flags, _k, _do); \
})
-int bch2_delete_dead_snapshots(struct bch_fs *);
-void bch2_delete_dead_snapshots_async(struct bch_fs *);
-
int bch2_subvolume_unlink(struct btree_trans *, u32);
int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool);
int bch2_initialize_subvolumes(struct bch_fs *);
int bch2_fs_upgrade_for_subvolumes(struct bch_fs *);
-int bch2_fs_subvolumes_init(struct bch_fs *);
+void bch2_fs_subvolumes_init_early(struct bch_fs *);
#endif /* _BCACHEFS_SUBVOLUME_H */
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
index 1549d6daf7af..9d634b906dcd 100644
--- a/fs/bcachefs/subvolume_types.h
+++ b/fs/bcachefs/subvolume_types.h
@@ -2,33 +2,6 @@
#ifndef _BCACHEFS_SUBVOLUME_TYPES_H
#define _BCACHEFS_SUBVOLUME_TYPES_H
-#include "darray.h"
-
-typedef DARRAY(u32) snapshot_id_list;
-
-#define IS_ANCESTOR_BITMAP 128
-
-struct snapshot_t {
- bool live;
- u32 parent;
- u32 skip[3];
- u32 depth;
- u32 children[2];
- u32 subvol; /* Nonzero only if a subvolume points to this node: */
- u32 tree;
- unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)];
-};
-
-struct snapshot_table {
- struct rcu_head rcu;
- size_t nr;
-#ifndef RUST_BINDGEN
- DECLARE_FLEX_ARRAY(struct snapshot_t, s);
-#else
- struct snapshot_t s[0];
-#endif
-};
-
typedef struct {
/* we can't have padding in this struct: */
u64 subvol;
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index cb5d960aed92..6c2e1d647403 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -87,7 +87,8 @@ int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version v
struct printbuf buf = PRINTBUF;
prt_str(&buf, "requested incompat feature ");
bch2_version_to_text(&buf, version);
- prt_str(&buf, " currently not enabled");
+ prt_str(&buf, " currently not enabled, allowed up to ");
+ bch2_version_to_text(&buf, version);
prt_printf(&buf, "\n set version_upgrade=incompat to enable");
bch_notice(c, "%s", buf.buf);
@@ -260,11 +261,11 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
/* XXX: we're not checking that offline device have enough space */
- for_each_online_member(c, ca) {
+ for_each_online_member(c, ca, BCH_DEV_READ_REF_sb_field_resize) {
struct bch_sb_handle *dev_sb = &ca->disk_sb;
if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
- percpu_ref_put(&ca->io_ref[READ]);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_sb_field_resize);
return NULL;
}
}
@@ -384,7 +385,6 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
enum bch_validate_flags flags, struct printbuf *out)
{
- struct bch_sb_field_members_v1 *mi;
enum bch_opt_id opt_id;
int ret;
@@ -468,6 +468,9 @@ int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb));
}
+ if (sb->nr_devices > 1)
+ SET_BCH_SB_MULTI_DEVICE(sb, true);
+
if (!flags) {
/*
* Been seeing a bug where these are getting inexplicably
@@ -536,14 +539,17 @@ int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
}
}
+ struct bch_sb_field *mi =
+ bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v2) ?:
+ bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v1);
+
/* members must be validated first: */
- mi = bch2_sb_field_get(sb, members_v1);
if (!mi) {
prt_printf(out, "Invalid superblock: member info area missing");
return -BCH_ERR_invalid_sb_members_missing;
}
- ret = bch2_sb_field_validate(sb, &mi->field, flags, out);
+ ret = bch2_sb_field_validate(sb, mi, flags, out);
if (ret)
return ret;
@@ -612,11 +618,15 @@ static void bch2_sb_update(struct bch_fs *c)
c->sb.features = le64_to_cpu(src->features[0]);
c->sb.compat = le64_to_cpu(src->compat[0]);
+ c->sb.multi_device = BCH_SB_MULTI_DEVICE(src);
memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent));
struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext);
if (ext) {
+ c->sb.recovery_passes_required =
+ bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+
le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
sizeof(c->sb.errors_silent) * 8);
c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data);
@@ -961,7 +971,7 @@ static void write_super_endio(struct bio *bio)
}
closure_put(&ca->fs->sb_write);
- percpu_ref_put(&ca->io_ref[READ]);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
}
static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
@@ -979,7 +989,7 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio));
- percpu_ref_get(&ca->io_ref[READ]);
+ enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
closure_bio_submit(bio, &c->sb_write);
}
@@ -1005,7 +1015,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb],
bio_sectors(bio));
- percpu_ref_get(&ca->io_ref[READ]);
+ enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
closure_bio_submit(bio, &c->sb_write);
}
@@ -1022,7 +1032,7 @@ int bch2_write_super(struct bch_fs *c)
trace_and_count(c, write_super, c, _RET_IP_);
- if (c->opts.very_degraded)
+ if (c->opts.degraded == BCH_DEGRADED_very)
degraded_flags |= BCH_FORCE_IF_LOST;
lockdep_assert_held(&c->sb_lock);
@@ -1037,13 +1047,13 @@ int bch2_write_super(struct bch_fs *c)
* For now, we expect to be able to call write_super() when we're not
* yet RW:
*/
- for_each_online_member(c, ca) {
+ for_each_online_member(c, ca, BCH_DEV_READ_REF_write_super) {
ret = darray_push(&online_devices, ca);
if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) {
- percpu_ref_put(&ca->io_ref[READ]);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
goto out;
}
- percpu_ref_get(&ca->io_ref[READ]);
+ enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
}
/* Make sure we're using the new magic numbers: */
@@ -1102,7 +1112,7 @@ int bch2_write_super(struct bch_fs *c)
prt_str(&buf, ")");
bch2_fs_fatal_error(c, ": %s", buf.buf);
printbuf_exit(&buf);
- ret = -BCH_ERR_sb_not_downgraded;
+ ret = bch_err_throw(c, sb_not_downgraded);
goto out;
}
@@ -1132,7 +1142,7 @@ int bch2_write_super(struct bch_fs *c)
if (c->opts.errors != BCH_ON_ERROR_continue &&
c->opts.errors != BCH_ON_ERROR_fix_safe) {
- ret = -BCH_ERR_erofs_sb_err;
+ ret = bch_err_throw(c, erofs_sb_err);
bch2_fs_fatal_error(c, "%s", buf.buf);
} else {
bch_err(c, "%s", buf.buf);
@@ -1151,7 +1161,7 @@ int bch2_write_super(struct bch_fs *c)
ca->disk_sb.seq);
bch2_fs_fatal_error(c, "%s", buf.buf);
printbuf_exit(&buf);
- ret = -BCH_ERR_erofs_sb_err;
+ ret = bch_err_throw(c, erofs_sb_err);
}
}
@@ -1205,12 +1215,12 @@ int bch2_write_super(struct bch_fs *c)
!can_mount_with_written), c,
": Unable to write superblock to sufficient devices (from %ps)",
(void *) _RET_IP_))
- ret = -BCH_ERR_erofs_sb_err;
+ ret = bch_err_throw(c, erofs_sb_err);
out:
/* Make new options visible after they're persistent: */
bch2_sb_update(c);
darray_for_each(online_devices, ca)
- percpu_ref_put(&(*ca)->io_ref[READ]);
+ enumerated_ref_put(&(*ca)->io_ref[READ], BCH_DEV_READ_REF_write_super);
darray_exit(&online_devices);
printbuf_exit(&err);
return ret;
@@ -1270,6 +1280,31 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat)
}
}
+void bch2_sb_upgrade_incompat(struct bch_fs *c)
+{
+ mutex_lock(&c->sb_lock);
+ if (c->sb.version == c->sb.version_incompat_allowed)
+ goto unlock;
+
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "Now allowing incompatible features up to ");
+ bch2_version_to_text(&buf, c->sb.version);
+ prt_str(&buf, ", previously allowed up to ");
+ bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
+ prt_newline(&buf);
+
+ bch_notice(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+
+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
+ SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb,
+ max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), c->sb.version));
+ bch2_write_super(c);
+unlock:
+ mutex_unlock(&c->sb_lock);
+}
+
static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f,
enum bch_validate_flags flags, struct printbuf *err)
{
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index 78f708a6fbcd..a3b7a90f2533 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -107,6 +107,7 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
bool bch2_check_version_downgrade(struct bch_fs *);
void bch2_sb_upgrade(struct bch_fs *, unsigned, bool);
+void bch2_sb_upgrade_incompat(struct bch_fs *);
void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
struct bch_sb_field *);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 84a37d971ffd..397a69da5a75 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -10,6 +10,8 @@
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "async_objs.h"
+#include "backpointers.h"
#include "bkey_sort.h"
#include "btree_cache.h"
#include "btree_gc.h"
@@ -28,6 +30,7 @@
#include "disk_accounting.h"
#include "disk_groups.h"
#include "ec.h"
+#include "enumerated_ref.h"
#include "errcode.h"
#include "error.h"
#include "fs.h"
@@ -48,6 +51,7 @@
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
+#include "recovery_passes.h"
#include "replicas.h"
#include "sb-clean.h"
#include "sb-counters.h"
@@ -75,14 +79,32 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
MODULE_DESCRIPTION("bcachefs filesystem");
-const char * const bch2_fs_flag_strs[] = {
+typedef DARRAY(struct bch_sb_handle) bch_sb_handles;
+
#define x(n) #n,
+const char * const bch2_fs_flag_strs[] = {
BCH_FS_FLAGS()
-#undef x
NULL
};
-void bch2_print_str(struct bch_fs *c, const char *str)
+const char * const bch2_write_refs[] = {
+ BCH_WRITE_REFS()
+ NULL
+};
+
+const char * const bch2_dev_read_refs[] = {
+ BCH_DEV_READ_REFS()
+ NULL
+};
+
+const char * const bch2_dev_write_refs[] = {
+ BCH_DEV_WRITE_REFS()
+ NULL
+};
+#undef x
+
+static void __bch2_print_str(struct bch_fs *c, const char *prefix,
+ const char *str, bool nonblocking)
{
#ifdef __KERNEL__
struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
@@ -92,7 +114,17 @@ void bch2_print_str(struct bch_fs *c, const char *str)
return;
}
#endif
- bch2_print_string_as_lines(KERN_ERR, str);
+ bch2_print_string_as_lines(KERN_ERR, str, nonblocking);
+}
+
+void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str)
+{
+ __bch2_print_str(c, prefix, str, false);
+}
+
+void bch2_print_str_nonblocking(struct bch_fs *c, const char *prefix, const char *str)
+{
+ __bch2_print_str(c, prefix, str, true);
}
__printf(2, 0)
@@ -183,26 +215,21 @@ static int bch2_dev_alloc(struct bch_fs *, unsigned);
static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
static void bch2_dev_io_ref_stop(struct bch_dev *, int);
static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
+static int bch2_fs_init_rw(struct bch_fs *);
struct bch_fs *bch2_dev_to_fs(dev_t dev)
{
- struct bch_fs *c;
-
- mutex_lock(&bch_fs_list_lock);
- rcu_read_lock();
+ guard(mutex)(&bch_fs_list_lock);
+ guard(rcu)();
+ struct bch_fs *c;
list_for_each_entry(c, &bch_fs_list, list)
for_each_member_device_rcu(c, ca, NULL)
if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) {
closure_get(&c->cl);
- goto found;
+ return c;
}
- c = NULL;
-found:
- rcu_read_unlock();
- mutex_unlock(&bch_fs_list_lock);
-
- return c;
+ return NULL;
}
static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)
@@ -297,15 +324,13 @@ static void __bch2_fs_read_only(struct bch_fs *c)
}
}
-#ifndef BCH_WRITE_REF_DEBUG
-static void bch2_writes_disabled(struct percpu_ref *writes)
+static void bch2_writes_disabled(struct enumerated_ref *writes)
{
struct bch_fs *c = container_of(writes, struct bch_fs, writes);
set_bit(BCH_FS_write_disable_complete, &c->flags);
wake_up(&bch2_read_only_wait);
}
-#endif
void bch2_fs_read_only(struct bch_fs *c)
{
@@ -323,12 +348,7 @@ void bch2_fs_read_only(struct bch_fs *c)
* writes will return -EROFS:
*/
set_bit(BCH_FS_going_ro, &c->flags);
-#ifndef BCH_WRITE_REF_DEBUG
- percpu_ref_kill(&c->writes);
-#else
- for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
- bch2_write_ref_put(c, i);
-#endif
+ enumerated_ref_stop_async(&c->writes);
/*
* If we're not doing an emergency shutdown, we want to wait on
@@ -366,7 +386,7 @@ void bch2_fs_read_only(struct bch_fs *c)
!test_bit(BCH_FS_emergency_ro, &c->flags) &&
test_bit(BCH_FS_started, &c->flags) &&
test_bit(BCH_FS_clean_shutdown, &c->flags) &&
- c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) {
+ c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) {
BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty));
BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
@@ -412,6 +432,30 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c)
return ret;
}
+static bool __bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out,
+ bool locked)
+{
+ bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
+
+ if (!locked)
+ bch2_journal_halt(&c->journal);
+ else
+ bch2_journal_halt_locked(&c->journal);
+ bch2_fs_read_only_async(c);
+ wake_up(&bch2_read_only_wait);
+
+ if (ret)
+ prt_printf(out, "emergency read only at seq %llu\n",
+ journal_cur_seq(&c->journal));
+
+ return ret;
+}
+
+bool bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out)
+{
+ return __bch2_fs_emergency_read_only2(c, out, false);
+}
+
bool bch2_fs_emergency_read_only_locked(struct bch_fs *c)
{
bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
@@ -429,9 +473,17 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
+ if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))
+ return bch_err_throw(c, erofs_no_alloc_info);
+
if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) {
bch_err(c, "cannot go rw, unfixed btree errors");
- return -BCH_ERR_erofs_unfixed_errors;
+ return bch_err_throw(c, erofs_unfixed_errors);
+ }
+
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
+ bch_err(c, "cannot go rw, filesystem is an unresized image file");
+ return bch_err_throw(c, erofs_filesystem_full);
}
if (test_bit(BCH_FS_rw, &c->flags))
@@ -439,16 +491,23 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch_info(c, "going read-write");
+ ret = bch2_fs_init_rw(c);
+ if (ret)
+ goto err;
+
ret = bch2_sb_members_v2_init(c);
if (ret)
goto err;
clear_bit(BCH_FS_clean_shutdown, &c->flags);
- __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) {
- bch2_dev_allocator_add(c, ca);
- percpu_ref_reinit(&ca->io_ref[WRITE]);
- }
+ scoped_guard(rcu)
+ for_each_online_member_rcu(c, ca)
+ if (ca->mi.state == BCH_MEMBER_STATE_rw) {
+ bch2_dev_allocator_add(c, ca);
+ enumerated_ref_start(&ca->io_ref[WRITE]);
+ }
+
bch2_recalc_capacity(c);
/*
@@ -474,14 +533,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
set_bit(BCH_FS_rw, &c->flags);
set_bit(BCH_FS_was_rw, &c->flags);
-#ifndef BCH_WRITE_REF_DEBUG
- percpu_ref_reinit(&c->writes);
-#else
- for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) {
- BUG_ON(atomic_long_read(&c->writes[i]));
- atomic_long_inc(&c->writes[i]);
- }
-#endif
+ enumerated_ref_start(&c->writes);
ret = bch2_copygc_start(c);
if (ret) {
@@ -512,10 +564,13 @@ int bch2_fs_read_write(struct bch_fs *c)
{
if (c->opts.recovery_pass_last &&
c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay)
- return -BCH_ERR_erofs_norecovery;
+ return bch_err_throw(c, erofs_norecovery);
if (c->opts.nochanges)
- return -BCH_ERR_erofs_nochanges;
+ return bch_err_throw(c, erofs_nochanges);
+
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))
+ return bch_err_throw(c, erofs_no_alloc_info);
return __bch2_fs_read_write(c, false);
}
@@ -543,35 +598,37 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
bch2_free_pending_node_rewrites(c);
bch2_free_fsck_errs(c);
- bch2_fs_accounting_exit(c);
- bch2_fs_sb_errors_exit(c);
- bch2_fs_counters_exit(c);
+ bch2_fs_vfs_exit(c);
bch2_fs_snapshots_exit(c);
+ bch2_fs_sb_errors_exit(c);
+ bch2_fs_replicas_exit(c);
+ bch2_fs_rebalance_exit(c);
bch2_fs_quota_exit(c);
+ bch2_fs_nocow_locking_exit(c);
+ bch2_fs_journal_exit(&c->journal);
bch2_fs_fs_io_direct_exit(c);
bch2_fs_fs_io_buffered_exit(c);
bch2_fs_fsio_exit(c);
- bch2_fs_vfs_exit(c);
- bch2_fs_ec_exit(c);
- bch2_fs_encryption_exit(c);
- bch2_fs_nocow_locking_exit(c);
bch2_fs_io_write_exit(c);
bch2_fs_io_read_exit(c);
+ bch2_fs_encryption_exit(c);
+ bch2_fs_ec_exit(c);
+ bch2_fs_counters_exit(c);
+ bch2_fs_compress_exit(c);
+ bch2_io_clock_exit(&c->io_clock[WRITE]);
+ bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_buckets_waiting_for_journal_exit(c);
- bch2_fs_btree_interior_update_exit(c);
+ bch2_fs_btree_write_buffer_exit(c);
bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
- bch2_fs_btree_cache_exit(c);
bch2_fs_btree_iter_exit(c);
- bch2_fs_replicas_exit(c);
- bch2_fs_journal_exit(&c->journal);
- bch2_io_clock_exit(&c->io_clock[WRITE]);
- bch2_io_clock_exit(&c->io_clock[READ]);
- bch2_fs_compress_exit(c);
- bch2_fs_btree_gc_exit(c);
+ bch2_fs_btree_interior_update_exit(c);
+ bch2_fs_btree_cache_exit(c);
+ bch2_fs_accounting_exit(c);
+ bch2_fs_async_obj_exit(c);
bch2_journal_keys_put_initial(c);
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
+
BUG_ON(atomic_read(&c->journal_keys.ref));
- bch2_fs_btree_write_buffer_exit(c);
percpu_free_rwsem(&c->mark_lock);
if (c->online_reserved) {
u64 v = percpu_u64_get(c->online_reserved);
@@ -587,9 +644,7 @@ static void __bch2_fs_free(struct bch_fs *c)
mempool_exit(&c->btree_bounce_pool);
bioset_exit(&c->btree_bio);
mempool_exit(&c->fill_iter);
-#ifndef BCH_WRITE_REF_DEBUG
- percpu_ref_exit(&c->writes);
-#endif
+ enumerated_ref_exit(&c->writes);
kfree(rcu_dereference_protected(c->disk_groups, 1));
kfree(c->journal_seq_blacklist_table);
@@ -601,8 +656,8 @@ static void __bch2_fs_free(struct bch_fs *c)
destroy_workqueue(c->btree_read_complete_wq);
if (c->copygc_wq)
destroy_workqueue(c->copygc_wq);
- if (c->btree_io_complete_wq)
- destroy_workqueue(c->btree_io_complete_wq);
+ if (c->btree_write_complete_wq)
+ destroy_workqueue(c->btree_write_complete_wq);
if (c->btree_update_wq)
destroy_workqueue(c->btree_update_wq);
@@ -628,6 +683,12 @@ void __bch2_fs_stop(struct bch_fs *c)
bch2_fs_read_only(c);
up_write(&c->state_lock);
+ for (unsigned i = 0; i < c->sb.nr_devices; i++) {
+ struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
+ if (ca)
+ bch2_dev_io_ref_stop(ca, READ);
+ }
+
for_each_member_device(c, ca)
bch2_dev_unlink(ca);
@@ -656,8 +717,6 @@ void __bch2_fs_stop(struct bch_fs *c)
void bch2_fs_free(struct bch_fs *c)
{
- unsigned i;
-
mutex_lock(&bch_fs_list_lock);
list_del(&c->list);
mutex_unlock(&bch_fs_list_lock);
@@ -665,7 +724,7 @@ void bch2_fs_free(struct bch_fs *c)
closure_sync(&c->cl);
closure_debug_destroy(&c->cl);
- for (i = 0; i < c->sb.nr_devices; i++) {
+ for (unsigned i = 0; i < c->sb.nr_devices; i++) {
struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
if (ca) {
@@ -693,9 +752,10 @@ static int bch2_fs_online(struct bch_fs *c)
lockdep_assert_held(&bch_fs_list_lock);
- if (__bch2_uuid_to_fs(c->sb.uuid)) {
+ if (c->sb.multi_device &&
+ __bch2_uuid_to_fs(c->sb.uuid)) {
bch_err(c, "filesystem UUID already open");
- return -EINVAL;
+ return bch_err_throw(c, filesystem_uuid_already_open);
}
ret = bch2_fs_chardev_init(c);
@@ -706,7 +766,9 @@ static int bch2_fs_online(struct bch_fs *c)
bch2_fs_debug_init(c);
- ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?:
+ ret = (c->sb.multi_device
+ ? kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b)
+ : kobject_add(&c->kobj, NULL, "%s", c->name)) ?:
kobject_add(&c->internal, &c->kobj, "internal") ?:
kobject_add(&c->opts_dir, &c->kobj, "options") ?:
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
@@ -737,7 +799,37 @@ err:
return ret;
}
-static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
+static int bch2_fs_init_rw(struct bch_fs *c)
+{
+ if (test_bit(BCH_FS_rw_init_done, &c->flags))
+ return 0;
+
+ if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
+ !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write_complete",
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
+ !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
+ !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit",
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
+ !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
+ WQ_FREEZABLE, 0)))
+ return bch_err_throw(c, ENOMEM_fs_other_alloc);
+
+ int ret = bch2_fs_btree_interior_update_init(c) ?:
+ bch2_fs_btree_write_buffer_init(c) ?:
+ bch2_fs_fs_io_buffered_init(c) ?:
+ bch2_fs_io_write_init(c) ?:
+ bch2_fs_journal_init(&c->journal);
+ if (ret)
+ return ret;
+
+ set_bit(BCH_FS_rw_init_done, &c->flags);
+ return 0;
+}
+
+static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
+ bch_sb_handles *sbs)
{
struct bch_fs *c;
struct printbuf name = PRINTBUF;
@@ -750,7 +842,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
goto out;
}
- c->stdio = (void *)(unsigned long) opts.stdio;
+ c->stdio = (void *)(unsigned long) opts->stdio;
__module_get(THIS_MODULE);
@@ -774,24 +866,29 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
refcount_set(&c->ro_ref, 1);
init_waitqueue_head(&c->ro_ref_wait);
- spin_lock_init(&c->recovery_pass_lock);
- sema_init(&c->online_fsck_mutex, 1);
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_init(&c->times[i]);
- bch2_fs_copygc_init(c);
- bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
- bch2_fs_btree_iter_init_early(c);
- bch2_fs_btree_interior_update_init_early(c);
- bch2_fs_journal_keys_init(c);
bch2_fs_allocator_background_init(c);
bch2_fs_allocator_foreground_init(c);
- bch2_fs_rebalance_init(c);
- bch2_fs_quota_init(c);
+ bch2_fs_btree_cache_init_early(&c->btree_cache);
+ bch2_fs_btree_gc_init_early(c);
+ bch2_fs_btree_interior_update_init_early(c);
+ bch2_fs_btree_iter_init_early(c);
+ bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
+ bch2_fs_btree_write_buffer_init_early(c);
+ bch2_fs_copygc_init(c);
bch2_fs_ec_init_early(c);
+ bch2_fs_journal_init_early(&c->journal);
+ bch2_fs_journal_keys_init(c);
bch2_fs_move_init(c);
+ bch2_fs_nocow_locking_init_early(c);
+ bch2_fs_quota_init(c);
+ bch2_fs_recovery_passes_init(c);
bch2_fs_sb_errors_init_early(c);
+ bch2_fs_snapshots_init_early(c);
+ bch2_fs_subvolumes_init_early(c);
INIT_LIST_HEAD(&c->list);
@@ -817,8 +914,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];
c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
- bch2_fs_btree_cache_init_early(&c->btree_cache);
-
mutex_init(&c->sectors_available_lock);
ret = percpu_init_rwsem(&c->mark_lock);
@@ -832,14 +927,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
if (ret)
goto err;
- pr_uuid(&name, c->sb.user_uuid.b);
- ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
- if (ret)
- goto err;
-
- strscpy(c->name, name.buf, sizeof(c->name));
- printbuf_exit(&name);
-
/* Compat: */
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
!BCH_SB_JOURNAL_FLUSH_DELAY(sb))
@@ -854,7 +941,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
if (ret)
goto err;
- bch2_opts_apply(&c->opts, opts);
+ bch2_opts_apply(&c->opts, *opts);
+
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ c->opts.block_size > PAGE_SIZE) {
+ bch_err(c, "cannot mount bs > ps filesystem without CONFIG_TRANSPARENT_HUGEPAGE");
+ ret = -EINVAL;
+ goto err;
+ }
c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
if (c->opts.inodes_use_key_cache)
@@ -870,26 +964,26 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
goto err;
}
+ if (c->sb.multi_device)
+ pr_uuid(&name, c->sb.user_uuid.b);
+ else
+ prt_bdevname(&name, sbs->data[0].bdev);
+
+ ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
+ if (ret)
+ goto err;
+
+ strscpy(c->name, name.buf, sizeof(c->name));
+ printbuf_exit(&name);
+
iter_size = sizeof(struct sort_iter) +
(btree_blocks(c) + 1) * 2 *
sizeof(struct sort_iter_set);
- if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
- !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
- !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
- !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete",
+ if (!(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete",
WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||
- !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
- !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
- WQ_FREEZABLE, 0)) ||
-#ifndef BCH_WRITE_REF_DEBUG
- percpu_ref_init(&c->writes, bch2_writes_disabled,
- PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
-#endif
+ enumerated_ref_init(&c->writes, BCH_WRITE_REF_NR,
+ bch2_writes_disabled) ||
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
bioset_init(&c->btree_bio, 1,
max(offsetof(struct btree_read_bio, bio),
@@ -901,33 +995,28 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
c->opts.btree_node_size) ||
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) {
- ret = -BCH_ERR_ENOMEM_fs_other_alloc;
+ ret = bch_err_throw(c, ENOMEM_fs_other_alloc);
goto err;
}
- ret = bch2_fs_counters_init(c) ?:
- bch2_fs_sb_errors_init(c) ?:
- bch2_io_clock_init(&c->io_clock[READ]) ?:
- bch2_io_clock_init(&c->io_clock[WRITE]) ?:
- bch2_fs_journal_init(&c->journal) ?:
- bch2_fs_btree_iter_init(c) ?:
+ ret =
+ bch2_fs_async_obj_init(c) ?:
bch2_fs_btree_cache_init(c) ?:
+ bch2_fs_btree_iter_init(c) ?:
bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
- bch2_fs_btree_interior_update_init(c) ?:
- bch2_fs_btree_gc_init(c) ?:
bch2_fs_buckets_waiting_for_journal_init(c) ?:
- bch2_fs_btree_write_buffer_init(c) ?:
- bch2_fs_subvolumes_init(c) ?:
- bch2_fs_io_read_init(c) ?:
- bch2_fs_io_write_init(c) ?:
- bch2_fs_nocow_locking_init(c) ?:
- bch2_fs_encryption_init(c) ?:
+ bch2_io_clock_init(&c->io_clock[READ]) ?:
+ bch2_io_clock_init(&c->io_clock[WRITE]) ?:
bch2_fs_compress_init(c) ?:
+ bch2_fs_counters_init(c) ?:
bch2_fs_ec_init(c) ?:
- bch2_fs_vfs_init(c) ?:
+ bch2_fs_encryption_init(c) ?:
bch2_fs_fsio_init(c) ?:
- bch2_fs_fs_io_buffered_init(c) ?:
- bch2_fs_fs_io_direct_init(c);
+ bch2_fs_fs_io_direct_init(c) ?:
+ bch2_fs_io_read_init(c) ?:
+ bch2_fs_rebalance_init(c) ?:
+ bch2_fs_sb_errors_init(c) ?:
+ bch2_fs_vfs_init(c);
if (ret)
goto err;
@@ -942,10 +1031,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
ret = -EINVAL;
goto err;
}
- bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u",
- unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
- unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
- unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
#else
if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) {
printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n");
@@ -1013,6 +1098,11 @@ static void print_mount_opts(struct bch_fs *c)
bch2_version_to_text(&p, c->sb.version_incompat_allowed);
}
+ if (c->opts.verbose) {
+ prt_printf(&p, "\n features: ");
+ prt_bitflags(&p, bch2_sb_features, c->sb.features);
+ }
+
bch_info(c, "%s", p.buf);
printbuf_exit(&p);
}
@@ -1020,19 +1110,18 @@ static void print_mount_opts(struct bch_fs *c)
static bool bch2_fs_may_start(struct bch_fs *c)
{
struct bch_dev *ca;
- unsigned i, flags = 0;
+ unsigned flags = 0;
- if (c->opts.very_degraded)
+ switch (c->opts.degraded) {
+ case BCH_DEGRADED_very:
flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
-
- if (c->opts.degraded)
+ break;
+ case BCH_DEGRADED_yes:
flags |= BCH_FORCE_IF_DEGRADED;
-
- if (!c->opts.degraded &&
- !c->opts.very_degraded) {
+ break;
+ default:
mutex_lock(&c->sb_lock);
-
- for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
+ for (unsigned i = 0; i < c->disk_sb.sb->nr_devices; i++) {
if (!bch2_member_exists(c->disk_sb.sb, i))
continue;
@@ -1046,9 +1135,10 @@ static bool bch2_fs_may_start(struct bch_fs *c)
}
}
mutex_unlock(&c->sb_lock);
+ break;
}
- return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true);
+ return bch2_have_enough_devs(c, c->online_devs, flags, true);
}
int bch2_fs_start(struct bch_fs *c)
@@ -1058,8 +1148,15 @@ int bch2_fs_start(struct bch_fs *c)
print_mount_opts(c);
+#ifdef CONFIG_UNICODE
+ bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u",
+ unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
+ unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
+ unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
+#endif
+
if (!bch2_fs_may_start(c))
- return -BCH_ERR_insufficient_devices_to_start;
+ return bch_err_throw(c, insufficient_devices_to_start);
down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
@@ -1070,7 +1167,7 @@ int bch2_fs_start(struct bch_fs *c)
sizeof(struct bch_sb_field_ext) / sizeof(u64))) {
mutex_unlock(&c->sb_lock);
up_write(&c->state_lock);
- ret = -BCH_ERR_ENOSPC_sb;
+ ret = bch_err_throw(c, ENOSPC_sb);
goto err;
}
@@ -1081,13 +1178,20 @@ int bch2_fs_start(struct bch_fs *c)
goto err;
}
- for_each_online_member(c, ca)
- bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now);
+ scoped_guard(rcu)
+ for_each_online_member_rcu(c, ca)
+ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
+ cpu_to_le64(now);
+ /*
+ * Dno't write superblock yet: recovery might have to downgrade
+ */
mutex_unlock(&c->sb_lock);
- for_each_rw_member(c, ca)
- bch2_dev_allocator_add(c, ca);
+ scoped_guard(rcu)
+ for_each_online_member_rcu(c, ca)
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
+ bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
up_write(&c->state_lock);
@@ -1100,12 +1204,12 @@ int bch2_fs_start(struct bch_fs *c)
if (ret)
goto err;
- ret = bch2_opts_check_may_set(c);
+ ret = bch2_opts_hooks_pre_set(c);
if (ret)
goto err;
if (bch2_fs_init_fault("fs_start")) {
- ret = -BCH_ERR_injected_fs_start;
+ ret = bch_err_throw(c, injected_fs_start);
goto err;
}
@@ -1132,11 +1236,11 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
if (le16_to_cpu(sb->block_size) != block_sectors(c))
- return -BCH_ERR_mismatched_block_size;
+ return bch_err_throw(c, mismatched_block_size);
if (le16_to_cpu(m.bucket_size) <
BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
- return -BCH_ERR_bucket_size_too_small;
+ return bch_err_throw(c, bucket_size_too_small);
return 0;
}
@@ -1234,11 +1338,14 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw)
{
- if (!percpu_ref_is_zero(&ca->io_ref[rw])) {
- reinit_completion(&ca->io_ref_completion[rw]);
- percpu_ref_kill(&ca->io_ref[rw]);
- wait_for_completion(&ca->io_ref_completion[rw]);
- }
+ if (rw == READ)
+ clear_bit(ca->dev_idx, ca->fs->online_devs.d);
+
+ if (!enumerated_ref_is_zero(&ca->io_ref[rw]))
+ enumerated_ref_stop(&ca->io_ref[rw],
+ rw == READ
+ ? bch2_dev_read_refs
+ : bch2_dev_write_refs);
}
static void bch2_dev_release(struct kobject *kobj)
@@ -1250,8 +1357,8 @@ static void bch2_dev_release(struct kobject *kobj)
static void bch2_dev_free(struct bch_dev *ca)
{
- WARN_ON(!percpu_ref_is_zero(&ca->io_ref[WRITE]));
- WARN_ON(!percpu_ref_is_zero(&ca->io_ref[READ]));
+ WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE]));
+ WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[READ]));
cancel_work_sync(&ca->io_error_work);
@@ -1260,6 +1367,9 @@ static void bch2_dev_free(struct bch_dev *ca)
if (ca->kobj.state_in_sysfs)
kobject_del(&ca->kobj);
+ bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch);
+ bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty);
+
bch2_free_super(&ca->disk_sb);
bch2_dev_allocator_background_exit(ca);
bch2_dev_journal_exit(ca);
@@ -1271,8 +1381,8 @@ static void bch2_dev_free(struct bch_dev *ca)
bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);
- percpu_ref_exit(&ca->io_ref[WRITE]);
- percpu_ref_exit(&ca->io_ref[READ]);
+ enumerated_ref_exit(&ca->io_ref[WRITE]);
+ enumerated_ref_exit(&ca->io_ref[READ]);
#ifndef CONFIG_BCACHEFS_DEBUG
percpu_ref_exit(&ca->ref);
#endif
@@ -1284,7 +1394,7 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
lockdep_assert_held(&c->state_lock);
- if (percpu_ref_is_zero(&ca->io_ref[READ]))
+ if (enumerated_ref_is_zero(&ca->io_ref[READ]))
return;
__bch2_dev_read_only(c, ca);
@@ -1306,20 +1416,6 @@ static void bch2_dev_ref_complete(struct percpu_ref *ref)
}
#endif
-static void bch2_dev_io_ref_read_complete(struct percpu_ref *ref)
-{
- struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref[READ]);
-
- complete(&ca->io_ref_completion[READ]);
-}
-
-static void bch2_dev_io_ref_write_complete(struct percpu_ref *ref)
-{
- struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref[WRITE]);
-
- complete(&ca->io_ref_completion[WRITE]);
-}
-
static void bch2_dev_unlink(struct bch_dev *ca)
{
struct kobject *b;
@@ -1381,8 +1477,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
kobject_init(&ca->kobj, &bch2_dev_ktype);
init_completion(&ca->ref_completion);
- init_completion(&ca->io_ref_completion[READ]);
- init_completion(&ca->io_ref_completion[WRITE]);
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
@@ -1406,12 +1500,13 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
atomic_long_set(&ca->ref, 1);
#endif
+ mutex_init(&ca->bucket_backpointer_mismatch.lock);
+ mutex_init(&ca->bucket_backpointer_empty.lock);
+
bch2_dev_allocator_background_init(ca);
- if (percpu_ref_init(&ca->io_ref[READ], bch2_dev_io_ref_read_complete,
- PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
- percpu_ref_init(&ca->io_ref[WRITE], bch2_dev_io_ref_write_complete,
- PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
+ if (enumerated_ref_init(&ca->io_ref[READ], BCH_DEV_READ_REF_NR, NULL) ||
+ enumerated_ref_init(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_NR, NULL) ||
!(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) ||
bch2_dev_buckets_alloc(c, ca) ||
!(ca->io_done = alloc_percpu(*ca->io_done)))
@@ -1428,7 +1523,9 @@ static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
{
ca->dev_idx = dev_idx;
__set_bit(ca->dev_idx, ca->self.d);
- scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
+
+ if (!ca->name[0])
+ scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
ca->fs = c;
rcu_assign_pointer(c->devs[ca->dev_idx], ca);
@@ -1454,7 +1551,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
bch2_dev_attach(c, ca, dev_idx);
return 0;
err:
- return -BCH_ERR_ENOMEM_dev_alloc;
+ return bch_err_throw(c, ENOMEM_dev_alloc);
}
static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
@@ -1464,22 +1561,27 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
if (bch2_dev_is_online(ca)) {
bch_err(ca, "already have device online in slot %u",
sb->sb->dev_idx);
- return -BCH_ERR_device_already_online;
+ return bch_err_throw(ca->fs, device_already_online);
}
if (get_capacity(sb->bdev->bd_disk) <
ca->mi.bucket_size * ca->mi.nbuckets) {
bch_err(ca, "cannot online: device too small");
- return -BCH_ERR_device_size_too_small;
+ return bch_err_throw(ca->fs, device_size_too_small);
}
- BUG_ON(!percpu_ref_is_zero(&ca->io_ref[READ]));
- BUG_ON(!percpu_ref_is_zero(&ca->io_ref[WRITE]));
+ BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[READ]));
+ BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE]));
ret = bch2_dev_journal_init(ca, sb->sb);
if (ret)
return ret;
+ struct printbuf name = PRINTBUF;
+ prt_bdevname(&name, sb->bdev);
+ strscpy(ca->name, name.buf, sizeof(ca->name));
+ printbuf_exit(&name);
+
/* Commit: */
ca->disk_sb = *sb;
memset(sb, 0, sizeof(*sb));
@@ -1493,7 +1595,7 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
ca->dev = ca->disk_sb.bdev->bd_dev;
- percpu_ref_reinit(&ca->io_ref[READ]);
+ enumerated_ref_start(&ca->io_ref[READ]);
return 0;
}
@@ -1517,16 +1619,9 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
if (ret)
return ret;
- bch2_dev_sysfs_online(c, ca);
-
- struct printbuf name = PRINTBUF;
- prt_bdevname(&name, ca->disk_sb.bdev);
+ set_bit(ca->dev_idx, c->online_devs.d);
- if (c->sb.nr_devices == 1)
- strscpy(c->name, name.buf, sizeof(c->name));
- strscpy(ca->name, name.buf, sizeof(ca->name));
-
- printbuf_exit(&name);
+ bch2_dev_sysfs_online(c, ca);
bch2_rebalance_wakeup(c);
return 0;
@@ -1578,7 +1673,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
return true;
/* do we have enough devices to read from? */
- new_online_devs = bch2_online_devs(c);
+ new_online_devs = c->online_devs;
__clear_bit(ca->dev_idx, new_online_devs.d);
return bch2_have_enough_devs(c, new_online_devs, flags, false);
@@ -1608,8 +1703,8 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- if (percpu_ref_is_zero(&ca->io_ref[WRITE]))
- percpu_ref_reinit(&ca->io_ref[WRITE]);
+ if (enumerated_ref_is_zero(&ca->io_ref[WRITE]))
+ enumerated_ref_start(&ca->io_ref[WRITE]);
bch2_dev_do_discards(ca);
}
@@ -1624,7 +1719,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
return 0;
if (!bch2_dev_state_allowed(c, ca, new_state, flags))
- return -BCH_ERR_device_state_not_allowed;
+ return bch_err_throw(c, device_state_not_allowed);
if (new_state != BCH_MEMBER_STATE_rw)
__bch2_dev_read_only(c, ca);
@@ -1663,6 +1758,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{
struct bch_member *m;
unsigned dev_idx = ca->dev_idx, data;
+ bool fast_device_removal = !bch2_request_incompat_feature(c,
+ bcachefs_metadata_version_fast_device_removal);
int ret;
down_write(&c->state_lock);
@@ -1675,17 +1772,31 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
bch_err(ca, "Cannot remove without losing data");
- ret = -BCH_ERR_device_state_not_allowed;
+ ret = bch_err_throw(c, device_state_not_allowed);
goto err;
}
__bch2_dev_read_only(c, ca);
- ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
- bch_err_msg(ca, ret, "bch2_dev_data_drop()");
+ ret = fast_device_removal
+ ? bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags)
+ : (bch2_dev_data_drop(c, ca->dev_idx, flags) ?:
+ bch2_dev_remove_stripes(c, ca->dev_idx, flags));
if (ret)
goto err;
+ /* Check if device still has data before blowing away alloc info */
+ struct bch_dev_usage usage = bch2_dev_usage_read(ca);
+ for (unsigned i = 0; i < BCH_DATA_NR; i++)
+ if (!data_type_is_empty(i) &&
+ !data_type_is_hidden(i) &&
+ usage.buckets[i]) {
+ bch_err(ca, "Remove failed: still has data (%s, %llu buckets)",
+ __bch2_data_types[i], usage.buckets[i]);
+ ret = -EBUSY;
+ goto err;
+ }
+
ret = bch2_dev_remove_alloc(c, ca);
bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");
if (ret)
@@ -1749,7 +1860,11 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
*/
mutex_lock(&c->sb_lock);
m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
- memset(&m->uuid, 0, sizeof(m->uuid));
+
+ if (fast_device_removal)
+ m->uuid = BCH_SB_MEMBER_DELETED_UUID;
+ else
+ memset(&m->uuid, 0, sizeof(m->uuid));
bch2_write_super(c);
@@ -1759,7 +1874,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
err:
if (test_bit(BCH_FS_rw, &c->flags) &&
ca->mi.state == BCH_MEMBER_STATE_rw &&
- !percpu_ref_is_zero(&ca->io_ref[READ]))
+ !enumerated_ref_is_zero(&ca->io_ref[READ]))
__bch2_dev_read_write(c, ca);
up_write(&c->state_lock);
return ret;
@@ -1769,11 +1884,11 @@ err:
int bch2_dev_add(struct bch_fs *c, const char *path)
{
struct bch_opts opts = bch2_opts_empty();
- struct bch_sb_handle sb;
+ struct bch_sb_handle sb = {};
struct bch_dev *ca = NULL;
struct printbuf errbuf = PRINTBUF;
struct printbuf label = PRINTBUF;
- int ret;
+ int ret = 0;
ret = bch2_read_super(path, &opts, &sb);
bch_err_msg(c, ret, "reading super");
@@ -1790,6 +1905,20 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
}
}
+ if (list_empty(&c->list)) {
+ mutex_lock(&bch_fs_list_lock);
+ if (__bch2_uuid_to_fs(c->sb.uuid))
+ ret = bch_err_throw(c, filesystem_uuid_already_open);
+ else
+ list_add(&c->list, &bch_fs_list);
+ mutex_unlock(&bch_fs_list_lock);
+
+ if (ret) {
+ bch_err(c, "filesystem UUID already open");
+ goto err;
+ }
+ }
+
ret = bch2_dev_may_add(sb.sb, c);
if (ret)
goto err;
@@ -1806,6 +1935,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
+ SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true);
ret = bch2_sb_from_fs(c, ca);
bch_err_msg(c, ret, "setting up new superblock");
@@ -1821,6 +1951,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err_unlock;
}
unsigned dev_idx = ret;
+ ret = 0;
/* success: */
@@ -1840,27 +1971,29 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- ret = bch2_dev_usage_init(ca, false);
- if (ret)
- goto err_late;
+ if (test_bit(BCH_FS_started, &c->flags)) {
+ ret = bch2_dev_usage_init(ca, false);
+ if (ret)
+ goto err_late;
- ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
- bch_err_msg(ca, ret, "marking new superblock");
- if (ret)
- goto err_late;
+ ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
+ bch_err_msg(ca, ret, "marking new superblock");
+ if (ret)
+ goto err_late;
- ret = bch2_fs_freespace_init(c);
- bch_err_msg(ca, ret, "initializing free space");
- if (ret)
- goto err_late;
+ ret = bch2_fs_freespace_init(c);
+ bch_err_msg(ca, ret, "initializing free space");
+ if (ret)
+ goto err_late;
- if (ca->mi.state == BCH_MEMBER_STATE_rw)
- __bch2_dev_read_write(c, ca);
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_write(c, ca);
- ret = bch2_dev_journal_alloc(ca, false);
- bch_err_msg(c, ret, "allocating journal");
- if (ret)
- goto err_late;
+ ret = bch2_dev_journal_alloc(ca, false);
+ bch_err_msg(c, ret, "allocating journal");
+ if (ret)
+ goto err_late;
+ }
up_write(&c->state_lock);
out:
@@ -1962,7 +2095,7 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
bch_err(ca, "Cannot offline required disk");
up_write(&c->state_lock);
- return -BCH_ERR_device_state_not_allowed;
+ return bch_err_throw(c, device_state_not_allowed);
}
__bch2_dev_offline(c, ca);
@@ -1971,6 +2104,18 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
return 0;
}
+static int __bch2_dev_resize_alloc(struct bch_dev *ca, u64 old_nbuckets, u64 new_nbuckets)
+{
+ struct bch_fs *c = ca->fs;
+ u64 v[3] = { new_nbuckets - old_nbuckets, 0, 0 };
+
+ return bch2_trans_commit_do(ca->fs, NULL, NULL, 0,
+ bch2_disk_accounting_mod2(trans, false, v, dev_data_type,
+ .dev = ca->dev_idx,
+ .data_type = BCH_DATA_free)) ?:
+ bch2_dev_freespace_init(c, ca, old_nbuckets, new_nbuckets);
+}
+
int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
struct bch_member *m;
@@ -1989,7 +2134,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {
bch_err(ca, "New device size too big (%llu greater than max %u)",
nbuckets, BCH_MEMBER_NBUCKETS_MAX);
- ret = -BCH_ERR_device_size_too_big;
+ ret = bch_err_throw(c, device_size_too_big);
goto err;
}
@@ -1997,7 +2142,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
get_capacity(ca->disk_sb.bdev->bd_disk) <
ca->mi.bucket_size * nbuckets) {
bch_err(ca, "New size larger than device");
- ret = -BCH_ERR_device_size_too_small;
+ ret = bch_err_throw(c, device_size_too_small);
goto err;
}
@@ -2018,13 +2163,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
mutex_unlock(&c->sb_lock);
if (ca->mi.freespace_initialized) {
- u64 v[3] = { nbuckets - old_nbuckets, 0, 0 };
-
- ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0,
- bch2_disk_accounting_mod2(trans, false, v, dev_data_type,
- .dev = ca->dev_idx,
- .data_type = BCH_DATA_free)) ?:
- bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
+ ret = __bch2_dev_resize_alloc(ca, old_nbuckets, nbuckets);
if (ret)
goto err;
}
@@ -2035,6 +2174,49 @@ err:
return ret;
}
+int bch2_fs_resize_on_mount(struct bch_fs *c)
+{
+ for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_resize_on_mount) {
+ u64 old_nbuckets = ca->mi.nbuckets;
+ u64 new_nbuckets = div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk),
+ ca->mi.bucket_size);
+
+ if (ca->mi.resize_on_mount &&
+ new_nbuckets > ca->mi.nbuckets) {
+ bch_info(ca, "resizing to size %llu", new_nbuckets * ca->mi.bucket_size);
+ int ret = bch2_dev_buckets_resize(c, ca, new_nbuckets);
+ bch_err_fn(ca, ret);
+ if (ret) {
+ enumerated_ref_put(&ca->io_ref[READ],
+ BCH_DEV_READ_REF_fs_resize_on_mount);
+ up_write(&c->state_lock);
+ return ret;
+ }
+
+ mutex_lock(&c->sb_lock);
+ struct bch_member *m =
+ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ m->nbuckets = cpu_to_le64(new_nbuckets);
+ SET_BCH_MEMBER_RESIZE_ON_MOUNT(m, false);
+
+ c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_small_image));
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ if (ca->mi.freespace_initialized) {
+ ret = __bch2_dev_resize_alloc(ca, old_nbuckets, new_nbuckets);
+ if (ret) {
+ enumerated_ref_put(&ca->io_ref[READ],
+ BCH_DEV_READ_REF_fs_resize_on_mount);
+ up_write(&c->state_lock);
+ return ret;
+ }
+ }
+ }
+ }
+ return 0;
+}
+
/* return with ref on ca->ref: */
struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
{
@@ -2095,20 +2277,32 @@ static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
if (!ca)
goto unlock;
- if (bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, BCH_FORCE_IF_DEGRADED)) {
+ bool dev = bch2_dev_state_allowed(c, ca,
+ BCH_MEMBER_STATE_failed,
+ BCH_FORCE_IF_DEGRADED);
+
+ if (!dev && sb) {
+ if (!surprise)
+ sync_filesystem(sb);
+ shrink_dcache_sb(sb);
+ evict_inodes(sb);
+ }
+
+ struct printbuf buf = PRINTBUF;
+ __bch2_log_msg_start(ca->name, &buf);
+
+ prt_printf(&buf, "offline from block layer");
+
+ if (dev) {
__bch2_dev_offline(c, ca);
} else {
- if (sb) {
- if (!surprise)
- sync_filesystem(sb);
- shrink_dcache_sb(sb);
- evict_inodes(sb);
- }
-
bch2_journal_flush(&c->journal);
- bch2_fs_emergency_read_only(c);
+ bch2_fs_emergency_read_only2(c, &buf);
}
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+
bch2_dev_put(ca);
unlock:
if (sb)
@@ -2151,10 +2345,10 @@ static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time));
}
-struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
- struct bch_opts opts)
+struct bch_fs *bch2_fs_open(darray_const_str *devices,
+ struct bch_opts *opts)
{
- DARRAY(struct bch_sb_handle) sbs = { 0 };
+ bch_sb_handles sbs = {};
struct bch_fs *c = NULL;
struct bch_sb_handle *best = NULL;
struct printbuf errbuf = PRINTBUF;
@@ -2163,27 +2357,27 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
if (!try_module_get(THIS_MODULE))
return ERR_PTR(-ENODEV);
- if (!nr_devices) {
+ if (!devices->nr) {
ret = -EINVAL;
goto err;
}
- ret = darray_make_room(&sbs, nr_devices);
+ ret = darray_make_room(&sbs, devices->nr);
if (ret)
goto err;
- for (unsigned i = 0; i < nr_devices; i++) {
+ darray_for_each(*devices, i) {
struct bch_sb_handle sb = { NULL };
- ret = bch2_read_super(devices[i], &opts, &sb);
+ ret = bch2_read_super(*i, opts, &sb);
if (ret)
goto err;
BUG_ON(darray_push(&sbs, sb));
}
- if (opts.nochanges && !opts.read_only) {
- ret = -BCH_ERR_erofs_nochanges;
+ if (opts->nochanges && !opts->read_only) {
+ ret = bch_err_throw(c, erofs_nochanges);
goto err_print;
}
@@ -2192,7 +2386,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
best = sb;
darray_for_each_reverse(sbs, sb) {
- ret = bch2_dev_in_fs(best, sb, &opts);
+ ret = bch2_dev_in_fs(best, sb, opts);
if (ret == -BCH_ERR_device_has_been_removed ||
ret == -BCH_ERR_device_splitbrain) {
@@ -2207,7 +2401,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
goto err_print;
}
- c = bch2_fs_alloc(best->sb, opts);
+ c = bch2_fs_alloc(best->sb, opts, &sbs);
ret = PTR_ERR_OR_ZERO(c);
if (ret)
goto err;
@@ -2236,7 +2430,7 @@ out:
return c;
err_print:
pr_err("bch_fs_open err opening %s: %s",
- devices[0], bch2_err_str(ret));
+ devices->data[0], bch2_err_str(ret));
err:
if (!IS_ERR_OR_NULL(c))
bch2_fs_stop(c);
@@ -2273,9 +2467,45 @@ err:
return -ENOMEM;
}
-#define BCH_DEBUG_PARAM(name, description) \
- bool bch2_##name; \
- module_param_named(name, bch2_##name, bool, 0644); \
+#define BCH_DEBUG_PARAM(name, description) DEFINE_STATIC_KEY_FALSE(bch2_##name);
+BCH_DEBUG_PARAMS_ALL()
+#undef BCH_DEBUG_PARAM
+
+static int bch2_param_set_static_key_t(const char *val, const struct kernel_param *kp)
+{
+ /* Match bool exactly, by re-using it. */
+ struct static_key *key = kp->arg;
+ struct kernel_param boolkp = *kp;
+ bool v;
+ int ret;
+
+ boolkp.arg = &v;
+
+ ret = param_set_bool(val, &boolkp);
+ if (ret)
+ return ret;
+ if (v)
+ static_key_enable(key);
+ else
+ static_key_disable(key);
+ return 0;
+}
+
+static int bch2_param_get_static_key_t(char *buffer, const struct kernel_param *kp)
+{
+ struct static_key *key = kp->arg;
+ return sprintf(buffer, "%c\n", static_key_enabled(key) ? 'N' : 'Y');
+}
+
+static const struct kernel_param_ops bch2_param_ops_static_key_t = {
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
+ .set = bch2_param_set_static_key_t,
+ .get = bch2_param_get_static_key_t,
+};
+
+#define BCH_DEBUG_PARAM(name, description) \
+ module_param_cb(name, &bch2_param_ops_static_key_t, &bch2_##name.key, 0644);\
+ __MODULE_PARM_TYPE(name, "static_key_t"); \
MODULE_PARM_DESC(name, description);
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index 23533bce5709..dc52f06cb2b9 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -9,6 +9,9 @@
#include <linux/math64.h>
extern const char * const bch2_fs_flag_strs[];
+extern const char * const bch2_write_refs[];
+extern const char * const bch2_dev_read_refs[];
+extern const char * const bch2_dev_write_refs[];
struct bch_fs *bch2_dev_to_fs(dev_t);
struct bch_fs *bch2_uuid_to_fs(__uuid_t);
@@ -29,18 +32,22 @@ int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
bool bch2_fs_emergency_read_only(struct bch_fs *);
+bool bch2_fs_emergency_read_only2(struct bch_fs *, struct printbuf *);
+
bool bch2_fs_emergency_read_only_locked(struct bch_fs *);
void bch2_fs_read_only(struct bch_fs *);
int bch2_fs_read_write(struct bch_fs *);
int bch2_fs_read_write_early(struct bch_fs *);
+int bch2_fs_resize_on_mount(struct bch_fs *);
+
void __bch2_fs_stop(struct bch_fs *);
void bch2_fs_free(struct bch_fs *);
void bch2_fs_stop(struct bch_fs *);
int bch2_fs_start(struct bch_fs *);
-struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);
+struct bch_fs *bch2_fs_open(darray_const_str *, struct bch_opts *);
extern const struct blk_holder_ops bch2_sb_handle_bdev_ops;
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 82ee333ddd21..05848375cea2 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -25,6 +25,8 @@
#include "disk_accounting.h"
#include "disk_groups.h"
#include "ec.h"
+#include "enumerated_ref.h"
+#include "error.h"
#include "inode.h"
#include "journal.h"
#include "journal_reclaim.h"
@@ -34,7 +36,9 @@
#include "nocow_locking.h"
#include "opts.h"
#include "rebalance.h"
+#include "recovery_passes.h"
#include "replicas.h"
+#include "sb-errors.h"
#include "super-io.h"
#include "tests.h"
@@ -141,12 +145,16 @@ do { \
write_attribute(trigger_gc);
write_attribute(trigger_discards);
write_attribute(trigger_invalidates);
+write_attribute(trigger_journal_commit);
write_attribute(trigger_journal_flush);
write_attribute(trigger_journal_writes);
write_attribute(trigger_btree_cache_shrink);
write_attribute(trigger_btree_key_cache_shrink);
-write_attribute(trigger_freelist_wakeup);
write_attribute(trigger_btree_updates);
+write_attribute(trigger_freelist_wakeup);
+write_attribute(trigger_recalc_capacity);
+write_attribute(trigger_delete_dead_snapshots);
+write_attribute(trigger_emergency_read_only);
read_attribute(gc_gens_pos);
read_attribute(uuid);
@@ -168,6 +176,7 @@ read_attribute(btree_write_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
+read_attribute(errors);
read_attribute(journal_debug);
read_attribute(btree_cache);
read_attribute(btree_key_cache);
@@ -176,25 +185,9 @@ read_attribute(open_buckets);
read_attribute(open_buckets_partial);
read_attribute(nocow_lock_table);
-#ifdef BCH_WRITE_REF_DEBUG
+read_attribute(read_refs);
read_attribute(write_refs);
-static const char * const bch2_write_refs[] = {
-#define x(n) #n,
- BCH_WRITE_REFS()
-#undef x
- NULL
-};
-
-static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c)
-{
- bch2_printbuf_tabstop_push(out, 24);
-
- for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++)
- prt_printf(out, "%s\t%li\n", bch2_write_refs[i], atomic_long_read(&c->writes[i]));
-}
-#endif
-
read_attribute(internal_uuid);
read_attribute(disk_groups);
@@ -212,6 +205,8 @@ read_attribute(copy_gc_wait);
sysfs_pd_controller_attribute(rebalance);
read_attribute(rebalance_status);
+read_attribute(snapshot_delete_status);
+read_attribute(recovery_status);
read_attribute(new_stripes);
@@ -334,6 +329,12 @@ SHOW(bch2_fs)
if (attr == &sysfs_rebalance_status)
bch2_rebalance_status_to_text(out, c);
+ if (attr == &sysfs_snapshot_delete_status)
+ bch2_snapshot_delete_status_to_text(out, c);
+
+ if (attr == &sysfs_recovery_status)
+ bch2_recovery_pass_status_to_text(out, c);
+
/* Debugging: */
if (attr == &sysfs_journal_debug)
@@ -357,6 +358,9 @@ SHOW(bch2_fs)
if (attr == &sysfs_compression_stats)
bch2_compression_stats_to_text(out, c);
+ if (attr == &sysfs_errors)
+ bch2_fs_errors_to_text(out, c);
+
if (attr == &sysfs_new_stripes)
bch2_new_stripes_to_text(out, c);
@@ -369,10 +373,8 @@ SHOW(bch2_fs)
if (attr == &sysfs_moving_ctxts)
bch2_fs_moving_ctxts_to_text(out, c);
-#ifdef BCH_WRITE_REF_DEBUG
if (attr == &sysfs_write_refs)
- bch2_write_refs_to_text(out, c);
-#endif
+ enumerated_ref_to_text(out, &c->writes, bch2_write_refs);
if (attr == &sysfs_nocow_lock_table)
bch2_nocow_locks_to_text(out, &c->nocow_locks);
@@ -405,7 +407,7 @@ STORE(bch2_fs)
if (attr == &sysfs_trigger_btree_updates)
queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs))
return -EROFS;
if (attr == &sysfs_trigger_btree_cache_shrink) {
@@ -434,6 +436,9 @@ STORE(bch2_fs)
if (attr == &sysfs_trigger_invalidates)
bch2_do_invalidates(c);
+ if (attr == &sysfs_trigger_journal_commit)
+ bch2_journal_flush(&c->journal);
+
if (attr == &sysfs_trigger_journal_flush) {
bch2_journal_flush_all_pins(&c->journal);
bch2_journal_meta(&c->journal);
@@ -445,6 +450,25 @@ STORE(bch2_fs)
if (attr == &sysfs_trigger_freelist_wakeup)
closure_wake_up(&c->freelist_wait);
+ if (attr == &sysfs_trigger_recalc_capacity) {
+ down_read(&c->state_lock);
+ bch2_recalc_capacity(c);
+ up_read(&c->state_lock);
+ }
+
+ if (attr == &sysfs_trigger_delete_dead_snapshots)
+ __bch2_delete_dead_snapshots(c);
+
+ if (attr == &sysfs_trigger_emergency_read_only) {
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+
+ prt_printf(&buf, "shutdown by sysfs\n");
+ bch2_fs_emergency_read_only2(c, &buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ }
+
#ifdef CONFIG_BCACHEFS_TESTS
if (attr == &sysfs_perf_test) {
char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
@@ -465,7 +489,7 @@ STORE(bch2_fs)
size = ret;
}
#endif
- bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs);
return size;
}
SYSFS_OPS(bch2_fs);
@@ -476,8 +500,11 @@ struct attribute *bch2_fs_files[] = {
&sysfs_btree_write_stats,
&sysfs_rebalance_status,
+ &sysfs_snapshot_delete_status,
+ &sysfs_recovery_status,
&sysfs_compression_stats,
+ &sysfs_errors,
#ifdef CONFIG_BCACHEFS_TESTS
&sysfs_perf_test,
@@ -558,9 +585,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_new_stripes,
&sysfs_open_buckets,
&sysfs_open_buckets_partial,
-#ifdef BCH_WRITE_REF_DEBUG
&sysfs_write_refs,
-#endif
&sysfs_nocow_lock_table,
&sysfs_io_timers_read,
&sysfs_io_timers_write,
@@ -568,12 +593,16 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_gc,
&sysfs_trigger_discards,
&sysfs_trigger_invalidates,
+ &sysfs_trigger_journal_commit,
&sysfs_trigger_journal_flush,
&sysfs_trigger_journal_writes,
&sysfs_trigger_btree_cache_shrink,
&sysfs_trigger_btree_key_cache_shrink,
- &sysfs_trigger_freelist_wakeup,
&sysfs_trigger_btree_updates,
+ &sysfs_trigger_freelist_wakeup,
+ &sysfs_trigger_recalc_capacity,
+ &sysfs_trigger_delete_dead_snapshots,
+ &sysfs_trigger_emergency_read_only,
&sysfs_gc_gens_pos,
@@ -626,7 +655,7 @@ static ssize_t sysfs_opt_store(struct bch_fs *c,
* We don't need to take c->writes for correctness, but it eliminates an
* unsightly error message in the dmesg log when we're RO:
*/
- if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)))
+ if (unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs)))
return -EROFS;
char *tmp = kstrdup(buf, GFP_KERNEL);
@@ -637,40 +666,34 @@ static ssize_t sysfs_opt_store(struct bch_fs *c,
u64 v;
ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?:
- bch2_opt_check_may_set(c, ca, id, v);
+ bch2_opt_hook_pre_set(c, ca, id, v);
kfree(tmp);
if (ret < 0)
goto err;
- bch2_opt_set_sb(c, ca, opt, v);
- bch2_opt_set_by_id(&c->opts, id, v);
-
- if (v &&
- (id == Opt_background_target ||
- (id == Opt_foreground_target && !c->opts.background_target) ||
- id == Opt_background_compression ||
- (id == Opt_compression && !c->opts.background_compression)))
- bch2_set_rebalance_needs_scan(c, 0);
-
- if (v && id == Opt_rebalance_enabled)
- bch2_rebalance_wakeup(c);
+ bool is_sb = opt->get_sb || opt->get_member;
+ bool changed = false;
- if (v && id == Opt_copygc_enabled)
- bch2_copygc_wakeup(c);
+ if (is_sb) {
+ changed = bch2_opt_set_sb(c, ca, opt, v);
+ } else if (!ca) {
+ changed = bch2_opt_get_by_id(&c->opts, id) != v;
+ } else {
+ /* device options that aren't superblock options aren't
+ * supported */
+ BUG();
+ }
- if (id == Opt_discard && !ca) {
- mutex_lock(&c->sb_lock);
- for_each_member_device(c, ca)
- opt->set_member(bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx), v);
+ if (!ca)
+ bch2_opt_set_by_id(&c->opts, id, v);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
- }
+ if (changed)
+ bch2_opt_hook_post_set(c, ca, 0, &c->opts, id);
ret = size;
err:
- bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs);
return ret;
}
@@ -821,6 +844,12 @@ SHOW(bch2_dev)
if (opt_id >= 0)
return sysfs_opt_show(c, ca, opt_id, out);
+ if (attr == &sysfs_read_refs)
+ enumerated_ref_to_text(out, &ca->io_ref[READ], bch2_dev_read_refs);
+
+ if (attr == &sysfs_write_refs)
+ enumerated_ref_to_text(out, &ca->io_ref[WRITE], bch2_dev_write_refs);
+
return 0;
}
@@ -876,6 +905,9 @@ struct attribute *bch2_dev_files[] = {
/* debug: */
&sysfs_alloc_debug,
&sysfs_open_buckets,
+
+ &sysfs_read_refs,
+ &sysfs_write_refs,
NULL
};
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 519d00d62ae7..dc09532796af 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -199,6 +199,50 @@ DECLARE_EVENT_CLASS(bio,
(unsigned long long)__entry->sector, __entry->nr_sector)
);
+/* errors */
+
+TRACE_EVENT(error_throw,
+ TP_PROTO(struct bch_fs *c, int bch_err, unsigned long ip),
+ TP_ARGS(c, bch_err, ip),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(int, err )
+ __array(char, err_str, 32 )
+ __array(char, ip, 32 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->err = bch_err;
+ strscpy(__entry->err_str, bch2_err_str(bch_err), sizeof(__entry->err_str));
+ snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
+ ),
+
+ TP_printk("%d,%d %s ret %s", MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ip, __entry->err_str)
+);
+
+TRACE_EVENT(error_downcast,
+ TP_PROTO(int bch_err, int std_err, unsigned long ip),
+ TP_ARGS(bch_err, std_err, ip),
+
+ TP_STRUCT__entry(
+ __array(char, bch_err, 32 )
+ __array(char, std_err, 32 )
+ __array(char, ip, 32 )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err));
+ strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err));
+ snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
+ ),
+
+ TP_printk("%s ret %s -> %s %s", __entry->ip,
+ __entry->bch_err, __entry->std_err, __entry->ip)
+);
+
/* disk_accounting.c */
TRACE_EVENT(accounting_mem_insert,
@@ -339,6 +383,11 @@ DEFINE_EVENT(bio, io_read_reuse_race,
TP_ARGS(bio)
);
+DEFINE_EVENT(bio, io_read_fail_and_poison,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
/* ec.c */
TRACE_EVENT(stripe_create,
@@ -1122,51 +1171,9 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
TP_ARGS(trans, caller_ip, path)
);
-TRACE_EVENT(trans_restart_upgrade,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path,
- unsigned old_locks_want,
- unsigned new_locks_want,
- struct get_locks_fail *f),
- TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(u8, btree_id )
- __field(u8, old_locks_want )
- __field(u8, new_locks_want )
- __field(u8, level )
- __field(u32, path_seq )
- __field(u32, node_seq )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->btree_id = path->btree_id;
- __entry->old_locks_want = old_locks_want;
- __entry->new_locks_want = new_locks_want;
- __entry->level = f->l;
- __entry->path_seq = path->l[f->l].lock_seq;
- __entry->node_seq = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq;
- TRACE_BPOS_assign(pos, path->pos)
- ),
-
- TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->old_locks_want,
- __entry->new_locks_want,
- __entry->level,
- __entry->path_seq,
- __entry->node_seq)
+DEFINE_EVENT(fs_str, trans_restart_upgrade,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
DEFINE_EVENT(trans_str, trans_restart_relock,
@@ -1468,23 +1475,19 @@ DEFINE_EVENT(fs_str, data_update,
TP_ARGS(c, str)
);
-TRACE_EVENT(error_downcast,
- TP_PROTO(int bch_err, int std_err, unsigned long ip),
- TP_ARGS(bch_err, std_err, ip),
-
- TP_STRUCT__entry(
- __array(char, bch_err, 32 )
- __array(char, std_err, 32 )
- __array(char, ip, 32 )
- ),
+DEFINE_EVENT(fs_str, io_move_pred,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
- TP_fast_assign(
- strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err));
- strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err));
- snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
- ),
+DEFINE_EVENT(fs_str, io_move_created_rebalance,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
- TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip)
+DEFINE_EVENT(fs_str, io_move_evacuate_bucket,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
#ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 87af551692f4..dc3817f545fa 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -252,8 +252,18 @@ void bch2_prt_u64_base2(struct printbuf *out, u64 v)
bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1);
}
-static void __bch2_print_string_as_lines(const char *prefix, const char *lines,
- bool nonblocking)
+static bool string_is_spaces(const char *str)
+{
+ while (*str) {
+ if (*str != ' ')
+ return false;
+ str++;
+ }
+ return true;
+}
+
+void bch2_print_string_as_lines(const char *prefix, const char *lines,
+ bool nonblocking)
{
bool locked = false;
const char *p;
@@ -272,6 +282,9 @@ static void __bch2_print_string_as_lines(const char *prefix, const char *lines,
while (*lines) {
p = strchrnul(lines, '\n');
+ if (!*p && string_is_spaces(lines))
+ break;
+
printk("%s%.*s\n", prefix, (int) (p - lines), lines);
if (!*p)
break;
@@ -281,16 +294,6 @@ static void __bch2_print_string_as_lines(const char *prefix, const char *lines,
console_unlock();
}
-void bch2_print_string_as_lines(const char *prefix, const char *lines)
-{
- return __bch2_print_string_as_lines(prefix, lines, false);
-}
-
-void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines)
-{
- return __bch2_print_string_as_lines(prefix, lines, true);
-}
-
int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr,
gfp_t gfp)
{
@@ -725,6 +728,16 @@ void bch2_corrupt_bio(struct bio *bio)
}
#endif
+void bch2_bio_to_text(struct printbuf *out, struct bio *bio)
+{
+ prt_printf(out, "bi_remaining:\t%u\n",
+ atomic_read(&bio->__bi_remaining));
+ prt_printf(out, "bi_end_io:\t%ps\n",
+ bio->bi_end_io);
+ prt_printf(out, "bi_status:\t%u\n",
+ bio->bi_status);
+}
+
#if 0
void eytzinger1_test(void)
{
@@ -1003,14 +1016,14 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
return ret;
}
-void bch2_darray_str_exit(darray_str *d)
+void bch2_darray_str_exit(darray_const_str *d)
{
darray_for_each(*d, i)
kfree(*i);
darray_exit(d);
}
-int bch2_split_devs(const char *_dev_name, darray_str *ret)
+int bch2_split_devs(const char *_dev_name, darray_const_str *ret)
{
darray_init(ret);
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 3e52c7f8ddd2..0a4b1d433621 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -14,8 +14,10 @@
#include <linux/log2.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
+#include <linux/random.h>
#include <linux/ratelimit.h>
#include <linux/slab.h>
+#include <linux/sort.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
@@ -55,15 +57,16 @@ static inline size_t buf_pages(void *p, size_t len)
PAGE_SIZE);
}
-static inline void *bch2_kvmalloc(size_t n, gfp_t flags)
+static inline void *bch2_kvmalloc_noprof(size_t n, gfp_t flags)
{
void *p = unlikely(n >= INT_MAX)
- ? vmalloc(n)
- : kvmalloc(n, flags & ~__GFP_ZERO);
+ ? vmalloc_noprof(n)
+ : kvmalloc_noprof(n, flags & ~__GFP_ZERO);
if (p && (flags & __GFP_ZERO))
memset(p, 0, n);
return p;
}
+#define bch2_kvmalloc(...) alloc_hooks(bch2_kvmalloc_noprof(__VA_ARGS__))
#define init_heap(heap, _size, gfp) \
({ \
@@ -211,8 +214,7 @@ u64 bch2_read_flag_list(const char *, const char * const[]);
void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
void bch2_prt_u64_base2(struct printbuf *, u64);
-void bch2_print_string_as_lines(const char *prefix, const char *lines);
-void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines);
+void bch2_print_string_as_lines(const char *, const char *, bool);
typedef DARRAY(unsigned long) bch_stacktrace;
int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t);
@@ -419,6 +421,8 @@ static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio)
#define bch2_maybe_corrupt_bio(...) do {} while (0)
#endif
+void bch2_bio_to_text(struct printbuf *, struct bio *);
+
static inline void memcpy_u64s_small(void *dst, const void *src,
unsigned u64s)
{
@@ -669,8 +673,6 @@ static inline void percpu_memset(void __percpu *p, int c, size_t bytes)
u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
-#define cmp_int(l, r) ((l > r) - (l < r))
-
static inline int u8_cmp(u8 l, u8 r)
{
return cmp_int(l, r);
@@ -688,8 +690,8 @@ static inline bool qstr_eq(const struct qstr l, const struct qstr r)
return l.len == r.len && !memcmp(l.name, r.name, l.len);
}
-void bch2_darray_str_exit(darray_str *);
-int bch2_split_devs(const char *, darray_str *);
+void bch2_darray_str_exit(darray_const_str *);
+int bch2_split_devs(const char *, darray_const_str *);
#ifdef __KERNEL__
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index e6be32003f3b..627f153798c6 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -38,7 +38,7 @@ static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k);
return bch2_xattr_hash(info,
- &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len));
+ &X_SEARCH(x.v->x_type, x.v->x_name_and_value, x.v->x_name_len));
}
static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r)
@@ -48,7 +48,7 @@ static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r)
return l.v->x_type != r->type ||
l.v->x_name_len != r->name.len ||
- memcmp(l.v->x_name, r->name.name, r->name.len);
+ memcmp(l.v->x_name_and_value, r->name.name, r->name.len);
}
static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
@@ -58,7 +58,7 @@ static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
return l.v->x_type != r.v->x_type ||
l.v->x_name_len != r.v->x_name_len ||
- memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len);
+ memcmp(l.v->x_name_and_value, r.v->x_name_and_value, r.v->x_name_len);
}
const struct bch_hash_desc bch2_xattr_hash_desc = {
@@ -96,7 +96,7 @@ int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k,
c, xattr_invalid_type,
"invalid type (%u)", xattr.v->x_type);
- bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len),
+ bkey_fsck_err_on(memchr(xattr.v->x_name_and_value, '\0', xattr.v->x_name_len),
c, xattr_name_invalid_chars,
"xattr name has invalid characters");
fsck_err:
@@ -120,13 +120,13 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
unsigned name_len = xattr.v->x_name_len;
unsigned val_len = le16_to_cpu(xattr.v->x_val_len);
unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) -
- offsetof(struct bch_xattr, x_name);
+ offsetof(struct bch_xattr, x_name_and_value);
val_len = min_t(int, val_len, max_name_val_bytes - name_len);
name_len = min(name_len, max_name_val_bytes);
prt_printf(out, "%.*s:%.*s",
- name_len, xattr.v->x_name,
+ name_len, xattr.v->x_name_and_value,
val_len, (char *) xattr_val(xattr.v));
if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS ||
@@ -176,6 +176,11 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
if (ret)
return ret;
+ /*
+ * Besides the ctime update, extents, dirents and xattrs updates require
+ * that an inode update also happens - to ensure that if a key exists in
+ * one of those btrees with a given snapshot ID an inode is also present
+ */
inode_u->bi_ctime = bch2_current_time(c);
ret = bch2_inode_write(trans, &inode_iter, inode_u);
@@ -202,7 +207,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
xattr->v.x_type = type;
xattr->v.x_name_len = namelen;
xattr->v.x_val_len = cpu_to_le16(size);
- memcpy(xattr->v.x_name, name, namelen);
+ memcpy(xattr->v.x_name_and_value, name, namelen);
memcpy(xattr_val(&xattr->v), value, size);
ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
@@ -270,7 +275,7 @@ static int bch2_xattr_emit(struct dentry *dentry,
if (!prefix)
return 0;
- return __bch2_xattr_emit(prefix, xattr->x_name, xattr->x_name_len, buf);
+ return __bch2_xattr_emit(prefix, xattr->x_name_and_value, xattr->x_name_len, buf);
}
static int bch2_xattr_list_bcachefs(struct bch_fs *c,
@@ -529,7 +534,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
if (ret < 0)
goto err_class_exit;
- ret = bch2_opt_check_may_set(c, NULL, opt_id, v);
+ ret = bch2_opt_hook_pre_set(c, NULL, opt_id, v);
if (ret < 0)
goto err_class_exit;
diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h
index 132fbbd15a66..1139bf345f70 100644
--- a/fs/bcachefs/xattr.h
+++ b/fs/bcachefs/xattr.h
@@ -18,12 +18,12 @@ void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
{
- return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) +
+ return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name_and_value) +
name_len + val_len, sizeof(u64));
}
#define xattr_val(_xattr) \
- ((void *) (_xattr)->x_name + (_xattr)->x_name_len)
+ ((void *) (_xattr)->x_name_and_value + (_xattr)->x_name_len)
struct xattr_search_key {
u8 type;
diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h
index 67426e33d04e..4121b78d9a92 100644
--- a/fs/bcachefs/xattr_format.h
+++ b/fs/bcachefs/xattr_format.h
@@ -16,10 +16,10 @@ struct bch_xattr {
/*
* x_name contains the name and value counted by
* x_name_len + x_val_len. The introduction of
- * __counted_by(x_name_len) caused a false positive
+ * __counted_by(x_name_len) previously caused a false positive
* detection of an out of bounds write.
*/
- __u8 x_name[];
+ __u8 x_name_and_value[];
} __packed __aligned(8);
#endif /* _BCACHEFS_XATTR_FORMAT_H */