author    Kent Overstreet <kent.overstreet@linux.dev>    2024-12-01 21:44:38 -0500
committer Kent Overstreet <kent.overstreet@linux.dev>    2025-01-09 23:38:41 -0500
commit    df448ca355ce84d62993166294f75517e4128598 (patch)
tree      a69615a19c0a3ab28ee2ed9b7f6dfc2755642099
parent    59c50511f7a8ecded211656425c9b92e31329333 (diff)
bcachefs: bcachefs_metadata_version_persistent_inode_cursors

Persistent cursors for inode allocation.

A free inodes btree would add substantial overhead to inode allocation and
freeing - a "next num to allocate" cursor is always going to be faster.

We just need it to be persistent, to avoid scanning the inodes btree from
the start on startup.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
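As context for the message above, a minimal sketch of the idea: the cursor is a small persistent record holding the next inode number to try plus a generation counter, and with sharding enabled each CPU's cursor covers a contiguous slice of the inode-number space. The names below are hypothetical illustrations, not the patch's API; the real on-disk layout is struct bch_inode_alloc_cursor in inode_format.h and the range computation lives in bch2_inode_alloc_cursor_get() in inode.c, both in the diff below.

/*
 * Minimal sketch of a persistent inode-allocation cursor (hypothetical
 * names; the real record is struct bch_inode_alloc_cursor, stored in
 * the logged_ops btree - see the diff below).
 */
#include <stdint.h>

struct inode_cursor {
	uint8_t  bits;	/* shard bits in effect when the cursor was written */
	uint32_t gen;	/* bumped whenever the shard's number space wraps */
	uint64_t idx;	/* next inode number to try */
};

/*
 * Range of inode numbers owned by one CPU shard when numbers are
 * sharded by the top shard_bits bits of a 63-bit space (the patch
 * additionally clamps min above the 32-bit range).
 */
static void shard_range(uint64_t cpu, unsigned shard_bits,
			uint64_t *min, uint64_t *max)
{
	unsigned bits = 63 - shard_bits;

	*min = cpu << bits;
	*max = (cpu << bits) | ~(UINT64_MAX << bits);
}

/*
 * Allocation then scans forward from cursor->idx, wraps back to *min and
 * bumps gen when it reaches *max, and persists idx after each allocation
 * so a remount never rescans the inodes btree from zero.
 */

For example, with shard_inode_numbers_bits = 3 and cpu = 2, bits = 60, so that shard allocates from the range [2 << 60, (2 << 60) + 2^60 - 1].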
 fs/bcachefs/bcachefs.h          |   3
 fs/bcachefs/bcachefs_format.h   |  10
 fs/bcachefs/btree_update.c      |   2
 fs/bcachefs/inode.c             | 120
 fs/bcachefs/inode.h             |  10
 fs/bcachefs/inode_format.h      |  14
 fs/bcachefs/logged_ops.c        |   5
 fs/bcachefs/logged_ops_format.h |   5
 fs/bcachefs/opts.h              |  10
 fs/bcachefs/sb-errors_format.h  |   3
 fs/bcachefs/super-io.c          |   5
 fs/bcachefs/super.c             |   7
 12 files changed, 135 insertions(+), 59 deletions(-)
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index b749c4ecad1b..161cf2f05d2a 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -1063,9 +1063,6 @@ struct bch_fs {
struct btree_node *verify_ondisk;
struct mutex verify_lock;
- u64 *unused_inode_hints;
- unsigned inode_shard_bits;
-
/*
* A btree node on disk could have too many bsets for an iterator to fit
* on the stack - have to dynamically allocate them
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index f140c3366e65..09e53bef1a30 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -418,7 +418,8 @@ static inline void bkey_init(struct bkey *k)
x(snapshot_tree, 31) \
x(logged_op_truncate, 32) \
x(logged_op_finsert, 33) \
- x(accounting, 34)
+ x(accounting, 34) \
+ x(inode_alloc_cursor, 35)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
@@ -682,7 +683,8 @@ struct bch_sb_field_ext {
x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \
x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \
x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \
- x(inode_depth, BCH_VERSION(1, 17))
+ x(inode_depth, BCH_VERSION(1, 17)) \
+ x(persistent_inode_cursors, BCH_VERSION(1, 18))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -850,6 +852,7 @@ LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT,
LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48);
LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
struct bch_sb, flags[5], 48, 64);
+LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4);
static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{
@@ -1347,7 +1350,8 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_set)) \
x(logged_ops, 17, 0, \
BIT_ULL(KEY_TYPE_logged_op_truncate)| \
- BIT_ULL(KEY_TYPE_logged_op_finsert)) \
+ BIT_ULL(KEY_TYPE_logged_op_finsert)| \
+ BIT_ULL(KEY_TYPE_inode_alloc_cursor)) \
x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \
BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \
x(subvolume_children, 19, 0, \
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index a4b70e3fe4c3..13d794f201a5 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -588,7 +588,7 @@ struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsi
int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
enum btree_id btree, struct bpos end)
{
- bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_intent);
+ bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent);
struct bkey_s_c k = bch2_btree_iter_peek_prev(iter);
int ret = bkey_err(k);
if (ret)
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index f6245b78eb78..04ec05206f8c 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -799,6 +799,28 @@ void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
}
+int bch2_inode_alloc_cursor_validate(struct bch_fs *c, struct bkey_s_c k,
+ struct bkey_validate_context from)
+{
+ int ret = 0;
+
+ bkey_fsck_err_on(k.k->p.inode != LOGGED_OPS_INUM_inode_cursors,
+ c, inode_alloc_cursor_inode_bad,
+ "k.p.inode bad");
+fsck_err:
+ return ret;
+}
+
+void bch2_inode_alloc_cursor_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_inode_alloc_cursor i = bkey_s_c_to_inode_alloc_cursor(k);
+
+ prt_printf(out, "idx %llu generation %llu",
+ le64_to_cpu(i.v->idx),
+ le64_to_cpu(i.v->gen));
+}
+
void bch2_inode_init_early(struct bch_fs *c,
struct bch_inode_unpacked *inode_u)
{
@@ -859,43 +881,78 @@ static inline u32 bkey_generation(struct bkey_s_c k)
}
}
-/*
- * This just finds an empty slot:
- */
-int bch2_inode_create(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bch_inode_unpacked *inode_u,
- u32 snapshot, u64 cpu)
+static struct bkey_i_inode_alloc_cursor *
+bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max)
{
struct bch_fs *c = trans->c;
- struct bkey_s_c k;
- u64 min, max, start, pos, *hint;
- int ret = 0;
- unsigned bits = (c->opts.inodes_32bit ? 31 : 63);
- if (c->opts.shard_inode_numbers) {
- bits -= c->inode_shard_bits;
+ u64 cursor_idx = c->opts.inodes_32bit ? 0 : cpu + 1;
+
+ cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits);
- min = (cpu << bits);
- max = (cpu << bits) | ~(ULLONG_MAX << bits);
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
+ BTREE_ID_logged_ops,
+ POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx),
+ BTREE_ITER_cached);
+ int ret = bkey_err(k);
+ if (ret)
+ return ERR_PTR(ret);
+
+ struct bkey_i_inode_alloc_cursor *cursor =
+ k.k->type == KEY_TYPE_inode_alloc_cursor
+ ? bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_alloc_cursor)
+ : bch2_bkey_alloc(trans, &iter, 0, inode_alloc_cursor);
+ ret = PTR_ERR_OR_ZERO(cursor);
+ if (ret)
+ goto err;
- min = max_t(u64, min, BLOCKDEV_INODE_MAX);
- hint = c->unused_inode_hints + cpu;
+ if (c->opts.inodes_32bit) {
+ *min = BLOCKDEV_INODE_MAX;
+ *max = INT_MAX;
} else {
- min = BLOCKDEV_INODE_MAX;
- max = ~(ULLONG_MAX << bits);
- hint = c->unused_inode_hints;
+ cursor->v.bits = c->opts.shard_inode_numbers_bits;
+
+ unsigned bits = 63 - c->opts.shard_inode_numbers_bits;
+
+ *min = max(cpu << bits, (u64) INT_MAX + 1);
+ *max = (cpu << bits) | ~(ULLONG_MAX << bits);
}
- start = READ_ONCE(*hint);
+ if (le64_to_cpu(cursor->v.idx) < *min)
+ cursor->v.idx = cpu_to_le64(*min);
+
+ if (le64_to_cpu(cursor->v.idx) >= *max) {
+ cursor->v.idx = cpu_to_le64(*min);
+ le32_add_cpu(&cursor->v.gen, 1);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret ? ERR_PTR(ret) : cursor;
+}
+
+/*
+ * This just finds an empty slot:
+ */
+int bch2_inode_create(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_inode_unpacked *inode_u,
+ u32 snapshot, u64 cpu)
+{
+ u64 min, max;
+ struct bkey_i_inode_alloc_cursor *cursor =
+ bch2_inode_alloc_cursor_get(trans, cpu, &min, &max);
+ int ret = PTR_ERR_OR_ZERO(cursor);
+ if (ret)
+ return ret;
- if (start >= max || start < min)
- start = min;
+ u64 start = le64_to_cpu(cursor->v.idx);
+ u64 pos = start;
- pos = start;
bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
BTREE_ITER_all_snapshots|
BTREE_ITER_intent);
+ struct bkey_s_c k;
again:
while ((k = bch2_btree_iter_peek(iter)).k &&
!(ret = bkey_err(k)) &&
@@ -925,6 +982,7 @@ again:
/* Retry from start */
pos = start = min;
bch2_btree_iter_set_pos(iter, POS(0, pos));
+ le32_add_cpu(&cursor->v.gen, 1);
goto again;
found_slot:
bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
@@ -935,9 +993,9 @@ found_slot:
return ret;
}
- *hint = k.k->p.offset;
inode_u->bi_inum = k.k->p.offset;
- inode_u->bi_generation = bkey_generation(k);
+ inode_u->bi_generation = le64_to_cpu(cursor->v.gen);
+ cursor->v.idx = cpu_to_le64(k.k->p.offset + 1);
return 0;
}
@@ -999,8 +1057,6 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
{
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter = { NULL };
- struct bkey_i_inode_generation delete;
- struct bch_inode_unpacked inode_u;
struct bkey_s_c k;
u32 snapshot;
int ret;
@@ -1040,13 +1096,7 @@ retry:
goto err;
}
- bch2_inode_unpack(k, &inode_u);
-
- bkey_inode_generation_init(&delete.k_i);
- delete.k.p = iter.pos;
- delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
-
- ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
+ ret = bch2_btree_delete_at(trans, &iter, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc);
err:
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index 5bca6950f20e..d2e134528f0e 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -68,6 +68,16 @@ void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bk
.min_val_size = 8, \
})
+int bch2_inode_alloc_cursor_validate(struct bch_fs *, struct bkey_s_c,
+ struct bkey_validate_context);
+void bch2_inode_alloc_cursor_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_inode_alloc_cursor ((struct bkey_ops) { \
+ .key_validate = bch2_inode_alloc_cursor_validate, \
+ .val_to_text = bch2_inode_alloc_cursor_to_text, \
+ .min_val_size = 16, \
+})
+
#if 0
typedef struct {
u64 lo;
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
index be1e747629d2..b99a5bf1a75e 100644
--- a/fs/bcachefs/inode_format.h
+++ b/fs/bcachefs/inode_format.h
@@ -102,7 +102,8 @@ struct bch_inode_generation {
x(bi_subvol, 32) \
x(bi_parent_subvol, 32) \
x(bi_nocow, 8) \
- x(bi_depth, 32)
+ x(bi_depth, 32) \
+ x(bi_inodes_32bit, 8)
/* subset of BCH_INODE_FIELDS */
#define BCH_INODE_OPTS() \
@@ -115,7 +116,8 @@ struct bch_inode_generation {
x(foreground_target, 16) \
x(background_target, 16) \
x(erasure_code, 16) \
- x(nocow, 8)
+ x(nocow, 8) \
+ x(inodes_32bit, 8)
enum inode_opt_id {
#define x(name, ...) \
@@ -165,4 +167,12 @@ LE64_BITMASK(INODEv3_FIELDS_START,
struct bch_inode_v3, bi_flags, 31, 36);
LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);
+struct bch_inode_alloc_cursor {
+ struct bch_val v;
+ __u8 bits;
+ __u8 pad;
+ __le32 gen;
+ __le64 idx;
+};
+
#endif /* _BCACHEFS_INODE_FORMAT_H */
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
index 1ac51af16299..75f27ec26f85 100644
--- a/fs/bcachefs/logged_ops.c
+++ b/fs/bcachefs/logged_ops.c
@@ -65,7 +65,8 @@ int bch2_resume_logged_ops(struct bch_fs *c)
int ret = bch2_trans_run(c,
for_each_btree_key_max(trans, iter,
BTREE_ID_logged_ops,
- POS(LOGGED_OPS_INUM, 0), POS(LOGGED_OPS_INUM, U64_MAX),
+ POS(LOGGED_OPS_INUM_logged_ops, 0),
+ POS(LOGGED_OPS_INUM_logged_ops, U64_MAX),
BTREE_ITER_prefetch, k,
resume_logged_op(trans, &iter, k)));
bch_err_fn(c, ret);
@@ -76,7 +77,7 @@ static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
{
struct btree_iter iter;
int ret = bch2_bkey_get_empty_slot(trans, &iter,
- BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM, U64_MAX));
+ BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM_logged_ops, U64_MAX));
if (ret)
return ret;
diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h
index 0b370a963ac6..cfb67c95d4c8 100644
--- a/fs/bcachefs/logged_ops_format.h
+++ b/fs/bcachefs/logged_ops_format.h
@@ -2,7 +2,10 @@
#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H
#define _BCACHEFS_LOGGED_OPS_FORMAT_H
-#define LOGGED_OPS_INUM 0
+enum logged_ops_inums {
+ LOGGED_OPS_INUM_logged_ops,
+ LOGGED_OPS_INUM_inode_cursors,
+};
struct bch_logged_op_truncate {
struct bch_val v;
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index ea69099e681d..e763d52e0f38 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -222,14 +222,14 @@ enum fsck_err_opts {
BCH_SB_ERASURE_CODE, false, \
NULL, "Enable erasure coding (DO NOT USE YET)") \
x(inodes_32bit, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
BCH_SB_INODE_32BIT, true, \
NULL, "Constrain inode numbers to 32 bits") \
- x(shard_inode_numbers, u8, \
- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
- OPT_BOOL(), \
- BCH_SB_SHARD_INUMS, true, \
+ x(shard_inode_numbers_bits, u8, \
+ OPT_FS|OPT_FORMAT, \
+ OPT_UINT(0, 8), \
+ BCH_SB_SHARD_INUMS_NBITS, 0, \
NULL, "Shard new inode numbers by CPU id") \
x(inodes_use_key_cache, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT, \
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index 806486635075..e26317c367f7 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -211,6 +211,7 @@ enum bch_fsck_flags {
x(bkey_in_missing_snapshot, 190, 0) \
x(inode_pos_inode_nonzero, 191, 0) \
x(inode_pos_blockdev_range, 192, 0) \
+ x(inode_alloc_cursor_inode_bad, 301, 0) \
x(inode_unpack_error, 193, 0) \
x(inode_str_hash_invalid, 194, 0) \
x(inode_v3_fields_start_bad, 195, 0) \
@@ -311,7 +312,7 @@ enum bch_fsck_flags {
x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \
x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \
- x(MAX, 301, 0)
+ x(MAX, 302, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index b0d52b6ccad4..dbc09e305c27 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -460,6 +460,11 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
}
+#ifdef __KERNEL__
+ if (!BCH_SB_SHARD_INUMS_NBITS(sb))
+ SET_BCH_SB_SHARD_INUMS_NBITS(sb, ilog2(roundup_pow_of_two(num_online_cpus())));
+#endif
+
for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
const struct bch_option *opt = bch2_opt_table + opt_id;
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 2b2e0835c8fe..7e97c198efe2 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -586,7 +586,6 @@ static void __bch2_fs_free(struct bch_fs *c)
#endif
kfree(rcu_dereference_protected(c->disk_groups, 1));
kfree(c->journal_seq_blacklist_table);
- kfree(c->unused_inode_hints);
if (c->write_ref_wq)
destroy_workqueue(c->write_ref_wq);
@@ -872,8 +871,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
(btree_blocks(c) + 1) * 2 *
sizeof(struct sort_iter_set);
- c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
-
if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
!(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
@@ -900,9 +897,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
!(c->online_reserved = alloc_percpu(u64)) ||
mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
c->opts.btree_node_size) ||
- mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
- !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
- sizeof(u64), GFP_KERNEL))) {
+ mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) {
ret = -BCH_ERR_ENOMEM_fs_other_alloc;
goto err;
}