summaryrefslogtreecommitdiff
path: root/fs/bcachefs/btree_io.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/bcachefs/btree_io.c')
-rw-r--r--fs/bcachefs/btree_io.c620
1 files changed, 455 insertions, 165 deletions
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index e371e60e3133..34018296053a 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1,6 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "async_objs.h"
+#include "bkey_buf.h"
#include "bkey_methods.h"
#include "bkey_sort.h"
#include "btree_cache.h"
@@ -12,6 +14,7 @@
#include "buckets.h"
#include "checksum.h"
#include "debug.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "extents.h"
#include "io_write.h"
@@ -40,6 +43,7 @@ void bch2_btree_node_io_unlock(struct btree *b)
clear_btree_node_write_in_flight_inner(b);
clear_btree_node_write_in_flight(b);
+ smp_mb__after_atomic();
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}
@@ -512,21 +516,23 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
struct bch_dev *ca,
+ bool print_pos,
struct btree *b, struct bset *i, struct bkey_packed *k,
- unsigned offset, int write)
+ unsigned offset, int rw)
{
- prt_printf(out, bch2_log_msg(c, "%s"),
- write == READ
- ? "error validating btree node "
- : "corrupt btree node before write ");
- if (ca)
- prt_printf(out, "on %s ", ca->name);
- prt_printf(out, "at btree ");
- bch2_btree_pos_to_text(out, c, b);
+ if (print_pos) {
+ prt_str(out, rw == READ
+ ? "error validating btree node "
+ : "corrupt btree node before write ");
+ prt_printf(out, "at btree ");
+ bch2_btree_pos_to_text(out, c, b);
+ prt_newline(out);
+ }
- printbuf_indent_add(out, 2);
+ if (ca)
+ prt_printf(out, "%s ", ca->name);
- prt_printf(out, "\nnode offset %u/%u",
+ prt_printf(out, "node offset %u/%u",
b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)));
if (i)
prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
@@ -537,34 +543,32 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
prt_str(out, ": ");
}
-__printf(10, 11)
+__printf(11, 12)
static int __btree_err(int ret,
struct bch_fs *c,
struct bch_dev *ca,
struct btree *b,
struct bset *i,
struct bkey_packed *k,
- int write,
- bool have_retry,
+ int rw,
enum bch_sb_error_id err_type,
+ struct bch_io_failures *failed,
+ struct printbuf *err_msg,
const char *fmt, ...)
{
- struct printbuf out = PRINTBUF;
- bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes;
- va_list args;
+ if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
+ return -BCH_ERR_fsck_fix;
- btree_err_msg(&out, c, ca, b, i, k, b->written, write);
+ bool have_retry = false;
+ int ret2;
- va_start(args, fmt);
- prt_vprintf(&out, fmt, args);
- va_end(args);
+ if (ca) {
+ bch2_mark_btree_validate_failure(failed, ca->dev_idx);
- if (write == WRITE) {
- bch2_print_string_as_lines(KERN_ERR, out.buf);
- ret = c->opts.errors == BCH_ON_ERROR_continue
- ? 0
- : -BCH_ERR_fsck_errors_not_fixed;
- goto out;
+ struct extent_ptr_decoded pick;
+ have_retry = !bch2_bkey_pick_read_device(c,
+ bkey_i_to_s_c(&b->key),
+ failed, &pick, -1);
}
if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
@@ -572,37 +576,77 @@ static int __btree_err(int ret,
if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
ret = -BCH_ERR_btree_node_read_err_bad_node;
- if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable)
- bch2_sb_error_count(c, err_type);
+ bch2_sb_error_count(c, err_type);
+
+ bool print_deferred = err_msg &&
+ rw == READ &&
+ !(test_bit(BCH_FS_in_fsck, &c->flags) &&
+ c->opts.fix_errors == FSCK_FIX_ask);
+
+ struct printbuf out = PRINTBUF;
+ bch2_log_msg_start(c, &out);
+
+ if (!print_deferred)
+ err_msg = &out;
+
+ btree_err_msg(err_msg, c, ca, !print_deferred, b, i, k, b->written, rw);
+
+ va_list args;
+ va_start(args, fmt);
+ prt_vprintf(err_msg, fmt, args);
+ va_end(args);
+
+ if (print_deferred) {
+ prt_newline(err_msg);
+
+ switch (ret) {
+ case -BCH_ERR_btree_node_read_err_fixable:
+ ret2 = bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type);
+ if (ret2 != -BCH_ERR_fsck_fix &&
+ ret2 != -BCH_ERR_fsck_ignore) {
+ ret = ret2;
+ goto fsck_err;
+ }
+
+ if (!have_retry)
+ ret = -BCH_ERR_fsck_fix;
+ goto out;
+ case -BCH_ERR_btree_node_read_err_bad_node:
+ prt_str(&out, ", ");
+ ret = __bch2_topology_error(c, &out);
+ break;
+ }
+
+ goto out;
+ }
+
+ if (rw == WRITE) {
+ prt_str(&out, ", ");
+ ret = __bch2_inconsistent_error(c, &out)
+ ? -BCH_ERR_fsck_errors_not_fixed
+ : 0;
+ goto print;
+ }
switch (ret) {
case -BCH_ERR_btree_node_read_err_fixable:
- ret = !silent
- ? __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf)
- : -BCH_ERR_fsck_fix;
- if (ret != -BCH_ERR_fsck_fix &&
- ret != -BCH_ERR_fsck_ignore)
+ ret2 = __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf);
+ if (ret2 != -BCH_ERR_fsck_fix &&
+ ret2 != -BCH_ERR_fsck_ignore) {
+ ret = ret2;
goto fsck_err;
- ret = -BCH_ERR_fsck_fix;
- break;
- case -BCH_ERR_btree_node_read_err_want_retry:
- case -BCH_ERR_btree_node_read_err_must_retry:
- if (!silent)
- bch2_print_string_as_lines(KERN_ERR, out.buf);
- break;
+ }
+
+ if (!have_retry)
+ ret = -BCH_ERR_fsck_fix;
+ goto out;
case -BCH_ERR_btree_node_read_err_bad_node:
- if (!silent)
- bch2_print_string_as_lines(KERN_ERR, out.buf);
- ret = bch2_topology_error(c);
- break;
- case -BCH_ERR_btree_node_read_err_incompatible:
- if (!silent)
- bch2_print_string_as_lines(KERN_ERR, out.buf);
- ret = -BCH_ERR_fsck_errors_not_fixed;
+ prt_str(&out, ", ");
+ ret = __bch2_topology_error(c, &out);
break;
- default:
- BUG();
}
+print:
+ bch2_print_str(c, KERN_ERR, out.buf);
out:
fsck_err:
printbuf_exit(&out);
@@ -611,8 +655,9 @@ fsck_err:
#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \
({ \
- int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry, \
+ int _ret = __btree_err(type, c, ca, b, i, k, write, \
BCH_FSCK_ERR_##_err_type, \
+ failed, err_msg, \
msg, ##__VA_ARGS__); \
\
if (_ret != -BCH_ERR_fsck_fix) { \
@@ -620,7 +665,7 @@ fsck_err:
goto fsck_err; \
} \
\
- *saw_error = true; \
+ true; \
})
#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false)
@@ -678,8 +723,9 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
struct btree *b, struct bset *i,
- unsigned offset, unsigned sectors,
- int write, bool have_retry, bool *saw_error)
+ unsigned offset, unsigned sectors, int write,
+ struct bch_io_failures *failed,
+ struct printbuf *err_msg)
{
unsigned version = le16_to_cpu(i->version);
unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
@@ -816,7 +862,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
-BCH_ERR_btree_node_read_err_bad_node,
c, ca, b, i, NULL,
btree_node_bad_format,
- "invalid bkey format: %s\n %s", buf1.buf,
+ "invalid bkey format: %s\n%s", buf1.buf,
(printbuf_reset(&buf2),
bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
printbuf_reset(&buf1);
@@ -892,7 +938,8 @@ static inline int btree_node_read_bkey_cmp(const struct btree *b,
static int validate_bset_keys(struct bch_fs *c, struct btree *b,
struct bset *i, int write,
- bool have_retry, bool *saw_error)
+ struct bch_io_failures *failed,
+ struct printbuf *err_msg)
{
unsigned version = le16_to_cpu(i->version);
struct bkey_packed *k, *prev = NULL;
@@ -996,7 +1043,7 @@ drop_this_key:
}
got_good_key:
le16_add_cpu(&i->u64s, -next_good_key);
- memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k);
+ memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k);
set_btree_node_need_rewrite(b);
}
fsck_err:
@@ -1005,7 +1052,9 @@ fsck_err:
}
int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
- struct btree *b, bool have_retry, bool *saw_error)
+ struct btree *b,
+ struct bch_io_failures *failed,
+ struct printbuf *err_msg)
{
struct btree_node_entry *bne;
struct sort_iter *iter;
@@ -1015,11 +1064,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
bool used_mempool, blacklisted;
bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
- unsigned u64s;
unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
u64 max_journal_seq = 0;
struct printbuf buf = PRINTBUF;
- int ret = 0, retry_read = 0, write = READ;
+ int ret = 0, write = READ;
u64 start_time = local_clock();
b->version_ondisk = U16_MAX;
@@ -1153,15 +1201,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
b->version_ondisk = min(b->version_ondisk,
le16_to_cpu(i->version));
- ret = validate_bset(c, ca, b, i, b->written, sectors,
- READ, have_retry, saw_error);
+ ret = validate_bset(c, ca, b, i, b->written, sectors, READ, failed, err_msg);
if (ret)
goto fsck_err;
if (!b->written)
btree_node_set_format(b, b->data->format);
- ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error);
+ ret = validate_bset_keys(c, b, i, READ, failed, err_msg);
if (ret)
goto fsck_err;
@@ -1186,7 +1233,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
le64_to_cpu(i->journal_seq),
b->written, b->written + sectors, ptr_written);
- b->written += sectors;
+ b->written = min(b->written + sectors, btree_sectors(c));
if (blacklisted && !first)
continue;
@@ -1222,23 +1269,20 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
sorted->keys.u64s = 0;
- set_btree_bset(b, b->set, &b->data->keys);
-
b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0,
btree_buf_bytes(b) -
sizeof(struct btree_node) -
b->nr.live_u64s * sizeof(u64));
- u64s = le16_to_cpu(sorted->keys.u64s);
+ b->data->keys.u64s = sorted->keys.u64s;
*sorted = *b->data;
- sorted->keys.u64s = cpu_to_le16(u64s);
swap(sorted, b->data);
set_btree_bset(b, b->set, &b->data->keys);
b->nsets = 1;
b->data->keys.journal_seq = cpu_to_le64(max_journal_seq);
- BUG_ON(b->nr.live_u64s != u64s);
+ BUG_ON(b->nr.live_u64s != le16_to_cpu(b->data->keys.u64s));
btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);
@@ -1252,7 +1296,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
ret = btree_node_bkey_val_validate(c, b, u.s_c, READ);
if (ret == -BCH_ERR_fsck_delete_bkey ||
- (bch2_inject_invalid_keys &&
+ (static_branch_unlikely(&bch2_inject_invalid_keys) &&
!bversion_cmp(u.k->bversion, MAX_VERSION))) {
btree_keys_account_key_drop(&b->nr, 0, k);
@@ -1292,20 +1336,11 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
if (!ptr_written)
set_btree_node_need_rewrite(b);
-out:
+fsck_err:
mempool_free(iter, &c->fill_iter);
printbuf_exit(&buf);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
- return retry_read;
-fsck_err:
- if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
- ret == -BCH_ERR_btree_node_read_err_must_retry) {
- retry_read = 1;
- } else {
- set_btree_node_read_error(b);
- bch2_btree_lost_data(c, b->c.btree_id);
- }
- goto out;
+ return ret;
}
static void btree_node_read_work(struct work_struct *work)
@@ -1317,17 +1352,28 @@ static void btree_node_read_work(struct work_struct *work)
struct btree *b = rb->b;
struct bio *bio = &rb->bio;
struct bch_io_failures failed = { .nr = 0 };
+ int ret = 0;
+
struct printbuf buf = PRINTBUF;
- bool saw_error = false;
- bool retry = false;
- bool can_retry;
+ bch2_log_msg_start(c, &buf);
+
+ prt_printf(&buf, "btree node read error at btree ");
+ bch2_btree_pos_to_text(&buf, c, b);
+ prt_newline(&buf);
goto start;
while (1) {
- retry = true;
- bch_info(c, "retrying read");
- ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ);
+ ret = bch2_bkey_pick_read_device(c,
+ bkey_i_to_s_c(&b->key),
+ &failed, &rb->pick, -1);
+ if (ret) {
+ set_btree_node_read_error(b);
+ break;
+ }
+
+ ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
rb->have_ioref = ca != NULL;
+ rb->start_time = local_clock();
bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
bio->bi_iter.bi_size = btree_buf_bytes(b);
@@ -1338,60 +1384,66 @@ static void btree_node_read_work(struct work_struct *work)
} else {
bio->bi_status = BLK_STS_REMOVED;
}
+
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
+ rb->start_time, !bio->bi_status);
start:
- printbuf_reset(&buf);
- bch2_btree_pos_to_text(&buf, c, b);
- bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read,
- "btree read error %s for %s",
- bch2_blk_status_to_str(bio->bi_status), buf.buf);
if (rb->have_ioref)
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_read);
rb->have_ioref = false;
- bch2_mark_io_failure(&failed, &rb->pick);
-
- can_retry = bch2_bkey_pick_read_device(c,
- bkey_i_to_s_c(&b->key),
- &failed, &rb->pick) > 0;
-
- if (!bio->bi_status &&
- !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) {
- if (retry)
- bch_info(c, "retry success");
- break;
+ if (bio->bi_status) {
+ bch2_mark_io_failure(&failed, &rb->pick, false);
+ continue;
}
- saw_error = true;
+ ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf);
+ if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
+ ret == -BCH_ERR_btree_node_read_err_must_retry)
+ continue;
- if (!can_retry) {
+ if (ret)
set_btree_node_read_error(b);
- bch2_btree_lost_data(c, b->c.btree_id);
- break;
- }
+
+ break;
}
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
- rb->start_time);
- bio_put(&rb->bio);
+ bch2_io_failures_to_text(&buf, c, &failed);
- if ((saw_error ||
+ if (btree_node_read_error(b))
+ bch2_btree_lost_data(c, &buf, b->c.btree_id);
+
+ /*
+ * only print retry success if we read from a replica with no errors
+ */
+ if (btree_node_read_error(b))
+ prt_printf(&buf, "ret %s", bch2_err_str(ret));
+ else if (failed.nr) {
+ if (!bch2_dev_io_failures(&failed, rb->pick.ptr.dev))
+ prt_printf(&buf, "retry success");
+ else
+ prt_printf(&buf, "repair success");
+ }
+
+ if ((failed.nr ||
btree_node_need_rewrite(b)) &&
!btree_node_read_error(b) &&
- c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
- if (saw_error) {
- printbuf_reset(&buf);
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_str(&buf, " ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s",
- __func__, buf.buf);
- }
-
+ c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
+ prt_printf(&buf, " (rewriting node)");
bch2_btree_node_rewrite_async(c, b);
}
+ prt_newline(&buf);
+ if (failed.nr)
+ bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
+
+ async_object_list_del(c, btree_read_bio, rb->list_idx);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
+ rb->start_time);
+ bio_put(&rb->bio);
printbuf_exit(&buf);
clear_btree_node_read_in_flight(b);
+ smp_mb__after_atomic();
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
}
@@ -1400,16 +1452,20 @@ static void btree_node_read_endio(struct bio *bio)
struct btree_read_bio *rb =
container_of(bio, struct btree_read_bio, bio);
struct bch_fs *c = rb->c;
+ struct bch_dev *ca = rb->have_ioref
+ ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
- if (rb->have_ioref) {
- struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
-
- bch2_latency_acct(ca, rb->start_time, READ);
- }
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
+ rb->start_time, !bio->bi_status);
queue_work(c->btree_read_complete_wq, &rb->work);
}
+void bch2_btree_read_bio_to_text(struct printbuf *out, struct btree_read_bio *rbio)
+{
+ bch2_bio_to_text(out, &rbio->bio);
+}
+
struct btree_node_read_all {
struct closure cl;
struct bch_fs *c;
@@ -1469,12 +1525,13 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
struct btree *b = ra->b;
struct printbuf buf = PRINTBUF;
bool dump_bset_maps = false;
- bool have_retry = false;
int ret = 0, best = -1, write = READ;
unsigned i, written = 0, written2 = 0;
__le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
bool _saw_error = false, *saw_error = &_saw_error;
+ struct printbuf *err_msg = NULL;
+ struct bch_io_failures *failed = NULL;
for (i = 0; i < ra->nr; i++) {
struct btree_node *bn = ra->buf[i];
@@ -1567,14 +1624,19 @@ fsck_err:
if (best >= 0) {
memcpy(b->data, ra->buf[best], btree_buf_bytes(b));
- ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
+ ret = bch2_btree_node_read_done(c, NULL, b, NULL, NULL);
} else {
ret = -1;
}
if (ret) {
set_btree_node_read_error(b);
- bch2_btree_lost_data(c, b->c.btree_id);
+
+ struct printbuf buf = PRINTBUF;
+ bch2_btree_lost_data(c, &buf, b->c.btree_id);
+ if (buf.pos)
+ bch_err(c, "%s", buf.buf);
+ printbuf_exit(&buf);
} else if (*saw_error)
bch2_btree_node_rewrite_async(c, b);
@@ -1588,6 +1650,7 @@ fsck_err:
printbuf_exit(&buf);
clear_btree_node_read_in_flight(b);
+ smp_mb__after_atomic();
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
}
@@ -1602,6 +1665,8 @@ static void btree_node_read_all_replicas_endio(struct bio *bio)
struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
bch2_latency_acct(ca, rb->start_time, READ);
+ enumerated_ref_put(&ca->io_ref[READ],
+ BCH_DEV_READ_REF_btree_node_read_all_replicas);
}
ra->err[rb->idx] = bio->bi_status;
@@ -1641,7 +1706,8 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
+ BCH_DEV_READ_REF_btree_node_read_all_replicas);
struct btree_read_bio *rb =
container_of(ra->bio[i], struct btree_read_bio, bio);
rb->c = c;
@@ -1692,33 +1758,42 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
trace_and_count(c, btree_node_read, trans, b);
- if (bch2_verify_all_btree_replicas &&
+ if (static_branch_unlikely(&bch2_verify_all_btree_replicas) &&
!btree_node_read_all_replicas(c, b, sync))
return;
ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
- NULL, &pick);
+ NULL, &pick, -1);
if (ret <= 0) {
+ bool ratelimit = true;
struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
prt_str(&buf, "btree node read error: no device to read from\n at ");
bch2_btree_pos_to_text(&buf, c, b);
- bch_err_ratelimited(c, "%s", buf.buf);
-
- if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
- bch2_fatal_error(c);
+ prt_newline(&buf);
+ bch2_btree_lost_data(c, &buf, b->c.btree_id);
+
+ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
+ bch2_fs_emergency_read_only2(c, &buf))
+ ratelimit = false;
+
+ static DEFINE_RATELIMIT_STATE(rs,
+ DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+ if (!ratelimit || __ratelimit(&rs))
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
set_btree_node_read_error(b);
- bch2_btree_lost_data(c, b->c.btree_id);
clear_btree_node_read_in_flight(b);
+ smp_mb__after_atomic();
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
- printbuf_exit(&buf);
return;
}
- ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
bio = bio_alloc_bioset(NULL,
buf_pages(b->data, btree_buf_bytes(b)),
@@ -1737,6 +1812,8 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
bio->bi_end_io = btree_node_read_endio;
bch2_bio_map(bio, b->data, btree_buf_bytes(b));
+ async_object_list_add(c, btree_read_bio, rb, &rb->list_idx);
+
if (rb->have_ioref) {
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
bio_sectors(bio));
@@ -1811,6 +1888,192 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
}
+struct btree_node_scrub {
+ struct bch_fs *c;
+ struct bch_dev *ca;
+ void *buf;
+ bool used_mempool;
+ unsigned written;
+
+ enum btree_id btree;
+ unsigned level;
+ struct bkey_buf key;
+ __le64 seq;
+
+ struct work_struct work;
+ struct bio bio;
+};
+
+static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written,
+ struct printbuf *err)
+{
+ unsigned written = 0;
+
+ if (le64_to_cpu(data->magic) != bset_magic(c)) {
+ prt_printf(err, "bad magic: want %llx, got %llx",
+ bset_magic(c), le64_to_cpu(data->magic));
+ return false;
+ }
+
+ while (written < (ptr_written ?: btree_sectors(c))) {
+ struct btree_node_entry *bne;
+ struct bset *i;
+ bool first = !written;
+
+ if (first) {
+ bne = NULL;
+ i = &data->keys;
+ } else {
+ bne = (void *) data + (written << 9);
+ i = &bne->keys;
+
+ if (!ptr_written && i->seq != data->keys.seq)
+ break;
+ }
+
+ struct nonce nonce = btree_nonce(i, written << 9);
+ bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i));
+
+ if (first) {
+ if (good_csum_type) {
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data);
+ if (bch2_crc_cmp(data->csum, csum)) {
+ bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum);
+ return false;
+ }
+ }
+
+ written += vstruct_sectors(data, c->block_bits);
+ } else {
+ if (good_csum_type) {
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+ if (bch2_crc_cmp(bne->csum, csum)) {
+ bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum);
+ return false;
+ }
+ }
+
+ written += vstruct_sectors(bne, c->block_bits);
+ }
+ }
+
+ return true;
+}
+
+static void btree_node_scrub_work(struct work_struct *work)
+{
+ struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work);
+ struct bch_fs *c = scrub->c;
+ struct printbuf err = PRINTBUF;
+
+ __bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level,
+ bkey_i_to_s_c(scrub->key.k));
+ prt_newline(&err);
+
+ if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) {
+ struct btree_trans *trans = bch2_trans_get(c);
+
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, scrub->btree,
+ scrub->key.k->k.p, 0, scrub->level - 1, 0);
+
+ struct btree *b;
+ int ret = lockrestart_do(trans,
+ PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(trans, &iter)));
+ if (ret)
+ goto err;
+
+ if (bkey_i_to_btree_ptr_v2(&b->key)->v.seq == scrub->seq) {
+ bch_err(c, "error validating btree node during scrub on %s at btree %s",
+ scrub->ca->name, err.buf);
+
+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ bch2_trans_begin(trans);
+ bch2_trans_put(trans);
+ }
+
+ printbuf_exit(&err);
+ bch2_bkey_buf_exit(&scrub->key, c);;
+ btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf);
+ enumerated_ref_put(&scrub->ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
+ kfree(scrub);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
+}
+
+static void btree_node_scrub_endio(struct bio *bio)
+{
+ struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio);
+
+ queue_work(scrub->c->btree_read_complete_wq, &scrub->work);
+}
+
+int bch2_btree_node_scrub(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c k, unsigned dev)
+{
+ if (k.k->type != KEY_TYPE_btree_ptr_v2)
+ return 0;
+
+ struct bch_fs *c = trans->c;
+
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_node_scrub))
+ return -BCH_ERR_erofs_no_writes;
+
+ struct extent_ptr_decoded pick;
+ int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev);
+ if (ret <= 0)
+ goto err;
+
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
+ BCH_DEV_READ_REF_btree_node_scrub);
+ if (!ca) {
+ ret = -BCH_ERR_device_offline;
+ goto err;
+ }
+
+ bool used_mempool = false;
+ void *buf = btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool);
+
+ unsigned vecs = buf_pages(buf, c->opts.btree_node_size);
+
+ struct btree_node_scrub *scrub =
+ kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL);
+ if (!scrub) {
+ ret = -ENOMEM;
+ goto err_free;
+ }
+
+ scrub->c = c;
+ scrub->ca = ca;
+ scrub->buf = buf;
+ scrub->used_mempool = used_mempool;
+ scrub->written = btree_ptr_sectors_written(k);
+
+ scrub->btree = btree;
+ scrub->level = level;
+ bch2_bkey_buf_init(&scrub->key);
+ bch2_bkey_buf_reassemble(&scrub->key, c, k);
+ scrub->seq = bkey_s_c_to_btree_ptr_v2(k).v->seq;
+
+ INIT_WORK(&scrub->work, btree_node_scrub_work);
+
+ bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ);
+ bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size);
+ scrub->bio.bi_iter.bi_sector = pick.ptr.offset;
+ scrub->bio.bi_end_io = btree_node_scrub_endio;
+ submit_bio(&scrub->bio);
+ return 0;
+err_free:
+ btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
+err:
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
+ return ret;
+}
+
static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
struct btree_write *w)
{
@@ -1831,7 +2094,7 @@ static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
bch2_journal_pin_drop(&c->journal, &w->journal);
}
-static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
+static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time)
{
struct btree_write *w = btree_prev_write(b);
unsigned long old, new;
@@ -1839,6 +2102,9 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
bch2_btree_complete_write(c, b, w);
+ if (start_time)
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_write], start_time);
+
old = READ_ONCE(b->flags);
do {
new = old;
@@ -1865,11 +2131,13 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
if (new & (1U << BTREE_NODE_write_in_flight))
__bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type);
- else
+ else {
+ smp_mb__after_atomic();
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
+ }
}
-static void btree_node_write_done(struct bch_fs *c, struct btree *b)
+static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time)
{
struct btree_trans *trans = bch2_trans_get(c);
@@ -1877,7 +2145,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
/* we don't need transaction context anymore after we got the lock. */
bch2_trans_put(trans);
- __btree_node_write_done(c, b);
+ __btree_node_write_done(c, b, start_time);
six_unlock_read(&b->c.lock);
}
@@ -1887,6 +2155,7 @@ static void btree_node_write_work(struct work_struct *work)
container_of(work, struct btree_write_bio, work);
struct bch_fs *c = wbio->wbio.c;
struct btree *b = wbio->wbio.bio.bi_private;
+ u64 start_time = wbio->start_time;
int ret = 0;
btree_bounce_free(c,
@@ -1918,13 +2187,20 @@ static void btree_node_write_work(struct work_struct *work)
goto err;
}
out:
+ async_object_list_del(c, btree_write_bio, wbio->list_idx);
bio_put(&wbio->wbio.bio);
- btree_node_write_done(c, b);
+ btree_node_write_done(c, b, start_time);
return;
err:
set_btree_node_noevict(b);
- bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
- "writing btree node: %s", bch2_err_str(ret));
+
+ if (!bch2_err_matches(ret, EROFS)) {
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "writing btree node: %s\n ", bch2_err_str(ret));
+ bch2_btree_pos_to_text(&buf, c, b);
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
goto out;
}
@@ -1937,23 +2213,34 @@ static void btree_node_write_endio(struct bio *bio)
struct bch_fs *c = wbio->c;
struct btree *b = wbio->bio.bi_private;
struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL;
- unsigned long flags;
- if (wbio->have_ioref)
- bch2_latency_acct(ca, wbio->submit_time, WRITE);
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
+ wbio->submit_time, !bio->bi_status);
+
+ if (ca && bio->bi_status) {
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+ prt_printf(&buf, "btree write error: %s\n ",
+ bch2_blk_status_to_str(bio->bi_status));
+ bch2_btree_pos_to_text(&buf, c, b);
+ bch_err_dev_ratelimited(ca, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
- if (!ca ||
- bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
- "btree write error: %s",
- bch2_blk_status_to_str(bio->bi_status)) ||
- bch2_meta_write_fault("btree")) {
+ if (bio->bi_status) {
+ unsigned long flags;
spin_lock_irqsave(&c->btree_write_error_lock, flags);
bch2_dev_list_add_dev(&orig->failed, wbio->dev);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
}
+ /*
+ * XXX: we should be using io_ref[WRITE], but we aren't retrying failed
+ * btree writes yet (due to device removal/ro):
+ */
if (wbio->have_ioref)
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[READ],
+ BCH_DEV_READ_REF_btree_node_write);
if (parent) {
bio_put(bio);
@@ -1962,16 +2249,15 @@ static void btree_node_write_endio(struct bio *bio)
}
clear_btree_node_write_in_flight_inner(b);
+ smp_mb__after_atomic();
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
INIT_WORK(&wb->work, btree_node_write_work);
- queue_work(c->btree_io_complete_wq, &wb->work);
+ queue_work(c->btree_write_complete_wq, &wb->work);
}
static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
struct bset *i, unsigned sectors)
{
- bool saw_error;
-
int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key),
(struct bkey_validate_context) {
.from = BKEY_VALIDATE_btree_node,
@@ -1984,8 +2270,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
return ret;
}
- ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?:
- validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error);
+ ret = validate_bset_keys(c, b, i, WRITE, NULL, NULL) ?:
+ validate_bset(c, NULL, b, i, b->written, sectors, WRITE, NULL, NULL);
if (ret) {
bch2_inconsistent_error(c);
dump_stack();
@@ -2023,6 +2309,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
bool validate_before_checksum = false;
enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
void *data;
+ u64 start_time = local_clock();
int ret;
if (flags & BTREE_WRITE_ALREADY_STARTED)
@@ -2231,6 +2518,7 @@ do_write:
wbio->data = data;
wbio->data_bytes = bytes;
wbio->sector_offset = b->written;
+ wbio->start_time = start_time;
wbio->wbio.c = c;
wbio->wbio.used_mempool = used_mempool;
wbio->wbio.first_btree_write = !b->written;
@@ -2250,6 +2538,8 @@ do_write:
atomic64_inc(&c->btree_write_stats[type].nr);
atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);
+ async_object_list_add(c, btree_write_bio, wbio, &wbio->list_idx);
+
INIT_WORK(&wbio->work, btree_write_submit);
queue_work(c->btree_write_submit_wq, &wbio->work);
return;
@@ -2258,7 +2548,7 @@ err:
b->written += sectors_to_write;
nowrite:
btree_bounce_free(c, bytes, used_mempool, data);
- __btree_node_write_done(c, b);
+ __btree_node_write_done(c, b, 0);
}
/*