summaryrefslogtreecommitdiff
path: root/fs/bcachefs/ec.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/bcachefs/ec.c')
-rw-r--r--fs/bcachefs/ec.c754
1 files changed, 326 insertions, 428 deletions
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index d2a5e76e6479..c581426e3894 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -16,10 +16,12 @@
#include "disk_accounting.h"
#include "disk_groups.h"
#include "ec.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "io_read.h"
#include "io_write.h"
#include "keylist.h"
+#include "lru.h"
#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
@@ -104,6 +106,8 @@ struct ec_bio {
struct bch_dev *ca;
struct ec_stripe_buf *buf;
size_t idx;
+ int rw;
+ u64 submit_time;
struct bio bio;
};
@@ -298,15 +302,27 @@ static int mark_stripe_bucket(struct btree_trans *trans,
struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
if (flags & BTREE_TRIGGER_transactional) {
+ struct extent_ptr_decoded p = {
+ .ptr = *ptr,
+ .crc = bch2_extent_crc_unpack(s.k, NULL),
+ };
+ struct bkey_i_backpointer bp;
+ bch2_extent_ptr_to_bp(c, BTREE_ID_stripes, 0, s.s_c, p,
+ (const union bch_extent_entry *) ptr, &bp);
+
struct bkey_i_alloc_v4 *a =
bch2_trans_start_alloc_update(trans, bucket, 0);
- ret = PTR_ERR_OR_ZERO(a) ?:
- __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags);
+ ret = PTR_ERR_OR_ZERO(a) ?:
+ __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags) ?:
+ bch2_bucket_backpointer_mod(trans, s.s_c, &bp,
+ !(flags & BTREE_TRIGGER_overwrite));
+ if (ret)
+ goto err;
}
if (flags & BTREE_TRIGGER_gc) {
struct bucket *g = gc_bucket(ca, bucket.offset);
- if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
+ if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n%s",
ptr->dev,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
ret = -BCH_ERR_mark_stripe;
@@ -366,19 +382,6 @@ static int mark_stripe_buckets(struct btree_trans *trans,
return 0;
}
-static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s)
-{
- m->sectors = le16_to_cpu(s->sectors);
- m->algorithm = s->algorithm;
- m->nr_blocks = s->nr_blocks;
- m->nr_redundant = s->nr_redundant;
- m->disk_label = s->disk_label;
- m->blocks_nonempty = 0;
-
- for (unsigned i = 0; i < s->nr_blocks; i++)
- m->blocks_nonempty += !!stripe_blockcount_get(s, i);
-}
-
int bch2_trigger_stripe(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s _new,
@@ -399,6 +402,15 @@ int bch2_trigger_stripe(struct btree_trans *trans,
(new_s->nr_blocks != old_s->nr_blocks ||
new_s->nr_redundant != old_s->nr_redundant));
+ if (flags & BTREE_TRIGGER_transactional) {
+ int ret = bch2_lru_change(trans,
+ BCH_LRU_STRIPE_FRAGMENTATION,
+ idx,
+ stripe_lru_pos(old_s),
+ stripe_lru_pos(new_s));
+ if (ret)
+ return ret;
+ }
if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
/*
@@ -443,24 +455,25 @@ int bch2_trigger_stripe(struct btree_trans *trans,
if (new_s) {
s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant;
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_replicas,
- };
+ struct disk_accounting_pos acc;
+ memset(&acc, 0, sizeof(acc));
+ acc.type = BCH_DISK_ACCOUNTING_replicas;
bch2_bkey_to_replicas(&acc.replicas, new);
int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
if (ret)
return ret;
if (gc)
- memcpy(&gc->r.e, &acc.replicas, replicas_entry_bytes(&acc.replicas));
+ unsafe_memcpy(&gc->r.e, &acc.replicas,
+ replicas_entry_bytes(&acc.replicas), "VLA");
}
if (old_s) {
s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant;
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_replicas,
- };
+ struct disk_accounting_pos acc;
+ memset(&acc, 0, sizeof(acc));
+ acc.type = BCH_DISK_ACCOUNTING_replicas;
bch2_bkey_to_replicas(&acc.replicas, old);
int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
if (ret)
@@ -472,38 +485,6 @@ int bch2_trigger_stripe(struct btree_trans *trans,
return ret;
}
- if (flags & BTREE_TRIGGER_atomic) {
- struct stripe *m = genradix_ptr(&c->stripes, idx);
-
- if (!m) {
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf1, c, old);
- bch2_bkey_val_to_text(&buf2, c, new);
- bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
- "old %s\n"
- "new %s", idx, buf1.buf, buf2.buf);
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
- bch2_inconsistent_error(c);
- return -1;
- }
-
- if (!new_s) {
- bch2_stripes_heap_del(c, m, idx);
-
- memset(m, 0, sizeof(*m));
- } else {
- stripe_to_mem(m, new_s);
-
- if (!old_s)
- bch2_stripes_heap_insert(c, m, idx);
- else
- bch2_stripes_heap_update(c, m, idx);
- }
- }
-
return 0;
}
@@ -527,20 +508,14 @@ static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
{
- switch (k.k->type) {
- case KEY_TYPE_extent: {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const union bch_extent_entry *entry;
-
- extent_for_each_entry(e, entry)
- if (extent_entry_type(entry) ==
- BCH_EXTENT_ENTRY_stripe_ptr &&
- entry->stripe_ptr.idx == idx)
- return true;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
- break;
- }
- }
+ bkey_extent_entry_for_each(ptrs, entry)
+ if (extent_entry_type(entry) ==
+ BCH_EXTENT_ENTRY_stripe_ptr &&
+ entry->stripe_ptr.idx == idx)
+ return true;
return false;
}
@@ -725,15 +700,20 @@ static void ec_block_endio(struct bio *bio)
struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
struct bch_dev *ca = ec_bio->ca;
struct closure *cl = bio->bi_private;
+ int rw = ec_bio->rw;
+ unsigned ref = rw == READ
+ ? BCH_DEV_READ_REF_ec_block
+ : BCH_DEV_WRITE_REF_ec_block;
- if (bch2_dev_io_err_on(bio->bi_status, ca,
- bio_data_dir(bio)
- ? BCH_MEMBER_ERROR_write
- : BCH_MEMBER_ERROR_read,
- "erasure coding %s error: %s",
+ bch2_account_io_completion(ca, bio_data_dir(bio),
+ ec_bio->submit_time, !bio->bi_status);
+
+ if (bio->bi_status) {
+ bch_err_dev_ratelimited(ca, "erasure coding %s error: %s",
str_write_read(bio_data_dir(bio)),
- bch2_blk_status_to_str(bio->bi_status)))
+ bch2_blk_status_to_str(bio->bi_status));
clear_bit(ec_bio->idx, ec_bio->buf->valid);
+ }
int stale = dev_ptr_stale(ca, ptr);
if (stale) {
@@ -745,7 +725,7 @@ static void ec_block_endio(struct bio *bio)
}
bio_put(&ec_bio->bio);
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[rw], ref);
closure_put(cl);
}
@@ -759,8 +739,11 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
? BCH_DATA_user
: BCH_DATA_parity;
int rw = op_is_write(opf);
+ unsigned ref = rw == READ
+ ? BCH_DEV_READ_REF_ec_block
+ : BCH_DEV_WRITE_REF_ec_block;
- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw, ref);
if (!ca) {
clear_bit(idx, buf->valid);
return;
@@ -796,6 +779,8 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
ec_bio->ca = ca;
ec_bio->buf = buf;
ec_bio->idx = idx;
+ ec_bio->rw = rw;
+ ec_bio->submit_time = local_clock();
ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9);
ec_bio->bio.bi_end_io = ec_block_endio;
@@ -804,14 +789,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);
closure_get(cl);
- percpu_ref_get(&ca->io_ref);
+ enumerated_ref_get(&ca->io_ref[rw], ref);
submit_bio(&ec_bio->bio);
offset += b;
}
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[rw], ref);
}
static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
@@ -917,26 +902,6 @@ err:
static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
{
- ec_stripes_heap n, *h = &c->ec_stripes_heap;
-
- if (idx >= h->size) {
- if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
- return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
-
- mutex_lock(&c->ec_stripes_heap_lock);
- if (n.size > h->size) {
- memcpy(n.data, h->data, h->nr * sizeof(h->data[0]));
- n.nr = h->nr;
- swap(*h, n);
- }
- mutex_unlock(&c->ec_stripes_heap_lock);
-
- free_heap(&n);
- }
-
- if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
- return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
-
if (c->gc_pos.phase != GC_PHASE_not_running &&
!genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
@@ -1009,188 +974,58 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
s->idx = 0;
}
-/* Heap of all existing stripes, ordered by blocks_nonempty */
-
-static u64 stripe_idx_to_delete(struct bch_fs *c)
-{
- ec_stripes_heap *h = &c->ec_stripes_heap;
-
- lockdep_assert_held(&c->ec_stripes_heap_lock);
-
- if (h->nr &&
- h->data[0].blocks_nonempty == 0 &&
- !bch2_stripe_is_open(c, h->data[0].idx))
- return h->data[0].idx;
-
- return 0;
-}
-
-static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
- size_t i)
-{
- struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
-
- genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
-}
-
-static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args)
-{
- struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
- struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
-
- return ((_l->blocks_nonempty > _r->blocks_nonempty) <
- (_l->blocks_nonempty < _r->blocks_nonempty));
-}
-
-static inline void ec_stripes_heap_swap(void *l, void *r, void *h)
-{
- struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
- struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
- ec_stripes_heap *_h = (ec_stripes_heap *)h;
- size_t i = _l - _h->data;
- size_t j = _r - _h->data;
-
- swap(*_l, *_r);
-
- ec_stripes_heap_set_backpointer(_h, i);
- ec_stripes_heap_set_backpointer(_h, j);
-}
-
-static const struct min_heap_callbacks callbacks = {
- .less = ec_stripes_heap_cmp,
- .swp = ec_stripes_heap_swap,
-};
-
-static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
-{
- ec_stripes_heap *h = &c->ec_stripes_heap;
- struct stripe *m = genradix_ptr(&c->stripes, idx);
-
- BUG_ON(m->heap_idx >= h->nr);
- BUG_ON(h->data[m->heap_idx].idx != idx);
-}
-
-void bch2_stripes_heap_del(struct bch_fs *c,
- struct stripe *m, size_t idx)
-{
- mutex_lock(&c->ec_stripes_heap_lock);
- heap_verify_backpointer(c, idx);
-
- min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap);
- mutex_unlock(&c->ec_stripes_heap_lock);
-}
-
-void bch2_stripes_heap_insert(struct bch_fs *c,
- struct stripe *m, size_t idx)
-{
- mutex_lock(&c->ec_stripes_heap_lock);
- BUG_ON(min_heap_full(&c->ec_stripes_heap));
-
- genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr;
- min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) {
- .idx = idx,
- .blocks_nonempty = m->blocks_nonempty,
- }),
- &callbacks,
- &c->ec_stripes_heap);
-
- heap_verify_backpointer(c, idx);
- mutex_unlock(&c->ec_stripes_heap_lock);
-}
-
-void bch2_stripes_heap_update(struct bch_fs *c,
- struct stripe *m, size_t idx)
-{
- ec_stripes_heap *h = &c->ec_stripes_heap;
- bool do_deletes;
- size_t i;
-
- mutex_lock(&c->ec_stripes_heap_lock);
- heap_verify_backpointer(c, idx);
-
- h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
-
- i = m->heap_idx;
- min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap);
- min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap);
-
- heap_verify_backpointer(c, idx);
-
- do_deletes = stripe_idx_to_delete(c) != 0;
- mutex_unlock(&c->ec_stripes_heap_lock);
-
- if (do_deletes)
- bch2_do_stripe_deletes(c);
-}
-
/* stripe deletion */
static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
{
- struct bch_fs *c = trans->c;
struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_stripe s;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
- BTREE_ITER_intent);
- ret = bkey_err(k);
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
+ BTREE_ID_stripes, POS(0, idx),
+ BTREE_ITER_intent);
+ int ret = bkey_err(k);
if (ret)
goto err;
- if (k.k->type != KEY_TYPE_stripe) {
- bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
- ret = -EINVAL;
- goto err;
- }
-
- s = bkey_s_c_to_stripe(k);
- for (unsigned i = 0; i < s.v->nr_blocks; i++)
- if (stripe_blockcount_get(s.v, i)) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
- printbuf_exit(&buf);
- ret = -EINVAL;
- goto err;
- }
-
- ret = bch2_btree_delete_at(trans, &iter, 0);
+ /*
+ * We expect write buffer races here
+ * Important: check stripe_is_open with stripe key locked:
+ */
+ if (k.k->type == KEY_TYPE_stripe &&
+ !bch2_stripe_is_open(trans->c, idx) &&
+ stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1)
+ ret = bch2_btree_delete_at(trans, &iter, 0);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
+/*
+ * XXX
+ * can we kill this and delete stripes from the trigger?
+ */
static void ec_stripe_delete_work(struct work_struct *work)
{
struct bch_fs *c =
container_of(work, struct bch_fs, ec_stripe_delete_work);
- while (1) {
- mutex_lock(&c->ec_stripes_heap_lock);
- u64 idx = stripe_idx_to_delete(c);
- mutex_unlock(&c->ec_stripes_heap_lock);
-
- if (!idx)
- break;
-
- int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- ec_stripe_delete(trans, idx));
- bch_err_fn(c, ret);
- if (ret)
- break;
- }
-
- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
+ bch2_trans_run(c,
+ bch2_btree_write_buffer_tryflush(trans) ?:
+ for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru,
+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0),
+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX),
+ 0, lru_k,
+ NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc, ({
+ ec_stripe_delete(trans, lru_k.k->p.offset);
+ })));
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete);
}
void bch2_do_stripe_deletes(struct bch_fs *c)
{
- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
+ if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_stripe_delete) &&
!queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete);
}
/* stripe creation: */
@@ -1294,7 +1129,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
bch2_fs_inconsistent(c, "%s", buf.buf);
printbuf_exit(&buf);
- return -EIO;
+ return -BCH_ERR_erasure_coding_found_btree_node;
}
k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed);
@@ -1360,7 +1195,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
if (!ca)
- return -EIO;
+ return -BCH_ERR_ENOENT_dev_not_found;
struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
@@ -1380,8 +1215,12 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
if (bp_k.k->type != KEY_TYPE_backpointer)
continue;
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
+ if (bp.v->btree_id == BTREE_ID_stripes)
+ continue;
+
ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s,
- bkey_s_c_to_backpointer(bp_k), &last_flushed);
+ bp, &last_flushed);
}));
bch2_bkey_buf_exit(&last_flushed, c);
@@ -1393,21 +1232,19 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
{
struct btree_trans *trans = bch2_trans_get(c);
struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
- unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
- int ret = 0;
+ unsigned nr_data = v->nr_blocks - v->nr_redundant;
- ret = bch2_btree_write_buffer_flush_sync(trans);
+ int ret = bch2_btree_write_buffer_flush_sync(trans);
if (ret)
goto err;
- for (i = 0; i < nr_data; i++) {
+ for (unsigned i = 0; i < nr_data; i++) {
ret = ec_stripe_update_bucket(trans, s, i);
if (ret)
break;
}
err:
bch2_trans_put(trans);
-
return ret;
}
@@ -1416,7 +1253,8 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
unsigned block,
struct open_bucket *ob)
{
- struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE,
+ BCH_DEV_WRITE_REF_ec_bucket_zero);
if (!ca) {
s->err = -BCH_ERR_erofs_no_writes;
return;
@@ -1432,7 +1270,7 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
ob->sectors_free,
GFP_KERNEL, 0);
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_ec_bucket_zero);
if (ret)
s->err = ret;
@@ -1473,6 +1311,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (s->err) {
if (!bch2_err_matches(s->err, EROFS))
bch_err(c, "error creating stripe: error writing data buckets");
+ ret = s->err;
goto err;
}
@@ -1481,6 +1320,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (ec_do_recov(c, &s->existing_stripe)) {
bch_err(c, "error creating stripe: error reading existing stripe");
+ ret = -BCH_ERR_ec_block_read;
goto err;
}
@@ -1506,6 +1346,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (ec_nr_failed(&s->new_stripe)) {
bch_err(c, "error creating stripe: error writing redundancy buckets");
+ ret = -BCH_ERR_ec_block_write;
goto err;
}
@@ -1527,6 +1368,8 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (ret)
goto err;
err:
+ trace_stripe_create(c, s->idx, ret);
+
bch2_disk_reservation_put(c, &s->res);
for (i = 0; i < v->nr_blocks; i++)
@@ -1577,15 +1420,15 @@ static void ec_stripe_create_work(struct work_struct *work)
while ((s = get_pending_stripe(c)))
ec_stripe_create(s);
- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create);
}
void bch2_ec_do_stripe_creates(struct bch_fs *c)
{
- bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create);
+ enumerated_ref_get(&c->writes, BCH_WRITE_REF_stripe_create);
if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create);
}
static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
@@ -1612,11 +1455,11 @@ static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int
ec_stripe_new_set_pending(c, h);
}
-void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
+void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob, int err)
{
struct ec_stripe_new *s = ob->ec;
- s->err = -EIO;
+ s->err = err;
}
void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
@@ -1875,23 +1718,32 @@ err:
}
static int new_stripe_alloc_buckets(struct btree_trans *trans,
+ struct alloc_request *req,
struct ec_stripe_head *h, struct ec_stripe_new *s,
- enum bch_watermark watermark, struct closure *cl)
+ struct closure *cl)
{
struct bch_fs *c = trans->c;
- struct bch_devs_mask devs = h->devs;
struct open_bucket *ob;
- struct open_buckets buckets;
struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
- bool have_cache = true;
int ret = 0;
+ req->scratch_data_type = req->data_type;
+ req->scratch_ptrs = req->ptrs;
+ req->scratch_nr_replicas = req->nr_replicas;
+ req->scratch_nr_effective = req->nr_effective;
+ req->scratch_have_cache = req->have_cache;
+ req->scratch_devs_may_alloc = req->devs_may_alloc;
+
+ req->devs_may_alloc = h->devs;
+ req->have_cache = true;
+
BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity);
BUG_ON(v->nr_redundant != s->nr_parity);
/* * We bypass the sector allocator which normally does this: */
- bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
+ bitmap_and(req->devs_may_alloc.d, req->devs_may_alloc.d,
+ c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) {
/*
@@ -1901,7 +1753,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans,
* block when updating the stripe
*/
if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID)
- __clear_bit(v->ptrs[i].dev, devs.d);
+ __clear_bit(v->ptrs[i].dev, req->devs_may_alloc.d);
if (i < s->nr_data)
nr_have_data++;
@@ -1912,95 +1764,94 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans,
BUG_ON(nr_have_data > s->nr_data);
BUG_ON(nr_have_parity > s->nr_parity);
- buckets.nr = 0;
+ req->ptrs.nr = 0;
if (nr_have_parity < s->nr_parity) {
- ret = bch2_bucket_alloc_set_trans(trans, &buckets,
- &h->parity_stripe,
- &devs,
- s->nr_parity,
- &nr_have_parity,
- &have_cache, 0,
- BCH_DATA_parity,
- watermark,
- cl);
-
- open_bucket_for_each(c, &buckets, ob, i) {
+ req->nr_replicas = s->nr_parity;
+ req->nr_effective = nr_have_parity;
+ req->data_type = BCH_DATA_parity;
+
+ ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe, cl);
+
+ open_bucket_for_each(c, &req->ptrs, ob, i) {
j = find_next_zero_bit(s->blocks_gotten,
s->nr_data + s->nr_parity,
s->nr_data);
BUG_ON(j >= s->nr_data + s->nr_parity);
- s->blocks[j] = buckets.v[i];
+ s->blocks[j] = req->ptrs.v[i];
v->ptrs[j] = bch2_ob_ptr(c, ob);
__set_bit(j, s->blocks_gotten);
}
if (ret)
- return ret;
+ goto err;
}
- buckets.nr = 0;
+ req->ptrs.nr = 0;
if (nr_have_data < s->nr_data) {
- ret = bch2_bucket_alloc_set_trans(trans, &buckets,
- &h->block_stripe,
- &devs,
- s->nr_data,
- &nr_have_data,
- &have_cache, 0,
- BCH_DATA_user,
- watermark,
- cl);
-
- open_bucket_for_each(c, &buckets, ob, i) {
+ req->nr_replicas = s->nr_data;
+ req->nr_effective = nr_have_data;
+ req->data_type = BCH_DATA_user;
+
+ ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe, cl);
+
+ open_bucket_for_each(c, &req->ptrs, ob, i) {
j = find_next_zero_bit(s->blocks_gotten,
s->nr_data, 0);
BUG_ON(j >= s->nr_data);
- s->blocks[j] = buckets.v[i];
+ s->blocks[j] = req->ptrs.v[i];
v->ptrs[j] = bch2_ob_ptr(c, ob);
__set_bit(j, s->blocks_gotten);
}
if (ret)
- return ret;
+ goto err;
}
-
- return 0;
+err:
+ req->data_type = req->scratch_data_type;
+ req->ptrs = req->scratch_ptrs;
+ req->nr_replicas = req->scratch_nr_replicas;
+ req->nr_effective = req->scratch_nr_effective;
+ req->have_cache = req->scratch_have_cache;
+ req->devs_may_alloc = req->scratch_devs_may_alloc;
+ return ret;
}
-static s64 get_existing_stripe(struct bch_fs *c,
- struct ec_stripe_head *head)
+static int __get_existing_stripe(struct btree_trans *trans,
+ struct ec_stripe_head *head,
+ struct ec_stripe_buf *stripe,
+ u64 idx)
{
- ec_stripes_heap *h = &c->ec_stripes_heap;
- struct stripe *m;
- size_t heap_idx;
- u64 stripe_idx;
- s64 ret = -1;
-
- if (may_create_new_stripe(c))
- return -1;
+ struct bch_fs *c = trans->c;
- mutex_lock(&c->ec_stripes_heap_lock);
- for (heap_idx = 0; heap_idx < h->nr; heap_idx++) {
- /* No blocks worth reusing, stripe will just be deleted: */
- if (!h->data[heap_idx].blocks_nonempty)
- continue;
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
+ BTREE_ID_stripes, POS(0, idx), 0);
+ int ret = bkey_err(k);
+ if (ret)
+ goto err;
- stripe_idx = h->data[heap_idx].idx;
+ /* We expect write buffer races here */
+ if (k.k->type != KEY_TYPE_stripe)
+ goto out;
- m = genradix_ptr(&c->stripes, stripe_idx);
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+ if (stripe_lru_pos(s.v) <= 1)
+ goto out;
- if (m->disk_label == head->disk_label &&
- m->algorithm == head->algo &&
- m->nr_redundant == head->redundancy &&
- m->sectors == head->blocksize &&
- m->blocks_nonempty < m->nr_blocks - m->nr_redundant &&
- bch2_try_open_stripe(c, head->s, stripe_idx)) {
- ret = stripe_idx;
- break;
- }
+ if (s.v->disk_label == head->disk_label &&
+ s.v->algorithm == head->algo &&
+ s.v->nr_redundant == head->redundancy &&
+ le16_to_cpu(s.v->sectors) == head->blocksize &&
+ bch2_try_open_stripe(c, head->s, idx)) {
+ bkey_reassemble(&stripe->key, k);
+ ret = 1;
}
- mutex_unlock(&c->ec_stripes_heap_lock);
+out:
+ bch2_set_btree_iter_dontneed(trans, &iter);
+err:
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -2052,24 +1903,33 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri
struct ec_stripe_new *s)
{
struct bch_fs *c = trans->c;
- s64 idx;
- int ret;
/*
* If we can't allocate a new stripe, and there's no stripes with empty
* blocks for us to reuse, that means we have to wait on copygc:
*/
- idx = get_existing_stripe(c, h);
- if (idx < 0)
- return -BCH_ERR_stripe_alloc_blocked;
+ if (may_create_new_stripe(c))
+ return -1;
- ret = get_stripe_key_trans(trans, idx, &s->existing_stripe);
- bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
- "reading stripe key: %s", bch2_err_str(ret));
- if (ret) {
- bch2_stripe_close(c, s);
- return ret;
+ struct btree_iter lru_iter;
+ struct bkey_s_c lru_k;
+ int ret = 0;
+
+ for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru,
+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0),
+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX),
+ 0, lru_k, ret) {
+ ret = __get_existing_stripe(trans, h, &s->existing_stripe, lru_k.k->p.offset);
+ if (ret)
+ break;
}
+ bch2_trans_iter_exit(trans, &lru_iter);
+ if (!ret)
+ ret = -BCH_ERR_stripe_alloc_blocked;
+ if (ret == 1)
+ ret = 0;
+ if (ret)
+ return ret;
return init_new_stripe_from_existing(c, s);
}
@@ -2102,7 +1962,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st
if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
if (start_pos.offset) {
start_pos = min_pos;
- bch2_btree_iter_set_pos(&iter, start_pos);
+ bch2_btree_iter_set_pos(trans, &iter, start_pos);
continue;
}
@@ -2136,17 +1996,15 @@ err:
}
struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
- unsigned target,
+ struct alloc_request *req,
unsigned algo,
- unsigned redundancy,
- enum bch_watermark watermark,
struct closure *cl)
{
struct bch_fs *c = trans->c;
- struct ec_stripe_head *h;
- bool waiting = false;
+ unsigned redundancy = req->nr_replicas - 1;
unsigned disk_label = 0;
- struct target t = target_decode(target);
+ struct target t = target_decode(req->target);
+ bool waiting = false;
int ret;
if (t.type == TARGET_GROUP) {
@@ -2157,7 +2015,9 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
disk_label = t.group + 1; /* 0 == no label */
}
- h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark);
+ struct ec_stripe_head *h =
+ __bch2_ec_stripe_head_get(trans, disk_label, algo,
+ redundancy, req->watermark);
if (IS_ERR_OR_NULL(h))
return h;
@@ -2181,8 +2041,12 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
goto alloc_existing;
/* First, try to allocate a full stripe: */
- ret = new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?:
+ enum bch_watermark saved_watermark = BCH_WATERMARK_stripe;
+ swap(req->watermark, saved_watermark);
+ ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?:
__bch2_ec_stripe_head_reserve(trans, h, s);
+ swap(req->watermark, saved_watermark);
+
if (!ret)
goto allocate_buf;
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
@@ -2200,8 +2064,8 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
goto err;
- if (watermark == BCH_WATERMARK_copygc) {
- ret = new_stripe_alloc_buckets(trans, h, s, watermark, NULL) ?:
+ if (req->watermark == BCH_WATERMARK_copygc) {
+ ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?:
__bch2_ec_stripe_head_reserve(trans, h, s);
if (ret)
goto err;
@@ -2220,7 +2084,7 @@ alloc_existing:
* Retry allocating buckets, with the watermark for this
* particular write:
*/
- ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl);
+ ret = new_stripe_alloc_buckets(trans, req, h, s, cl);
if (ret)
goto err;
@@ -2242,67 +2106,106 @@ err:
/* device removal */
-static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a)
+int bch2_invalidate_stripe_to_dev(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ unsigned dev_idx,
+ unsigned flags)
{
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert);
-
- if (!a->stripe)
+ if (k.k->type != KEY_TYPE_stripe)
return 0;
- if (a->stripe_sectors) {
- bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data");
- return -BCH_ERR_invalidate_stripe_to_dev;
- }
-
- struct btree_iter iter;
struct bkey_i_stripe *s =
- bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
- BTREE_ITER_slots, stripe);
+ bch2_bkey_make_mut_typed(trans, iter, &k, 0, stripe);
int ret = PTR_ERR_OR_ZERO(s);
if (ret)
return ret;
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_replicas,
- };
+ struct disk_accounting_pos acc;
s64 sectors = 0;
for (unsigned i = 0; i < s->v.nr_blocks; i++)
sectors -= stripe_blockcount_get(&s->v, i);
+ memset(&acc, 0, sizeof(acc));
+ acc.type = BCH_DISK_ACCOUNTING_replicas;
bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
acc.replicas.data_type = BCH_DATA_user;
ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
if (ret)
- goto err;
+ return ret;
struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i));
- bkey_for_each_ptr(ptrs, ptr)
- if (ptr->dev == k_a.k->p.inode)
+
+ /* XXX: how much redundancy do we still have? check degraded flags */
+
+ unsigned nr_good = 0;
+
+ rcu_read_lock();
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (ptr->dev == dev_idx)
ptr->dev = BCH_SB_MEMBER_INVALID;
+ struct bch_dev *ca = bch2_dev_rcu(trans->c, ptr->dev);
+ nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed;
+ }
+ rcu_read_unlock();
+
+ if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED))
+ return -BCH_ERR_remove_would_lose_data;
+
+ unsigned nr_data = s->v.nr_blocks - s->v.nr_redundant;
+
+ if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST))
+ return -BCH_ERR_remove_would_lose_data;
+
sectors = -sectors;
+ memset(&acc, 0, sizeof(acc));
+ acc.type = BCH_DISK_ACCOUNTING_replicas;
bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
acc.replicas.data_type = BCH_DATA_user;
- ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
+ return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
+}
+
+static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, struct bkey_s_c k_a,
+ unsigned flags)
+{
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert);
+
+ if (!a->stripe)
+ return 0;
+
+ if (a->stripe_sectors) {
+ bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data");
+ return -BCH_ERR_invalidate_stripe_to_dev;
+ }
+
+ struct btree_iter iter;
+ struct bkey_s_c_stripe s =
+ bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
+ BTREE_ITER_slots, stripe);
+ int ret = bkey_err(s);
if (ret)
- goto err;
-err:
+ return ret;
+
+ ret = bch2_invalidate_stripe_to_dev(trans, &iter, s.s_c, k_a.k->p.inode, flags);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
-int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx)
+int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx, unsigned flags)
{
- return bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_max_commit(trans, iter,
BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX),
BTREE_ITER_intent, k,
NULL, NULL, 0, ({
- bch2_invalidate_stripe_to_dev(trans, k);
+ bch2_invalidate_stripe_to_dev_from_alloc(trans, k, flags);
})));
+ bch_err_fn(c, ret);
+ return ret;
}
/* startup/shutdown */
@@ -2351,10 +2254,10 @@ void bch2_fs_ec_stop(struct bch_fs *c)
static bool bch2_fs_ec_flush_done(struct bch_fs *c)
{
- bool ret;
+ sched_annotate_sleep();
mutex_lock(&c->ec_stripe_new_lock);
- ret = list_empty(&c->ec_stripe_new_list);
+ bool ret = list_empty(&c->ec_stripe_new_list);
mutex_unlock(&c->ec_stripe_new_lock);
return ret;
@@ -2367,46 +2270,7 @@ void bch2_fs_ec_flush(struct bch_fs *c)
int bch2_stripes_read(struct bch_fs *c)
{
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
- BTREE_ITER_prefetch, k, ({
- if (k.k->type != KEY_TYPE_stripe)
- continue;
-
- ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
- if (ret)
- break;
-
- struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
-
- stripe_to_mem(m, bkey_s_c_to_stripe(k).v);
-
- bch2_stripes_heap_insert(c, m, k.k->p.offset);
- 0;
- })));
- bch_err_fn(c, ret);
- return ret;
-}
-
-void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
-{
- ec_stripes_heap *h = &c->ec_stripes_heap;
- struct stripe *m;
- size_t i;
-
- mutex_lock(&c->ec_stripes_heap_lock);
- for (i = 0; i < min_t(size_t, h->nr, 50); i++) {
- m = genradix_ptr(&c->stripes, h->data[i].idx);
-
- prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
- h->data[i].blocks_nonempty,
- m->nr_blocks - m->nr_redundant,
- m->nr_redundant);
- if (bch2_stripe_is_open(c, h->data[i].idx))
- prt_str(out, " open");
- prt_newline(out);
- }
- mutex_unlock(&c->ec_stripes_heap_lock);
+ return 0;
}
static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
@@ -2477,15 +2341,12 @@ void bch2_fs_ec_exit(struct bch_fs *c)
BUG_ON(!list_empty(&c->ec_stripe_new_list));
- free_heap(&c->ec_stripes_heap);
- genradix_free(&c->stripes);
bioset_exit(&c->ec_bioset);
}
void bch2_fs_ec_init_early(struct bch_fs *c)
{
spin_lock_init(&c->ec_stripes_new_lock);
- mutex_init(&c->ec_stripes_heap_lock);
INIT_LIST_HEAD(&c->ec_stripe_head_list);
mutex_init(&c->ec_stripe_head_lock);
@@ -2503,3 +2364,40 @@ int bch2_fs_ec_init(struct bch_fs *c)
return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
BIOSET_NEED_BVECS);
}
+
+static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans,
+ struct bkey_s_c k,
+ struct bkey_buf *last_flushed)
+{
+ if (k.k->type != KEY_TYPE_stripe)
+ return 0;
+
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+
+ u64 lru_idx = stripe_lru_pos(s.v);
+ if (lru_idx) {
+ int ret = bch2_lru_check_set(trans, BCH_LRU_STRIPE_FRAGMENTATION,
+ k.k->p.offset, lru_idx, k, last_flushed);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+int bch2_check_stripe_to_lru_refs(struct bch_fs *c)
+{
+ struct bkey_buf last_flushed;
+
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_stripes,
+ POS_MIN, BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_check_stripe_to_lru_ref(trans, k, &last_flushed)));
+
+ bch2_bkey_buf_exit(&last_flushed, c);
+ bch_err_fn(c, ret);
+ return ret;
+}