diff options
Diffstat (limited to 'fs/bcachefs/fs-io.c')
-rw-r--r-- | fs/bcachefs/fs-io.c | 160 |
1 files changed, 117 insertions, 43 deletions
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 2456c41b215e..a233f45875e9 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -7,6 +7,7 @@ #include "btree_update.h" #include "buckets.h" #include "clock.h" +#include "enumerated_ref.h" #include "error.h" #include "extents.h" #include "extent_update.h" @@ -48,7 +49,8 @@ static void nocow_flush_endio(struct bio *_bio) struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); closure_put(bio->cl); - percpu_ref_put(&bio->ca->io_ref); + enumerated_ref_put(&bio->ca->io_ref[WRITE], + BCH_DEV_WRITE_REF_nocow_flush); bio_put(&bio->bio); } @@ -69,11 +71,12 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { - rcu_read_lock(); - ca = rcu_dereference(c->devs[dev]); - if (ca && !percpu_ref_tryget(&ca->io_ref)) - ca = NULL; - rcu_read_unlock(); + scoped_guard(rcu) { + ca = rcu_dereference(c->devs[dev]); + if (ca && !enumerated_ref_tryget(&ca->io_ref[WRITE], + BCH_DEV_WRITE_REF_nocow_flush)) + ca = NULL; + } if (!ca) continue; @@ -144,10 +147,24 @@ int __must_check bch2_write_inode_size(struct bch_fs *c, void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, struct quota_res *quota_res, s64 sectors) { - bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, - "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", - inode->v.i_ino, (u64) inode->v.i_blocks, sectors, - inode->ei_inode.bi_sectors); + if (unlikely((s64) inode->v.i_blocks + sectors < 0)) { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", + inode->v.i_ino, (u64) inode->v.i_blocks, sectors, + inode->ei_inode.bi_sectors); + + bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, &buf); + if (print) + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + + if (sectors < 0) + sectors = -inode->v.i_blocks; + else + sectors = 0; + } + inode->v.i_blocks += sectors; #ifdef CONFIG_BCACHEFS_QUOTA @@ -167,6 +184,34 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, /* fsync: */ +static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum, + u64 *seq) +{ + struct printbuf buf = PRINTBUF; + struct bch_inode_unpacked u; + struct btree_iter iter; + int ret = bch2_inode_peek(trans, &iter, &u, inum, 0); + if (ret) + return ret; + + u64 cur_seq = journal_cur_seq(&trans->c->journal); + *seq = min(cur_seq, u.bi_journal_seq); + + if (fsck_err_on(u.bi_journal_seq > cur_seq, + trans, inode_journal_seq_in_future, + "inode journal seq in future (currently at %llu)\n%s", + cur_seq, + (bch2_inode_unpacked_to_text(&buf, &u), + buf.buf))) { + u.bi_journal_seq = cur_seq; + ret = bch2_inode_write(trans, &iter, &u); + } +fsck_err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +} + /* * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an * insert trigger: look up the btree inode instead @@ -177,14 +222,15 @@ static int bch2_flush_inode(struct bch_fs *c, if (c->opts.journal_flush_disabled) return 0; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fsync)) return -EROFS; - struct bch_inode_unpacked u; - int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?: - bch2_journal_flush_seq(&c->journal, u.bi_journal_seq, TASK_INTERRUPTIBLE) ?: + u64 seq; + int ret = bch2_trans_commit_do(c, NULL, NULL, 0, + bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?: + bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?: bch2_inode_flush_nocow_writes(c, inode); - bch2_write_ref_put(c, BCH_WRITE_REF_fsync); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_fsync); return ret; } @@ -222,7 +268,7 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol, struct bpos end) { return bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, start, end, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end, subvol, 0, k, ({ bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k); }))); @@ -256,7 +302,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, folio = __filemap_get_folio(mapping, index, FGP_LOCK|FGP_CREAT, GFP_KERNEL); - if (IS_ERR_OR_NULL(folio)) { + if (IS_ERR(folio)) { ret = -ENOMEM; goto out; } @@ -437,6 +483,7 @@ int bchfs_truncate(struct mnt_idmap *idmap, ret = bch2_truncate_folio(inode, iattr->ia_size); if (unlikely(ret < 0)) goto err; + ret = 0; truncate_setsize(&inode->v, iattr->ia_size); @@ -472,11 +519,20 @@ int bchfs_truncate(struct mnt_idmap *idmap, goto err; } - bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && - !bch2_journal_error(&c->journal), c, - "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", - inode->v.i_ino, (u64) inode->v.i_blocks, - inode->ei_inode.bi_sectors); + if (unlikely(!inode->v.i_size && inode->v.i_blocks && + !bch2_journal_error(&c->journal))) { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + prt_printf(&buf, + "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", + inode->v.i_ino, (u64) inode->v.i_blocks, + inode->ei_inode.bi_sectors); + + bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, &buf); + if (print) + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + } ret = bch2_setattr_nonsize(idmap, inode, iattr); err: @@ -606,9 +662,9 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, if (ret) goto bkey_err; - bch2_btree_iter_set_snapshot(&iter, snapshot); + bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); if ((ret = bkey_err(k))) goto bkey_err; @@ -619,13 +675,13 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, /* already reserved */ if (bkey_extent_is_reservation(k) && bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) { - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); continue; } if (bkey_extent_is_data(k.k) && !(mode & FALLOC_FL_ZERO_RANGE)) { - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); continue; } @@ -646,7 +702,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, if (ret) goto bkey_err; } - bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start)); + bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, hole_start)); if (ret) goto bkey_err; @@ -765,7 +821,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, struct bch_fs *c = inode->v.i_sb->s_fs_info; long ret; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fallocate)) return -EROFS; inode_lock(&inode->v); @@ -789,7 +845,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, err: bch2_pagecache_block_put(inode); inode_unlock(&inode->v); - bch2_write_ref_put(c, BCH_WRITE_REF_fallocate); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_fallocate); return bch2_err_class(ret); } @@ -806,7 +862,7 @@ static int quota_reserve_range(struct bch_inode_info *inode, u64 sectors = end - start; int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, POS(inode->v.i_ino, start), POS(inode->v.i_ino, end - 1), @@ -877,11 +933,18 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, bch2_mark_pagecache_unallocated(src, pos_src >> 9, (pos_src + aligned_len) >> 9); + /* + * XXX: we'd like to be telling bch2_remap_range() if we have + * permission to write to the source file, and thus if io path option + * changes should be propagated through the copy, but we need mnt_idmap + * from the pathwalk, awkward + */ ret = bch2_remap_range(c, inode_inum(dst), pos_dst >> 9, inode_inum(src), pos_src >> 9, aligned_len >> 9, - pos_dst + len, &i_sectors_delta); + pos_dst + len, &i_sectors_delta, + false); if (ret < 0) goto err; @@ -922,7 +985,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) return -ENXIO; int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, POS(inode->v.i_ino, offset >> 9), POS(inode->v.i_ino, U64_MAX), inum.subvol, 0, k, ({ @@ -958,21 +1021,32 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) return -ENXIO; int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, POS(inode->v.i_ino, offset >> 9), POS(inode->v.i_ino, U64_MAX), inum.subvol, BTREE_ITER_slots, k, ({ - if (k.k->p.inode != inode->v.i_ino) { - next_hole = bch2_seek_pagecache_hole(&inode->v, - offset, MAX_LFS_FILESIZE, 0, false); - break; - } else if (!bkey_extent_is_data(k.k)) { - next_hole = bch2_seek_pagecache_hole(&inode->v, - max(offset, bkey_start_offset(k.k) << 9), - k.k->p.offset << 9, 0, false); - - if (next_hole < k.k->p.offset << 9) + if (k.k->p.inode != inode->v.i_ino || + !bkey_extent_is_data(k.k)) { + loff_t start_offset = k.k->p.inode == inode->v.i_ino + ? max(offset, bkey_start_offset(k.k) << 9) + : offset; + loff_t end_offset = k.k->p.inode == inode->v.i_ino + ? MAX_LFS_FILESIZE + : k.k->p.offset << 9; + + /* + * Found a hole in the btree, now make sure it's + * a hole in the pagecache. We might have to + * keep searching if this hole is entirely dirty + * in the page cache: + */ + bch2_trans_unlock(trans); + loff_t pagecache_hole = bch2_seek_pagecache_hole(&inode->v, + start_offset, end_offset, 0, false); + if (pagecache_hole < end_offset) { + next_hole = pagecache_hole; break; + } } else { offset = max(offset, bkey_start_offset(k.k) << 9); } |