author | Brian Foster <bfoster@redhat.com> | 2024-01-15 14:46:00 -0500
---|---|---
committer | Kent Overstreet <kent.overstreet@linux.dev> | 2025-04-24 19:10:29 -0400
commit | b9b0494017b5f6d0664ecbcd2d8870800f045581 | (patch)
tree | ac3fe141e72f1fea41223d11e029cb51bc1956f3 | /fs
parent | 2d55a637095d0eaaad609b8a518589ead34487b3 | (diff)
bcachefs: add fiemap delalloc extent detection
bcachefs currently populates fiemap data from the extents btree.
This works correctly when the fiemap sync flag is provided, but if
not, it skips all delalloc extents that have not yet been flushed.
This is because delalloc extents from buffered writes are first
stored as reservation in the pagecache, and only become resident in
the extents btree after writeback completes.
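The symptom is straightforward to observe from userspace with the fiemap ioctl. The following is a minimal sketch, not part of the patch (the mount point and file path are hypothetical): after a buffered write, a query without FIEMAP_FLAG_SYNC previously reported no extents on bcachefs for the not-yet-flushed range.

```c
/* fiemap_demo.c: query extent mappings after a buffered write */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>      /* FS_IOC_FIEMAP */
#include <linux/fiemap.h>  /* struct fiemap, FIEMAP_* flags */

int main(void)
{
	/* hypothetical test file on a bcachefs mount */
	int fd = open("/mnt/bcachefs/testfile", O_RDWR | O_CREAT | O_TRUNC, 0644);
	if (fd < 0) { perror("open"); return 1; }

	/* buffered write: stays in pagecache as delalloc until writeback */
	char buf[4096];
	memset(buf, 'a', sizeof(buf));
	if (write(fd, buf, sizeof(buf)) != (ssize_t)sizeof(buf)) {
		perror("write");
		return 1;
	}

	struct fiemap *fm = calloc(1, sizeof(*fm) + 16 * sizeof(struct fiemap_extent));
	if (!fm) return 1;
	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;
	fm->fm_extent_count = 16;
	fm->fm_flags = 0;	/* no FIEMAP_FLAG_SYNC: delalloc was skipped here */

	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) { perror("fiemap"); return 1; }

	printf("%u extent(s)\n", fm->fm_mapped_extents);
	for (unsigned i = 0; i < fm->fm_mapped_extents; i++) {
		struct fiemap_extent *fe = &fm->fm_extents[i];
		printf("  logical %llu len %llu phys %llu flags 0x%x%s\n",
		       (unsigned long long)fe->fe_logical,
		       (unsigned long long)fe->fe_length,
		       (unsigned long long)fe->fe_physical,
		       fe->fe_flags,
		       fe->fe_flags & FIEMAP_EXTENT_DELALLOC ? " [delalloc]" : "");
	}

	free(fm);
	close(fd);
	return 0;
}
```

Before this patch, the pagecache-only range showed up in this output only if fm_flags included FIEMAP_FLAG_SYNC (forcing writeback first); with the patch, it is reported as an extent flagged FIEMAP_EXTENT_DELALLOC with a zero physical offset.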
Update the fiemap implementation to process holes between extents by
scanning pagecache for data, via seek data/hole. If a valid data
range is found over a hole in the extent btree, fake up an extent
key and flag the extent as delalloc for reporting to userspace.
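The data/hole scan has a direct userspace analog in lseek() with SEEK_DATA and SEEK_HOLE, which also see pagecache as data. A minimal sketch of the same walk, using a hypothetical path (this illustrates the pattern, not the kernel helpers the patch calls):

```c
#define _GNU_SOURCE		/* SEEK_DATA / SEEK_HOLE */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* hypothetical file; any sparse file works */
	int fd = open("/mnt/bcachefs/testfile", O_RDONLY);
	if (fd < 0) { perror("open"); return 1; }

	off_t end = lseek(fd, 0, SEEK_END);
	off_t pos = 0;

	while (pos < end) {
		/* next data range; pagecache counts as data */
		off_t dstart = lseek(fd, pos, SEEK_DATA);
		if (dstart < 0)
			break;	/* ENXIO: no data past pos */
		off_t dend = lseek(fd, dstart, SEEK_HOLE);
		if (dend < 0)
			break;
		printf("data: [%lld, %lld)\n", (long long)dstart, (long long)dend);
		pos = dend;
	}

	close(fd);
	return 0;
}
```

The kernel-side helpers used in the patch below, bch2_seek_pagecache_data() and bch2_seek_pagecache_hole(), perform the equivalent walk over folios directly and additionally take a nonblock flag, since btree locks are held at the call site.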
Note that this does not necessarily change behavior for the case
where there is dirty pagecache over already written extents, where,
in COW mode, writeback will allocate new blocks for the underlying
ranges. The existing behavior is consistent with btrfs, and it is
recommended to use the sync flag for the most up-to-date extent
state from fiemap.
Signed-off-by: Brian Foster <bfoster@redhat.com>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/bcachefs/fs.c | 119
1 file changed, 112 insertions(+), 7 deletions(-)
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 00698457dee5..f5a9f9e8c457 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -1356,6 +1356,87 @@ static int bch2_fiemap_extent(struct btree_trans *trans,
 	return 0;
 }
 
+/*
+ * Scan a range of an inode for data in pagecache.
+ *
+ * Intended to be retryable, so don't modify the output params until success is
+ * imminent.
+ */
+static int
+bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end,
+			   bool nonblock)
+{
+	loff_t dstart, dend;
+
+	dstart = bch2_seek_pagecache_data(vinode, *start, *end, 0, nonblock);
+	if (dstart < 0)
+		return dstart;
+	if (dstart >= *end)
+		return -ENOENT;
+
+	dend = bch2_seek_pagecache_hole(vinode, dstart, *end, 0, nonblock);
+	if (dend < 0)
+		return dend;
+
+	*start = dstart;
+	*end = dend;
+	return 0;
+}
+
+/*
+ * Scan a range of pagecache that corresponds to a file mapping hole in the
+ * extent btree. If data is found, fake up an extent key so it looks like a
+ * delalloc extent to the rest of the fiemap processing code.
+ *
+ * Returns 0 if cached data was found, -ENOENT if not.
+ */
+static int
+bch2_fiemap_hole(struct btree_trans *trans, struct inode *vinode, u64 start,
+		 u64 end, struct bch_fiemap_extent *cur)
+{
+	struct bch_fs *c = vinode->i_sb->s_fs_info;
+	struct bch_inode_info *ei = to_bch_ei(vinode);
+	struct bkey_i_extent *delextent;
+	struct bch_extent_ptr ptr = {};
+	loff_t dstart = start, dend = end;
+	int ret;
+
+	/*
+	 * We hold btree locks here so we cannot block on folio locks without
+	 * dropping trans locks first. Run a nonblocking scan for the common
+	 * case of no folios over holes and fall back on failure.
+	 *
+	 * Note that dropping locks like this is technically racy against
+	 * writeback inserting to the extent tree, but a non-sync fiemap scan is
+	 * fundamentally racy with writeback anyways. Therefore, just report the
+	 * range as delalloc regardless of whether we have to cycle trans locks.
+	 */
+	ret = bch2_fiemap_hole_pagecache(vinode, &dstart, &dend, true);
+	if (ret == -EAGAIN) {
+		/* open coded drop_locks_do() to relock even on error */
+		bch2_trans_unlock(trans);
+		ret = bch2_fiemap_hole_pagecache(vinode, &dstart, &dend, false);
+		bch2_trans_relock(trans);
+	}
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Create a fake extent key in the buffer. We have to add a dummy extent
+	 * pointer for the fill code to add an extent entry. It's explicitly
+	 * zeroed to reflect delayed allocation (i.e. phys offset 0).
+	 */
+	bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64));
+	delextent = bkey_extent_init(cur->kbuf.k);
+	delextent->k.p = POS(ei->v.i_ino, dstart >> 9);
+	bch2_key_resize(&delextent->k, (dend - dstart) >> 9);
+	bch2_bkey_append_ptr(&delextent->k_i, ptr);
+
+	cur->flags = FIEMAP_EXTENT_DELALLOC;
+
+	return 0;
+}
+
 static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 		       u64 start, u64 len)
 {
@@ -1386,6 +1467,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 			     POS(ei->v.i_ino, start), 0);
 
 	while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+		bool have_delalloc = false;
 
 		bch2_trans_begin(trans);
 
@@ -1404,15 +1486,38 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 		if (!k.k)
 			break;
 
-		if (!bkey_extent_is_data(k.k) &&
-		    k.k->type != KEY_TYPE_reservation) {
-			bch2_btree_iter_advance(trans, &iter);
-			continue;
+		/*
+		 * If a hole exists before the start of the extent key, scan the
+		 * range for pagecache data that might be pending writeback and
+		 * thus not yet exist in the extent tree.
+		 */
+		if (iter.pos.offset > start) {
+			ret = bch2_fiemap_hole(trans, vinode, start << 9,
+					       iter.pos.offset << 9, &cur);
+			if (!ret)
+				have_delalloc = true;
+			else if (ret != -ENOENT)
+				break;
 		}
 
-		ret = bch2_fiemap_extent(trans, &iter, k, &cur);
-		if (ret)
-			break;
+		/* process the current key if there's no delalloc to report */
+		if (!have_delalloc) {
+			if (!bkey_extent_is_data(k.k) &&
+			    k.k->type != KEY_TYPE_reservation) {
+				start = bkey_start_offset(k.k) + k.k->size;
+				bch2_btree_iter_advance(trans, &iter);
+				continue;
+			}
+
+			ret = bch2_fiemap_extent(trans, &iter, k, &cur);
+			if (ret)
+				break;
+		}
+
+		/*
+		 * Store the current extent in prev so we can flag the last
+		 * extent on the way out.
+		 */
 		bch2_bkey_buf_realloc(&prev.kbuf, c, cur.kbuf.k->k.u64s);
 		start = cur.kbuf.k->k.p.offset;
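One detail worth calling out from the fake key construction above: bkey positions and sizes are expressed in 512-byte sectors, hence the >> 9 shifts applied to the byte range returned by the pagecache scan. A standalone worked example of that conversion (illustrative values only, not from the patch):

```c
#include <stdio.h>

int main(void)
{
	/* illustrative byte range found by the pagecache scan */
	unsigned long long dstart = 8192, dend = 12288;

	/* byte offsets shifted by 9 yield 512-byte sector units */
	unsigned long long start_sector = dstart >> 9;          /* 16 */
	unsigned long long nr_sectors   = (dend - dstart) >> 9; /* 8  */

	printf("start sector %llu, %llu sectors\n", start_sector, nr_sectors);
	return 0;
}
```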