Diffstat (limited to 'fs/btrfs')
97 files changed, 6106 insertions, 5248 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index fa8515598341..c352f3ae0385 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -3,9 +3,9 @@ config BTRFS_FS tristate "Btrfs filesystem support" select BLK_CGROUP_PUNT_BIO + select CRC32 select CRYPTO select CRYPTO_CRC32C - select LIBCRC32C select CRYPTO_XXHASH select CRYPTO_SHA256 select CRYPTO_BLAKE2B @@ -52,10 +52,10 @@ config BTRFS_FS_RUN_SANITY_TESTS bool "Btrfs will run sanity tests upon loading" depends on BTRFS_FS help - This will run some basic sanity tests on the free space cache - code to make sure it is acting as it should. These are mostly - regression tests and are only really interesting to btrfs - developers. + This will run sanity tests for core functionality like free space, + extent maps, extent io, extent buffers, inodes, qgroups and others, + at module load time. These are mostly regression tests and are only + interesting to developers. If unsure, say N. @@ -63,9 +63,12 @@ config BTRFS_DEBUG bool "Btrfs debugging support" depends on BTRFS_FS help - Enable run-time debugging support for the btrfs filesystem. This may - enable additional and expensive checks with negative impact on - performance, or export extra information via sysfs. + Enable run-time debugging support for the btrfs filesystem. + + Additional potentially expensive checks, debugging functionality or + sysfs exported information is enabled, like leak checks of internal + objects, optional forced space fragmentation and /sys/fs/btrfs/debug . + This has negative impact on performance. If unsure, say N. @@ -73,8 +76,10 @@ config BTRFS_ASSERT bool "Btrfs assert support" depends on BTRFS_FS help - Enable run-time assertion checking. This will result in panics if - any of the assertions trip. This is meant for btrfs developers only. + Enable run-time assertion checking. Additional safety checks are + done, simple enough not to affect performance but verify invariants + and assumptions of code to run properly. This may result in panics, + and is meant for developers but can be enabled in general. If unsure, say N. 
@@ -89,7 +94,14 @@ config BTRFS_EXPERIMENTAL Current list: - - extent map shrinker - performance problems with too frequent shrinks + - COW fixup worker warning - last warning before removing the + functionality catching out-of-band page + dirtying, not necessary since 5.8 + + - RAID mirror read policy - additional read policies for balancing + reading from redundant block group + profiles (currently: pid, round-robin, + fixed devid) - send stream protocol v3 - fs-verity support diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 7a7e0ef69973..15ea6348800b 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -12,6 +12,7 @@ #include <linux/string.h> #include <linux/mm.h> #include <uapi/linux/btrfs_tree.h> +#include "extent_io.h" struct extent_buffer; diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h index 48b9ddae4a46..0458cd51ed48 100644 --- a/fs/btrfs/acl.h +++ b/fs/btrfs/acl.h @@ -3,6 +3,8 @@ #ifndef BTRFS_ACL_H #define BTRFS_ACL_H +#include <linux/types.h> + struct posix_acl; struct inode; struct btrfs_trans_handle; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index a4c51600a408..6c6f3bb58f4e 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -168,7 +168,7 @@ static inline void thresh_exec_hook(struct btrfs_workqueue *wq) { int new_current_active; long pending; - int need_change = 0; + bool need_change = false; if (wq->thresh == NO_THRESHOLD) return; @@ -196,15 +196,14 @@ static inline void thresh_exec_hook(struct btrfs_workqueue *wq) new_current_active--; new_current_active = clamp_val(new_current_active, 1, wq->limit_active); if (new_current_active != wq->current_active) { - need_change = 1; + need_change = true; wq->current_active = new_current_active; } out: spin_unlock(&wq->thres_lock); - if (need_change) { + if (need_change) workqueue_set_max_active(wq->normal_wq, wq->current_active); - } } static void run_ordered_work(struct btrfs_workqueue *wq, @@ -220,8 +219,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq, spin_lock_irqsave(lock, flags); if (list_empty(list)) break; - work = list_entry(list->next, struct btrfs_work, - ordered_list); + work = list_first_entry(list, struct btrfs_work, ordered_list); if (!test_bit(WORK_DONE_BIT, &work->flags)) break; /* @@ -296,7 +294,7 @@ static void btrfs_work_helper(struct work_struct *normal_work) struct btrfs_work *work = container_of(normal_work, struct btrfs_work, normal_work); struct btrfs_workqueue *wq = work->wq; - int need_order = 0; + bool need_order = false; /* * We should not touch things inside work in the following cases: @@ -307,7 +305,7 @@ static void btrfs_work_helper(struct work_struct *normal_work) * So we save the needed things here. 
*/ if (work->ordered_func) - need_order = 1; + need_order = true; trace_btrfs_work_sched(work); thresh_exec_hook(wq); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 3d3923cfc357..ed497f5f8d1b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1399,11 +1399,11 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, ASSERT(ctx->roots == NULL); key.objectid = ctx->bytenr; - key.offset = (u64)-1; if (btrfs_fs_incompat(ctx->fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; path = btrfs_alloc_path(); if (!path) @@ -2206,11 +2206,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, struct btrfs_extent_item *ei; struct btrfs_key key; + key.objectid = logical; if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; - key.objectid = logical; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); @@ -2877,7 +2877,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) goto release; } if (path->slots[0] == 0) { - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); ret = -EUCLEAN; goto release; } @@ -3134,8 +3134,8 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, return; while (!list_empty(&node->upper)) { - edge = list_entry(node->upper.next, struct btrfs_backref_edge, - list[LOWER]); + edge = list_first_entry(&node->upper, struct btrfs_backref_edge, + list[LOWER]); list_del(&edge->list[LOWER]); list_del(&edge->list[UPPER]); btrfs_backref_free_edge(cache, edge); @@ -3473,8 +3473,8 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, * type BTRFS_TREE_BLOCK_REF_KEY */ ASSERT(list_is_singular(&cur->upper)); - edge = list_entry(cur->upper.next, struct btrfs_backref_edge, - list[LOWER]); + edge = list_first_entry(&cur->upper, struct btrfs_backref_edge, + list[LOWER]); ASSERT(list_empty(&edge->list[UPPER])); exist = edge->node[UPPER]; /* @@ -3617,7 +3617,7 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, /* Sanity check, we shouldn't have any unchecked nodes */ if (!upper->checked) { - ASSERT(0); + DEBUG_WARN("we should not have any unchecked nodes"); return -EUCLEAN; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 74e614031274..953637115956 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -423,8 +423,8 @@ struct btrfs_backref_node *btrfs_backref_alloc_node( struct btrfs_backref_edge *btrfs_backref_alloc_edge( struct btrfs_backref_cache *cache); -#define LINK_LOWER (1 << 0) -#define LINK_UPPER (1 << 1) +#define LINK_LOWER (1U << 0) +#define LINK_UPPER (1U << 1) void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, struct btrfs_backref_node *lower, diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index bc2555c44a12..f7d8958b7327 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -97,33 +97,17 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, return bbio; } -/* Free a bio that was never submitted to the underlying device. 
*/ -static void btrfs_cleanup_bio(struct btrfs_bio *bbio) -{ - if (bbio_has_ordered_extent(bbio)) - btrfs_put_ordered_extent(bbio->ordered); - bio_put(&bbio->bio); -} - -static void __btrfs_bio_end_io(struct btrfs_bio *bbio) -{ - if (bbio_has_ordered_extent(bbio)) { - struct btrfs_ordered_extent *ordered = bbio->ordered; - - bbio->end_io(bbio); - btrfs_put_ordered_extent(ordered); - } else { - bbio->end_io(bbio); - } -} - void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { bbio->bio.bi_status = status; if (bbio->bio.bi_pool == &btrfs_clone_bioset) { struct btrfs_bio *orig_bbio = bbio->private; - btrfs_cleanup_bio(bbio); + /* Free bio that was never submitted to the underlying device. */ + if (bbio_has_ordered_extent(bbio)) + btrfs_put_ordered_extent(bbio->ordered); + bio_put(&bbio->bio); + bbio = orig_bbio; } @@ -138,7 +122,15 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) /* Load split bio's error which might be set above. */ if (status == BLK_STS_OK) bbio->bio.bi_status = READ_ONCE(bbio->status); - __btrfs_bio_end_io(bbio); + + if (bbio_has_ordered_extent(bbio)) { + struct btrfs_ordered_extent *ordered = bbio->ordered; + + bbio->end_io(bbio); + btrfs_put_ordered_extent(ordered); + } else { + bbio->end_io(bbio); + } } } @@ -200,7 +192,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, btrfs_repair_io_failure(fs_info, btrfs_ino(inode), repair_bbio->file_offset, fs_info->sectorsize, repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, - page_folio(bv->bv_page), bv->bv_offset, mirror); + bvec_phys(bv), mirror); } while (mirror != fbio->bbio->mirror_num); done: @@ -520,7 +512,7 @@ static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, } } -static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio) +static int btrfs_bio_csum(struct btrfs_bio *bbio) { if (bbio->bio.bi_opf & REQ_META) return btree_csum_one_bio(bbio); @@ -551,11 +543,11 @@ static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async = container_of(work, struct async_submit_bio, work); - blk_status_t ret; + int ret; ret = btrfs_bio_csum(async->bbio); if (ret) - async->bbio->bio.bi_status = ret; + async->bbio->bio.bi_status = errno_to_blk_status(ret); } /* @@ -581,7 +573,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) /* If an error occurred we just want to clean up the bio and move on. 
*/ if (bio->bi_status) { - btrfs_bio_end_io(async->bbio, async->bbio->bio.bi_status); + btrfs_bio_end_io(async->bbio, bio->bi_status); return; } @@ -682,8 +674,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) bool use_append = btrfs_use_zone_append(bbio); struct btrfs_io_context *bioc = NULL; struct btrfs_io_stripe smap; - blk_status_t ret; - int error; + blk_status_t status; + int ret; if (!bbio->inode || btrfs_is_data_reloc_root(inode->root)) smap.rst_search_commit_root = true; @@ -691,10 +683,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) smap.rst_search_commit_root = false; btrfs_bio_counter_inc_blocked(fs_info); - error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - &bioc, &smap, &mirror_num); - if (error) { - ret = errno_to_blk_status(error); + ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, + &bioc, &smap, &mirror_num); + if (ret) { + status = errno_to_blk_status(ret); btrfs_bio_counter_dec(fs_info); goto end_bbio; } @@ -708,7 +700,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) split = btrfs_split_bio(fs_info, bbio, map_length); if (IS_ERR(split)) { - ret = errno_to_blk_status(PTR_ERR(split)); + status = errno_to_blk_status(PTR_ERR(split)); btrfs_bio_counter_dec(fs_info); goto end_bbio; } @@ -723,7 +715,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) { bbio->saved_iter = bio->bi_iter; ret = btrfs_lookup_bio_sums(bbio); - if (ret) + status = errno_to_blk_status(ret); + if (status) goto fail; } @@ -756,13 +749,15 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) goto done; ret = btrfs_bio_csum(bbio); - if (ret) + status = errno_to_blk_status(ret); + if (status) goto fail; } else if (use_append || (btrfs_is_zoned(fs_info) && inode && inode->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_alloc_dummy_sum(bbio); - if (ret) + status = errno_to_blk_status(ret); + if (status) goto fail; } } @@ -783,10 +778,10 @@ fail: ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset); ASSERT(remaining); - btrfs_bio_end_io(remaining, ret); + btrfs_bio_end_io(remaining, status); } end_bbio: - btrfs_bio_end_io(bbio, ret); + btrfs_bio_end_io(bbio, status); /* Do not submit another chunk */ return true; } @@ -811,8 +806,7 @@ void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num) * freeing the bio. */ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct folio *folio, - unsigned int folio_offset, int mirror_num) + u64 length, u64 logical, phys_addr_t paddr, int mirror_num) { struct btrfs_io_stripe smap = { 0 }; struct bio_vec bvec; @@ -843,8 +837,7 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; - ret = bio_add_folio(&bio, folio, length, folio_offset); - ASSERT(ret); + __bio_add_page(&bio, phys_to_page(paddr), length, offset_in_page(paddr)); ret = submit_bio_wait(&bio); if (ret) { /* try to remap that extent elsewhere? 
*/ @@ -908,22 +901,18 @@ int __init btrfs_bioset_init(void) return -ENOMEM; if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE, offsetof(struct btrfs_bio, bio), 0)) - goto out_free_bioset; + goto out; if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE, offsetof(struct btrfs_bio, bio), BIOSET_NEED_BVECS)) - goto out_free_clone_bioset; + goto out; if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE, sizeof(struct btrfs_failed_bio))) - goto out_free_repair_bioset; + goto out; return 0; -out_free_repair_bioset: - bioset_exit(&btrfs_repair_bioset); -out_free_clone_bioset: - bioset_exit(&btrfs_clone_bioset); -out_free_bioset: - bioset_exit(&btrfs_bioset); +out: + btrfs_bioset_exit(); return -ENOMEM; } diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index e2fe16074ad6..dc2eb43b7097 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -110,7 +110,6 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status); void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num); void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace); int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct folio *folio, - unsigned int folio_offset, int mirror_num); + u64 length, u64 logical, phys_addr_t paddr, int mirror_num); #endif diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index c0a8f7d92acc..5b0cb04b2b93 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -191,21 +191,21 @@ static int btrfs_bg_start_cmp(const struct rb_node *new, /* * This adds the block group to the fs_info rb tree for the block group cache */ -static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, - struct btrfs_block_group *block_group) +static int btrfs_add_block_group_cache(struct btrfs_block_group *block_group) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct rb_node *exist; int ret = 0; ASSERT(block_group->length != 0); - write_lock(&info->block_group_cache_lock); + write_lock(&fs_info->block_group_cache_lock); exist = rb_find_add_cached(&block_group->cache_node, - &info->block_group_cache_tree, btrfs_bg_start_cmp); + &fs_info->block_group_cache_tree, btrfs_bg_start_cmp); if (exist) ret = -EEXIST; - write_unlock(&info->block_group_cache_lock); + write_unlock(&fs_info->block_group_cache_lock); return ret; } @@ -525,10 +525,9 @@ int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start, *total_added_ret = 0; while (start < end) { - if (!find_first_extent_bit(&info->excluded_extents, start, - &extent_start, &extent_end, - EXTENT_DIRTY | EXTENT_UPTODATE, - NULL)) + if (!btrfs_find_first_extent_bit(&info->excluded_extents, start, + &extent_start, &extent_end, + EXTENT_DIRTY, NULL)) break; if (extent_start <= start) { @@ -584,7 +583,7 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ struct btrfs_root *extent_root; u64 search_offset; u64 search_end = block_group->start + block_group->length; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key search_key; int ret = 0; @@ -626,7 +625,6 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ lockdep_assert_held(&caching_ctl->mutex); lockdep_assert_held_read(&fs_info->commit_root_sem); - btrfs_free_path(path); return ret; } @@ -702,7 +700,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) struct btrfs_block_group *block_group = caching_ctl->block_group; struct btrfs_fs_info *fs_info = 
block_group->fs_info; struct btrfs_root *extent_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key key; u64 total_found = 0; @@ -738,8 +736,8 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) path->reada = READA_FORWARD; key.objectid = last; - key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; next: ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); @@ -785,8 +783,8 @@ next: if (key.objectid < last) { key.objectid = last; - key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; btrfs_release_path(path); goto next; } @@ -829,14 +827,13 @@ next: block_group->start + block_group->length, NULL); out: - btrfs_free_path(path); return ret; } static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg) { - clear_extent_bits(&bg->fs_info->excluded_extents, bg->start, - bg->start + bg->length - 1, EXTENT_UPTODATE); + btrfs_clear_extent_bits(&bg->fs_info->excluded_extents, bg->start, + bg->start + bg->length - 1, EXTENT_DIRTY); } static noinline void caching_thread(struct btrfs_work *work) @@ -1421,9 +1418,8 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans, int ret; spin_lock(&fs_info->trans_lock); - if (trans->transaction->list.prev != &fs_info->trans_list) { - prev_trans = list_last_entry(&trans->transaction->list, - struct btrfs_transaction, list); + if (!list_is_first(&trans->transaction->list, &fs_info->trans_list)) { + prev_trans = list_prev_entry(trans->transaction, list); refcount_inc(&prev_trans->use_count); } spin_unlock(&fs_info->trans_lock); @@ -1440,14 +1436,14 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans, */ mutex_lock(&fs_info->unused_bg_unpin_mutex); if (prev_trans) { - ret = clear_extent_bits(&prev_trans->pinned_extents, start, end, - EXTENT_DIRTY); + ret = btrfs_clear_extent_bits(&prev_trans->pinned_extents, start, end, + EXTENT_DIRTY); if (ret) goto out; } - ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end, - EXTENT_DIRTY); + ret = btrfs_clear_extent_bits(&trans->transaction->pinned_extents, start, end, + EXTENT_DIRTY); out: mutex_unlock(&fs_info->unused_bg_unpin_mutex); if (prev_trans) @@ -1457,6 +1453,32 @@ out: } /* + * Link the block_group to a list via bg_list. + * + * @bg: The block_group to link to the list. + * @list: The list to link it to. + * + * Use this rather than list_add_tail() directly to ensure proper respect + * to locking and refcounting. + * + * Returns: true if the bg was linked with a refcount bump and false otherwise. + */ +static bool btrfs_link_bg_list(struct btrfs_block_group *bg, struct list_head *list) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + bool added = false; + + spin_lock(&fs_info->unused_bgs_lock); + if (list_empty(&bg->bg_list)) { + btrfs_get_block_group(bg); + list_add_tail(&bg->bg_list, list); + added = true; + } + spin_unlock(&fs_info->unused_bgs_lock); + return added; +} + +/* * Process the unused_bgs list and remove any that don't have any allocated * space inside of them. */ @@ -1571,8 +1593,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * drop under the "next" label for the * fs_info->unused_bgs list. 
*/ - btrfs_get_block_group(block_group); - list_add_tail(&block_group->bg_list, &retry_list); + btrfs_link_bg_list(block_group, &retry_list); trace_btrfs_skip_unused_block_group(block_group); spin_unlock(&block_group->lock); @@ -1823,7 +1844,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp); while (!list_empty(&fs_info->reclaim_bgs)) { u64 zone_unusable; - u64 reclaimed; + u64 used; + u64 reserved; int ret = 0; bg = list_first_entry(&fs_info->reclaim_bgs, @@ -1887,6 +1909,17 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) up_write(&space_info->groups_sem); goto next; } + + /* + * Cache the zone_unusable value before turning the block group + * to read only. As soon as the block group is read only it's + * zone_unusable value gets moved to the block group's read-only + * bytes and isn't available for calculations anymore. We also + * cache it before unlocking the block group, to prevent races + * (reports from KCSAN and such tools) with tasks updating it. + */ + zone_unusable = bg->zone_unusable; + spin_unlock(&bg->lock); spin_unlock(&space_info->lock); @@ -1903,31 +1936,47 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) goto next; } - /* - * Cache the zone_unusable value before turning the block group - * to read only. As soon as the blog group is read only it's - * zone_unusable value gets moved to the block group's read-only - * bytes and isn't available for calculations anymore. - */ - zone_unusable = bg->zone_unusable; ret = inc_block_group_ro(bg, 0); up_write(&space_info->groups_sem); if (ret < 0) goto next; + /* + * The amount of bytes reclaimed corresponds to the sum of the + * "used" and "reserved" counters. We have set the block group + * to RO above, which prevents reservations from happening but + * we may have existing reservations for which allocation has + * not yet been done - btrfs_update_block_group() was not yet + * called, which is where we will transfer a reserved extent's + * size from the "reserved" counter to the "used" counter - this + * happens when running delayed references. When we relocate the + * chunk below, relocation first flushes dellaloc, waits for + * ordered extent completion (which is where we create delayed + * references for data extents) and commits the current + * transaction (which runs delayed references), and only after + * it does the actual work to move extents out of the block + * group. So the reported amount of reclaimed bytes is + * effectively the sum of the 'used' and 'reserved' counters. 
+ */ + spin_lock(&bg->lock); + used = bg->used; + reserved = bg->reserved; + spin_unlock(&bg->lock); + btrfs_info(fs_info, - "reclaiming chunk %llu with %llu%% used %llu%% unusable", + "reclaiming chunk %llu with %llu%% used %llu%% reserved %llu%% unusable", bg->start, - div64_u64(bg->used * 100, bg->length), + div64_u64(used * 100, bg->length), + div64_u64(reserved * 100, bg->length), div64_u64(zone_unusable * 100, bg->length)); trace_btrfs_reclaim_block_group(bg); - reclaimed = bg->used; ret = btrfs_relocate_chunk(fs_info, bg->start); if (ret) { btrfs_dec_block_group_ro(bg); btrfs_err(fs_info, "error relocating chunk %llu", bg->start); - reclaimed = 0; + used = 0; + reserved = 0; spin_lock(&space_info->lock); space_info->reclaim_errors++; if (READ_ONCE(space_info->periodic_reclaim)) @@ -1936,24 +1985,13 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) } spin_lock(&space_info->lock); space_info->reclaim_count++; - space_info->reclaim_bytes += reclaimed; + space_info->reclaim_bytes += used; + space_info->reclaim_bytes += reserved; spin_unlock(&space_info->lock); next: - if (ret && !READ_ONCE(space_info->periodic_reclaim)) { - /* Refcount held by the reclaim_bgs list after splice. */ - spin_lock(&fs_info->unused_bgs_lock); - /* - * This block group might be added to the unused list - * during the above process. Move it back to the - * reclaim list otherwise. - */ - if (list_empty(&bg->bg_list)) { - btrfs_get_block_group(bg); - list_add_tail(&bg->bg_list, &retry_list); - } - spin_unlock(&fs_info->unused_bgs_lock); - } + if (ret && !READ_ONCE(space_info->periodic_reclaim)) + btrfs_link_bg_list(bg, &retry_list); btrfs_put_block_group(bg); mutex_unlock(&fs_info->reclaim_bgs_lock); @@ -1993,13 +2031,8 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = bg->fs_info; - spin_lock(&fs_info->unused_bgs_lock); - if (list_empty(&bg->bg_list)) { - btrfs_get_block_group(bg); + if (btrfs_link_bg_list(bg, &fs_info->reclaim_bgs)) trace_btrfs_add_reclaim_block_group(bg); - list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs); - } - spin_unlock(&fs_info->unused_bgs_lock); } static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key, @@ -2182,9 +2215,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) if (cache->start < BTRFS_SUPER_INFO_OFFSET) { stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start; cache->bytes_super += stripe_len; - ret = set_extent_bit(&fs_info->excluded_extents, cache->start, - cache->start + stripe_len - 1, - EXTENT_UPTODATE, NULL); + ret = btrfs_set_extent_bit(&fs_info->excluded_extents, cache->start, + cache->start + stripe_len - 1, + EXTENT_DIRTY, NULL); if (ret) return ret; } @@ -2210,9 +2243,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) cache->start + cache->length - logical[nr]); cache->bytes_super += len; - ret = set_extent_bit(&fs_info->excluded_extents, logical[nr], - logical[nr] + len - 1, - EXTENT_UPTODATE, NULL); + ret = btrfs_set_extent_bit(&fs_info->excluded_extents, + logical[nr], logical[nr] + len - 1, + EXTENT_DIRTY, NULL); if (ret) { kfree(logical); return ret; @@ -2337,6 +2370,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, cache->commit_used = cache->used; cache->flags = btrfs_stack_block_group_flags(bgi); cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); + cache->space_info = btrfs_find_space_info(info, cache->flags); set_free_space_tree_thresholds(cache); @@ -2410,11 +2444,12 @@ static int 
read_one_block_group(struct btrfs_fs_info *info, goto error; } - ret = btrfs_add_block_group_cache(info, cache); + ret = btrfs_add_block_group_cache(cache); if (ret) { btrfs_remove_free_space_cache(cache); goto error; } + trace_btrfs_add_block_group(info, cache, 0); btrfs_add_bg_to_space_info(info, cache); @@ -2459,7 +2494,8 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) bg->cached = BTRFS_CACHE_FINISHED; bg->used = map->chunk_len; bg->flags = map->type; - ret = btrfs_add_block_group_cache(fs_info, bg); + bg->space_info = btrfs_find_space_info(fs_info, bg->flags); + ret = btrfs_add_block_group_cache(bg); /* * We may have some valid block group cache added already, in * that case we skip to the next one. @@ -2509,8 +2545,8 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) return fill_dummy_bgs(info); key.objectid = 0; - key.offset = 0; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + key.offset = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2641,7 +2677,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dev_extent *extent; struct extent_buffer *leaf; struct btrfs_key key; @@ -2658,7 +2694,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, key.offset = start; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent)); if (ret) - goto out; + return ret; leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); @@ -2666,10 +2702,8 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, btrfs_set_dev_extent_chunk_objectid(leaf, extent, BTRFS_FIRST_CHUNK_TREE_OBJECTID); btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); - btrfs_set_dev_extent_length(leaf, extent, num_bytes); -out: - btrfs_free_path(path); + return ret; } @@ -2771,8 +2805,12 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) /* Already aborted the transaction if it failed. */ next: btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); + + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); + btrfs_put_block_group(block_group); + spin_unlock(&fs_info->unused_bgs_lock); /* * If the block group is still unused, add it to the list of @@ -2830,8 +2868,8 @@ static u64 calculate_global_root_id(const struct btrfs_fs_info *fs_info, u64 off } struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, - u64 type, - u64 chunk_offset, u64 size) + struct btrfs_space_info *space_info, + u64 type, u64 chunk_offset, u64 size) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *cache; @@ -2885,10 +2923,10 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran * assigned to our block group. We want our bg to be added to the rbtree * with its ->space_info set. 
*/ - cache->space_info = btrfs_find_space_info(fs_info, cache->flags); + cache->space_info = space_info; ASSERT(cache->space_info); - ret = btrfs_add_block_group_cache(fs_info, cache); + ret = btrfs_add_block_group_cache(cache); if (ret) { btrfs_remove_free_space_cache(cache); btrfs_put_block_group(cache); @@ -2910,7 +2948,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran } #endif - list_add_tail(&cache->bg_list, &trans->new_bgs); + btrfs_link_bg_list(cache, &trans->new_bgs); btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info); set_avail_alloc_bits(fs_info, type); @@ -2930,6 +2968,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, bool do_chunk_alloc) { struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_space_info *space_info = cache->space_info; struct btrfs_trans_handle *trans; struct btrfs_root *root = btrfs_block_group_root(fs_info); u64 alloc_flags; @@ -2982,7 +3021,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, */ alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); if (alloc_flags != cache->flags) { - ret = btrfs_chunk_alloc(trans, alloc_flags, + ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE); /* * ENOSPC is allowed here, we may have enough space @@ -3010,15 +3049,15 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM)) goto unlock_out; - alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags); - ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags); + ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; /* * We have allocated a new chunk. We also need to activate that chunk to * grant metadata tickets for zoned filesystem. 
*/ - ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true); + ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true); if (ret < 0) goto out; @@ -3306,7 +3345,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *cache, *tmp; struct btrfs_transaction *cur_trans = trans->transaction; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); if (list_empty(&cur_trans->dirty_bgs) || !btrfs_test_opt(fs_info, SPACE_CACHE)) @@ -3323,7 +3362,6 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) cache_save_setup(cache, trans, path); } - btrfs_free_path(path); return 0; } @@ -3346,7 +3384,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; int should_put; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); LIST_HEAD(dirty); struct list_head *io = &cur_trans->io_bgs; int loops = 0; @@ -3501,7 +3539,6 @@ out: btrfs_cleanup_dirty_bgs(cur_trans, fs_info); } - btrfs_free_path(path); return ret; } @@ -3512,7 +3549,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; int should_put; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct list_head *io = &cur_trans->io_bgs; path = btrfs_alloc_path(); @@ -3624,7 +3661,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) btrfs_put_block_group(cache); } - btrfs_free_path(path); return ret; } @@ -3703,8 +3739,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&space_info->lock); - set_extent_bit(&trans->transaction->pinned_extents, bytenr, - bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); + btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr, + bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); } spin_lock(&trans->transaction->dirty_bgs_lock); @@ -3793,17 +3829,17 @@ out: /* * Update the block_group and space info counters. * - * @cache: The cache we are manipulating - * @num_bytes: The number of bytes in question - * @delalloc: The blocks are allocated for the delalloc write + * @cache: The cache we are manipulating. + * @num_bytes: The number of bytes in question. + * @is_delalloc: Whether the blocks are allocated for a delalloc write. * * This is called by somebody who is freeing space that was never actually used * on disk. For example if you reserve some space for a new leaf in transaction * A and before transaction A commits you free that leaf, you call this with * reserve set to 0 in order to clear the reservation. 
*/ -void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, - u64 num_bytes, int delalloc) +void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, + bool is_delalloc) { struct btrfs_space_info *space_info = cache->space_info; @@ -3817,7 +3853,7 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, space_info->bytes_reserved -= num_bytes; space_info->max_extent_size = 0; - if (delalloc) + if (is_delalloc) cache->delalloc_bytes -= num_bytes; spin_unlock(&cache->lock); @@ -3836,14 +3872,14 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) } } -static int should_alloc_chunk(const struct btrfs_fs_info *fs_info, - const struct btrfs_space_info *sinfo, int force) +static bool should_alloc_chunk(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *sinfo, int force) { u64 bytes_used = btrfs_space_info_used(sinfo, false); u64 thresh; if (force == CHUNK_ALLOC_FORCE) - return 1; + return true; /* * in limited mode, we want to have some free space up to @@ -3854,22 +3890,31 @@ static int should_alloc_chunk(const struct btrfs_fs_info *fs_info, thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1)); if (sinfo->total_bytes - bytes_used < thresh) - return 1; + return true; } if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80)) - return 0; - return 1; + return false; + return true; } int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) { u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type); + struct btrfs_space_info *space_info; - return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + space_info = btrfs_find_space_info(trans->fs_info, type); + if (!space_info) { + DEBUG_WARN(); + return -EINVAL; + } + + return btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE); } -static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) +static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_space_info *space_info, + u64 flags) { struct btrfs_block_group *bg; int ret; @@ -3882,7 +3927,7 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans */ check_system_chunk(trans, flags); - bg = btrfs_create_chunk(trans, flags); + bg = btrfs_create_chunk(trans, space_info, flags); if (IS_ERR(bg)) { ret = PTR_ERR(bg); goto out; @@ -3930,8 +3975,16 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans if (ret == -ENOSPC) { const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info); struct btrfs_block_group *sys_bg; + struct btrfs_space_info *sys_space_info; + + sys_space_info = btrfs_find_space_info(trans->fs_info, sys_flags); + if (!sys_space_info) { + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } - sys_bg = btrfs_create_chunk(trans, sys_flags); + sys_bg = btrfs_create_chunk(trans, sys_space_info, sys_flags); if (IS_ERR(sys_bg)) { ret = PTR_ERR(sys_bg); btrfs_abort_transaction(trans, ret); @@ -4062,6 +4115,8 @@ out: * * This function, btrfs_chunk_alloc(), belongs to phase 1. * + * @space_info: specify which space_info the new chunk should belong to. + * * If @force is CHUNK_ALLOC_FORCE: * - return 1 if it successfully allocates a chunk, * - return errors including -ENOSPC otherwise. @@ -4070,11 +4125,11 @@ out: * - return 1 if it successfully allocates a chunk, * - return errors including -ENOSPC otherwise. 
*/ -int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, +int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_space_info *space_info, u64 flags, enum btrfs_chunk_alloc_enum force) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_space_info *space_info; struct btrfs_block_group *ret_bg; bool wait_for_alloc = false; bool should_alloc = false; @@ -4113,9 +4168,6 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, if (flags & BTRFS_BLOCK_GROUP_SYSTEM) return -ENOSPC; - space_info = btrfs_find_space_info(fs_info, flags); - ASSERT(space_info); - do { spin_lock(&space_info->lock); if (force < space_info->force_alloc) @@ -4176,7 +4228,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, force_metadata_allocation(fs_info); } - ret_bg = do_chunk_alloc(trans, flags); + ret_bg = do_chunk_alloc(trans, space_info, flags); trans->allocating_chunk = false; if (IS_ERR(ret_bg)) { @@ -4252,6 +4304,10 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans, if (left < bytes) { u64 flags = btrfs_system_alloc_profile(fs_info); struct btrfs_block_group *bg; + struct btrfs_space_info *space_info; + + space_info = btrfs_find_space_info(fs_info, flags); + ASSERT(space_info); /* * Ignore failure to create system chunk. We might end up not @@ -4259,7 +4315,7 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans, * the paths we visit in the chunk tree (they were already COWed * or created in the current transaction for example). */ - bg = btrfs_create_chunk(trans, flags); + bg = btrfs_create_chunk(trans, space_info, flags); if (IS_ERR(bg)) { ret = PTR_ERR(bg); } else { @@ -4367,6 +4423,43 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info) } } +static void check_removing_space_info(struct btrfs_space_info *space_info) +{ + struct btrfs_fs_info *info = space_info->fs_info; + + if (space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY) { + /* This is a top space_info, proceed with its children first. */ + for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) { + if (space_info->sub_group[i]) { + check_removing_space_info(space_info->sub_group[i]); + kfree(space_info->sub_group[i]); + space_info->sub_group[i] = NULL; + } + } + } + + /* + * Do not hide this behind enospc_debug, this is actually important and + * indicates a real bug if this happens. + */ + if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_may_use > 0)) + btrfs_dump_space_info(info, space_info, 0, 0); + + /* + * If there was a failure to cleanup a log tree, very likely due to an + * IO failure on a writeback attempt of one or more of its extent + * buffers, we could not do proper (and cheap) unaccounting of their + * reserved space, so don't warn on bytes_reserved > 0 in that case. 
+ */ + if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || + !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { + if (WARN_ON(space_info->bytes_reserved > 0)) + btrfs_dump_space_info(info, space_info, 0, 0); + } + + WARN_ON(space_info->reclaim_size > 0); +} + /* * Must be called only after stopping all workers, since we could have block * group caching kthreads running, and therefore they could race with us if we @@ -4392,8 +4485,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) write_lock(&info->block_group_cache_lock); while (!list_empty(&info->caching_block_groups)) { - caching_ctl = list_entry(info->caching_block_groups.next, - struct btrfs_caching_control, list); + caching_ctl = list_first_entry(&info->caching_block_groups, + struct btrfs_caching_control, list); list_del(&caching_ctl->list); btrfs_put_caching_control(caching_ctl); } @@ -4464,32 +4557,10 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) btrfs_release_global_block_rsv(info); while (!list_empty(&info->space_info)) { - space_info = list_entry(info->space_info.next, - struct btrfs_space_info, - list); - - /* - * Do not hide this behind enospc_debug, this is actually - * important and indicates a real bug if this happens. - */ - if (WARN_ON(space_info->bytes_pinned > 0 || - space_info->bytes_may_use > 0)) - btrfs_dump_space_info(info, space_info, 0, 0); - - /* - * If there was a failure to cleanup a log tree, very likely due - * to an IO failure on a writeback attempt of one or more of its - * extent buffers, we could not do proper (and cheap) unaccounting - * of their reserved space, so don't warn on bytes_reserved > 0 in - * that case. - */ - if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || - !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { - if (WARN_ON(space_info->bytes_reserved > 0)) - btrfs_dump_space_info(info, space_info, 0, 0); - } + space_info = list_first_entry(&info->space_info, + struct btrfs_space_info, list); - WARN_ON(space_info->reclaim_size > 0); + check_removing_space_info(space_info); list_del(&space_info->list); btrfs_sysfs_remove_space_info(space_info); } diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 36937eeab9b8..9de356bcb411 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -326,8 +326,8 @@ void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info); void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg); int btrfs_read_block_groups(struct btrfs_fs_info *info); struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, - u64 type, - u64 chunk_offset, u64 size); + struct btrfs_space_info *space_info, + u64 type, u64 chunk_offset, u64 size); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans); int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, bool do_chunk_alloc); @@ -340,9 +340,10 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, u64 ram_bytes, u64 num_bytes, int delalloc, bool force_wrong_size_class); -void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, - u64 num_bytes, int delalloc); -int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, +void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, + bool is_delalloc); +int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_space_info *space_info, u64 flags, enum btrfs_chunk_alloc_enum force); int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type); void check_system_chunk(struct btrfs_trans_handle 
*trans, const u64 type); diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 3f3608299c0b..5ad6de738aee 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -418,6 +418,9 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root) case BTRFS_CHUNK_TREE_OBJECTID: root->block_rsv = &fs_info->chunk_block_rsv; break; + case BTRFS_TREE_LOG_OBJECTID: + root->block_rsv = &fs_info->treelog_rsv; + break; default: root->block_rsv = NULL; break; @@ -438,6 +441,14 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->delayed_block_rsv.space_info = space_info; fs_info->delayed_refs_rsv.space_info = space_info; + /* The treelog_rsv uses a dedicated space_info on the zoned mode. */ + if (!btrfs_is_zoned(fs_info)) { + fs_info->treelog_rsv.space_info = space_info; + } else { + ASSERT(space_info->sub_group[0]->subgroup_id == BTRFS_SUB_GROUP_TREELOG); + fs_info->treelog_rsv.space_info = space_info->sub_group[0]; + } + btrfs_update_global_block_rsv(fs_info); } diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index d12b1fac5c74..79ae9d05cd91 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -24,6 +24,7 @@ enum btrfs_rsv_type { BTRFS_BLOCK_RSV_CHUNK, BTRFS_BLOCK_RSV_DELOPS, BTRFS_BLOCK_RSV_DELREFS, + BTRFS_BLOCK_RSV_TREELOG, BTRFS_BLOCK_RSV_EMPTY, BTRFS_BLOCK_RSV_TEMP, }; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b2fa33911c28..a79fa0726f1d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -145,6 +145,7 @@ struct btrfs_inode { * different from prop_compress and takes precedence if set. */ u8 defrag_compress; + s8 defrag_compress_level; /* * Lock for counters and all fields used to determine if the inode is in @@ -516,15 +517,23 @@ static inline void btrfs_assert_inode_locked(struct btrfs_inode *inode) lockdep_assert_held(&inode->vfs_inode.i_rwsem); } +static inline void btrfs_update_inode_mapping_flags(struct btrfs_inode *inode) +{ + if (inode->flags & BTRFS_INODE_NODATASUM) + mapping_clear_stable_writes(inode->vfs_inode.i_mapping); + else + mapping_set_stable_writes(inode->vfs_inode.i_mapping); +} + /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes -int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, - u32 pgoff, u8 *csum, const u8 * const csum_expected); +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum, + const u8 * const csum_expected); bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u32 bio_offset, struct bio_vec *bv); -noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, +noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, bool nowait); @@ -538,8 +547,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, const struct fscrypt_str *name, int add_backref, u64 index); int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry); -int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, - int front); +int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end); int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, @@ -584,9 +592,9 @@ void btrfs_free_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); int 
__init btrfs_init_cachep(void); void __cold btrfs_destroy_cachep(void); -struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, - struct btrfs_path *path); -struct inode *btrfs_iget(u64 ino, struct btrfs_root *root); +struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, + struct btrfs_path *path); +struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct folio *folio, u64 start, u64 len); int btrfs_update_inode(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 0c4d486c3048..48d07939fee4 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -285,12 +285,12 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb) unsigned long index = cb->start >> PAGE_SHIFT; unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct folio_batch fbatch; - const int error = blk_status_to_errno(cb->bbio.bio.bi_status); int i; int ret; - if (error) - mapping_set_error(inode->i_mapping, error); + ret = blk_status_to_errno(cb->bbio.bio.bi_status); + if (ret) + mapping_set_error(inode->i_mapping, ret); folio_batch_init(&fbatch); while (index <= end_index) { @@ -499,9 +499,9 @@ static noinline int add_ra_bio_pages(struct inode *inode, } page_end = (pg_index << PAGE_SHIFT) + folio_size(folio) - 1; - lock_extent(tree, cur, page_end, NULL); + btrfs_lock_extent(tree, cur, page_end, NULL); read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); + em = btrfs_lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); read_unlock(&em_tree->lock); /* @@ -510,20 +510,20 @@ static noinline int add_ra_bio_pages(struct inode *inode, * to this compressed extent on disk. 
*/ if (!em || cur < em->start || - (cur + fs_info->sectorsize > extent_map_end(em)) || - (extent_map_block_start(em) >> SECTOR_SHIFT) != + (cur + fs_info->sectorsize > btrfs_extent_map_end(em)) || + (btrfs_extent_map_block_start(em) >> SECTOR_SHIFT) != orig_bio->bi_iter.bi_sector) { - free_extent_map(em); - unlock_extent(tree, cur, page_end, NULL); + btrfs_free_extent_map(em); + btrfs_unlock_extent(tree, cur, page_end, NULL); folio_unlock(folio); folio_put(folio); break; } add_size = min(em->start + em->len, page_end + 1) - cur; - free_extent_map(em); - unlock_extent(tree, cur, page_end, NULL); + btrfs_free_extent_map(em); + btrfs_unlock_extent(tree, cur, page_end, NULL); - if (folio->index == end_index) { + if (folio_contains(folio, end_index)) { size_t zero_offset = offset_in_folio(folio, isize); if (zero_offset) { @@ -576,19 +576,19 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) struct extent_map *em; unsigned long pflags; int memstall = 0; - blk_status_t ret; - int ret2; + blk_status_t status; + int ret; /* we need the actual starting offset of this extent in the file */ read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize); + em = btrfs_lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize); read_unlock(&em_tree->lock); if (!em) { - ret = BLK_STS_IOERR; + status = BLK_STS_IOERR; goto out; } - ASSERT(extent_map_is_compressed(em)); + ASSERT(btrfs_extent_map_is_compressed(em)); compressed_len = em->disk_num_bytes; cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ, @@ -600,21 +600,21 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) cb->len = bbio->bio.bi_iter.bi_size; cb->compressed_len = compressed_len; - cb->compress_type = extent_map_compression(em); + cb->compress_type = btrfs_extent_map_compression(em); cb->orig_bbio = bbio; - free_extent_map(em); + btrfs_free_extent_map(em); cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE); - cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct page *), GFP_NOFS); + cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS); if (!cb->compressed_folios) { - ret = BLK_STS_RESOURCE; + status = BLK_STS_RESOURCE; goto out_free_bio; } - ret2 = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios); - if (ret2) { - ret = BLK_STS_RESOURCE; + ret = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios); + if (ret) { + status = BLK_STS_RESOURCE; goto out_free_compressed_pages; } @@ -637,7 +637,7 @@ out_free_compressed_pages: out_free_bio: bio_put(&cb->bbio.bio); out: - btrfs_bio_end_io(bbio, ret); + btrfs_bio_end_io(bbio, status); } /* @@ -740,7 +740,7 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zstd_compress, }; -static struct list_head *alloc_workspace(int type, unsigned int level) +static struct list_head *alloc_workspace(int type, int level) { switch (type) { case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(); @@ -818,7 +818,7 @@ static void btrfs_cleanup_workspace_manager(int type) * Preallocation makes a forward progress guarantees and we do not return * errors. 
*/ -struct list_head *btrfs_get_workspace(int type, unsigned int level) +struct list_head *btrfs_get_workspace(int type, int level) { struct workspace_manager *wsm; struct list_head *workspace; @@ -968,18 +968,28 @@ static void put_workspace(int type, struct list_head *ws) * Adjust @level according to the limits of the compression algorithm or * fallback to default */ -static unsigned int btrfs_compress_set_level(int type, unsigned level) +static int btrfs_compress_set_level(unsigned int type, int level) { const struct btrfs_compress_op *ops = btrfs_compress_op[type]; if (level == 0) level = ops->default_level; else - level = min(level, ops->max_level); + level = min(max(level, ops->min_level), ops->max_level); return level; } +/* + * Check whether the @level is within the valid range for the given type. + */ +bool btrfs_compress_level_valid(unsigned int type, int level) +{ + const struct btrfs_compress_op *ops = btrfs_compress_op[type]; + + return ops->min_level <= level && level <= ops->max_level; +} + /* Wrapper around find_get_page(), with extra error message. */ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, struct folio **in_folio_ret) @@ -1023,12 +1033,10 @@ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, * @total_out is an in/out parameter, must be set to the input length and will * be also used to return the total number of compressed bytes */ -int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, +int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { - int type = btrfs_compress_type(type_level); - int level = btrfs_compress_level(type_level); const unsigned long orig_len = *total_out; struct list_head *workspace; int ret; @@ -1130,6 +1138,22 @@ void __cold btrfs_exit_compress(void) } /* + * The bvec is a single page bvec from a bio that contains folios from a filemap. + * + * Since the folio may be a large one, and if the bv_page is not a head page of + * a large folio, then page->index is unreliable. + * + * Thus we need this helper to grab the proper file offset. + */ +static u64 file_offset_from_bvec(const struct bio_vec *bvec) +{ + const struct page *page = bvec->bv_page; + const struct folio *folio = page_folio(page); + + return (page_pgoff(folio, page) << PAGE_SHIFT) + bvec->bv_offset; +} + +/* * Copy decompressed data from working buffer to pages. * * @buf: The decompressed data buffer @@ -1174,13 +1198,14 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len, u32 copy_start; /* Offset inside the full decompressed extent */ u32 bvec_offset; + void *kaddr; bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter); /* * cb->start may underflow, but subtracting that value can still * give us correct offset inside the full decompressed extent. */ - bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start; + bvec_offset = file_offset_from_bvec(&bvec) - cb->start; /* Haven't reached the bvec range, exit */ if (decompressed + buf_len <= bvec_offset) @@ -1196,10 +1221,12 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len, * @buf + @buf_len. 
*/ ASSERT(copy_start - decompressed < buf_len); - memcpy_to_page(bvec.bv_page, bvec.bv_offset, - buf + copy_start - decompressed, copy_len); - cur_offset += copy_len; + kaddr = bvec_kmap_local(&bvec); + memcpy(kaddr, buf + copy_start - decompressed, copy_len); + kunmap_local(kaddr); + + cur_offset += copy_len; bio_advance(orig_bio, copy_len); /* Finished the bio */ if (!orig_bio->bi_iter.bi_size) @@ -1590,18 +1617,19 @@ out: /* * Convert the compression suffix (eg. after "zlib" starting with ":") to - * level, unrecognized string will set the default level + * level, unrecognized string will set the default level. Negative level + * numbers are allowed. */ -unsigned int btrfs_compress_str2level(unsigned int type, const char *str) +int btrfs_compress_str2level(unsigned int type, const char *str) { - unsigned int level = 0; + int level = 0; int ret; if (!type) return 0; if (str[0] == ':') { - ret = kstrtouint(str + 1, 10, &level); + ret = kstrtoint(str + 1, 10, &level); if (ret) level = 0; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 954034086d0d..d34c4341eaf4 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -11,7 +11,9 @@ #include <linux/list.h> #include <linux/workqueue.h> #include <linux/wait.h> +#include <linux/pagemap.h> #include "bio.h" +#include "messages.h" struct address_space; struct page; @@ -72,28 +74,22 @@ struct compressed_bio { struct btrfs_bio bbio; }; -static inline unsigned int btrfs_compress_type(unsigned int type_level) -{ - return (type_level & 0xF); -} - -static inline unsigned int btrfs_compress_level(unsigned int type_level) -{ - return ((type_level & 0xF0) >> 4); -} - /* @range_end must be exclusive. */ -static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur) +static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u64 cur) { - u64 page_end = round_down(cur, PAGE_SIZE) + PAGE_SIZE; + const u64 folio_end = folio_pos(folio) + folio_size(folio); - return min(range_end, page_end) - cur; + /* @cur must be inside the folio. 
*/ + ASSERT(folio_pos(folio) <= cur); + ASSERT(cur < folio_end); + return min(range_end, folio_end) - cur; } int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); -int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, +bool btrfs_compress_level_valid(unsigned int type, int level); +int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, @@ -107,7 +103,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, bool writeback); void btrfs_submit_compressed_read(struct btrfs_bio *bbio); -unsigned int btrfs_compress_str2level(unsigned int type, const char *str); +int btrfs_compress_str2level(unsigned int type, const char *str); struct folio *btrfs_alloc_compr_folio(void); void btrfs_free_compr_folio(struct folio *folio); @@ -131,14 +127,15 @@ struct workspace_manager { wait_queue_head_t ws_wait; }; -struct list_head *btrfs_get_workspace(int type, unsigned int level); +struct list_head *btrfs_get_workspace(int type, int level); void btrfs_put_workspace(int type, struct list_head *ws); struct btrfs_compress_op { struct workspace_manager *workspace_manager; /* Maximum level supported by the compression algorithm */ - unsigned int max_level; - unsigned int default_level; + int min_level; + int max_level; + int default_level; }; /* The heuristic workspaces are managed via the 0th workspace manager */ @@ -187,9 +184,9 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in, size_t destlen); void zstd_init_workspace_manager(void); void zstd_cleanup_workspace_manager(void); -struct list_head *zstd_alloc_workspace(unsigned int level); +struct list_head *zstd_alloc_workspace(int level); void zstd_free_workspace(struct list_head *ws); -struct list_head *zstd_get_workspace(unsigned int level); +struct list_head *zstd_get_workspace(int level); void zstd_put_workspace(struct list_head *ws); #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 3dc5a35dd19b..a2e7979372cc 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4306,7 +4306,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 data_size) { int ret = 0; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; unsigned long ptr; @@ -4320,7 +4320,6 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, write_extent_buffer(leaf, data, ptr, data_size); btrfs_mark_buffer_dirty(trans, leaf); } - btrfs_free_path(path); return ret; } @@ -4608,7 +4607,6 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, u64 min_trans) { struct extent_buffer *cur; - struct btrfs_key found_key; int slot; int sret; u32 nritems; @@ -4644,7 +4642,8 @@ again: goto find_next_key; ret = 0; path->slots[level] = slot; - btrfs_item_key_to_cpu(cur, &found_key, slot); + /* Save our key for returning back. 
*/ + btrfs_item_key_to_cpu(cur, min_key, slot); goto out; } if (sret && slot > 0) @@ -4668,8 +4667,8 @@ find_next_key: * we didn't find a candidate key in this node, walk forward * and find another one */ + path->slots[level] = slot; if (slot >= nritems) { - path->slots[level] = slot; sret = btrfs_find_next_key(root, path, min_key, level, min_trans); if (sret == 0) { @@ -4679,11 +4678,10 @@ find_next_key: goto out; } } - /* save our key for returning back */ - btrfs_node_key_to_cpu(cur, &found_key, slot); - path->slots[level] = slot; if (level == path->lowest_level) { ret = 0; + /* Save our key for returning back. */ + btrfs_node_key_to_cpu(cur, min_key, slot); goto out; } cur = btrfs_read_node_slot(cur, slot); @@ -4700,10 +4698,8 @@ find_next_key: } out: path->keep_locks = keep_locks; - if (ret == 0) { + if (ret == 0) btrfs_unlock_up_safe(path, path->lowest_level + 1); - memcpy(min_key, &found_key, sizeof(found_key)); - } return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1096a80a64e7..71fa42ca04fe 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -6,7 +6,7 @@ #ifndef BTRFS_CTREE_H #define BTRFS_CTREE_H -#include "linux/cleanup.h" +#include <linux/cleanup.h> #include <linux/spinlock.h> #include <linux/rbtree.h> #include <linux/mutex.h> @@ -61,7 +61,6 @@ struct btrfs_path { /* if there is real range locking, this locks field will change */ u8 locks[BTRFS_MAX_LEVEL]; u8 reada; - /* keep some upper locks as we walk down */ u8 lowest_level; /* @@ -69,6 +68,7 @@ struct btrfs_path { * and to force calls to keep space in the nodes */ unsigned int search_for_split:1; + /* Keep some upper locks as we walk down. */ unsigned int keep_locks:1; unsigned int skip_locking:1; unsigned int search_commit_root:1; diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 968dae953948..1831618579cb 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -105,15 +105,15 @@ static int btrfs_insert_inode_defrag(struct btrfs_inode *inode, return 0; } -static inline int need_auto_defrag(struct btrfs_fs_info *fs_info) +static inline bool need_auto_defrag(struct btrfs_fs_info *fs_info) { if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) - return 0; + return false; if (btrfs_fs_closing(fs_info)) - return 0; + return false; - return 1; + return true; } /* @@ -191,10 +191,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode( if (parent && compare_inode_defrag(&tmp, entry) > 0) { parent = rb_next(parent); - if (parent) - entry = rb_entry(parent, struct inode_defrag, rb_node); - else - entry = NULL; + entry = rb_entry_safe(parent, struct inode_defrag, rb_node); } out: if (entry) @@ -225,7 +222,7 @@ static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct file_ra_state *ra) { struct btrfs_root *inode_root; - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_ioctl_defrag_range_args range; int ret = 0; u64 cur = 0; @@ -250,24 +247,24 @@ again: goto cleanup; } - if (cur >= i_size_read(inode)) { - iput(inode); + if (cur >= i_size_read(&inode->vfs_inode)) { + iput(&inode->vfs_inode); goto cleanup; } /* Do a chunk of defrag */ - clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); + clear_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); memset(&range, 0, sizeof(range)); range.len = (u64)-1; range.start = cur; range.extent_thresh = defrag->extent_thresh; - file_ra_state_init(ra, inode->i_mapping); + file_ra_state_init(ra, inode->vfs_inode.i_mapping); sb_start_write(fs_info->sb); ret = btrfs_defrag_file(inode, ra, &range, defrag->transid, - BTRFS_DEFRAG_BATCH); + 
BTRFS_DEFRAG_BATCH); sb_end_write(fs_info->sb); - iput(inode); + iput(&inode->vfs_inode); if (ret < 0) goto cleanup; @@ -624,7 +621,7 @@ static struct extent_map *defrag_get_extent(struct btrfs_inode *inode, u64 ino = btrfs_ino(inode); int ret; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { ret = -ENOMEM; goto err; @@ -734,12 +731,12 @@ next: not_found: btrfs_release_path(&path); - free_extent_map(em); + btrfs_free_extent_map(em); return NULL; err: btrfs_release_path(&path); - free_extent_map(em); + btrfs_free_extent_map(em); return ERR_PTR(ret); } @@ -756,7 +753,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, * full extent lock. */ read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, sectorsize); + em = btrfs_lookup_extent_mapping(em_tree, start, sectorsize); read_unlock(&em_tree->lock); /* @@ -769,7 +766,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, * file extent items in the inode's subvolume tree). */ if (em && (em->flags & EXTENT_FLAG_MERGED)) { - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; } @@ -779,10 +776,10 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, /* Get the big lock and read metadata off disk. */ if (!locked) - lock_extent(io_tree, start, end, &cached); + btrfs_lock_extent(io_tree, start, end, &cached); em = defrag_get_extent(BTRFS_I(inode), start, newer_than); if (!locked) - unlock_extent(io_tree, start, end, &cached); + btrfs_unlock_extent(io_tree, start, end, &cached); if (IS_ERR(em)) return NULL; @@ -794,7 +791,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, const struct extent_map *em) { - if (extent_map_is_compressed(em)) + if (btrfs_extent_map_is_compressed(em)) return BTRFS_MAX_COMPRESSED; return fs_info->max_extent_size; } @@ -837,7 +834,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, ret = true; out: - free_extent_map(next); + btrfs_free_extent_map(next); return ret; } @@ -857,13 +854,14 @@ static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t { struct address_space *mapping = inode->vfs_inode.i_mapping; gfp_t mask = btrfs_alloc_write_mask(mapping); - u64 page_start = (u64)index << PAGE_SHIFT; - u64 page_end = page_start + PAGE_SIZE - 1; + u64 folio_start; + u64 folio_end; struct extent_state *cached_state = NULL; struct folio *folio; int ret; again: + /* TODO: Add order fgp order flags when large folios are fully enabled. */ folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); if (IS_ERR(folio)) @@ -871,13 +869,16 @@ again: /* * Since we can defragment files opened read-only, we can encounter - * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We - * can't do I/O using huge pages yet, so return an error for now. + * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). + * + * The IO for such large folios is not fully tested, thus return + * an error to reject such folios unless it's an experimental build. + * * Filesystem transparent huge pages are typically only used for * executables that explicitly enable them, so this isn't very * restrictive. 
*/ - if (folio_test_large(folio)) { + if (!IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && folio_test_large(folio)) { folio_unlock(folio); folio_put(folio); return ERR_PTR(-ETXTBSY); @@ -890,14 +891,15 @@ again: return ERR_PTR(ret); } + folio_start = folio_pos(folio); + folio_end = folio_pos(folio) + folio_size(folio) - 1; /* Wait for any existing ordered extent in the range */ while (1) { struct btrfs_ordered_extent *ordered; - lock_extent(&inode->io_tree, page_start, page_end, &cached_state); - ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); - unlock_extent(&inode->io_tree, page_start, page_end, - &cached_state); + btrfs_lock_extent(&inode->io_tree, folio_start, folio_end, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, folio_start, folio_size(folio)); + btrfs_unlock_extent(&inode->io_tree, folio_start, folio_end, &cached_state); if (!ordered) break; @@ -1027,8 +1029,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode, * very likely resulting in a larger extent after writeback is * triggered (except in a case of free space fragmentation). */ - if (test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1, - EXTENT_DELALLOC)) + if (btrfs_test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1, + EXTENT_DELALLOC)) goto next; /* @@ -1066,8 +1068,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode, /* Empty target list, no way to merge with last entry */ if (list_empty(target_list)) goto next; - last = list_entry(target_list->prev, - struct defrag_target_range, list); + last = list_last_entry(target_list, + struct defrag_target_range, list); /* Not mergeable with last entry */ if (last->start + last->len != cur) goto next; @@ -1077,7 +1079,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, add: last_is_target = true; - range_len = min(extent_map_end(em), start + len) - cur; + range_len = min(btrfs_extent_map_end(em), start + len) - cur; /* * This one is a good target, check if it can be merged into * last range of the target list. 
@@ -1085,8 +1087,8 @@ add: if (!list_empty(target_list)) { struct defrag_target_range *last; - last = list_entry(target_list->prev, - struct defrag_target_range, list); + last = list_last_entry(target_list, + struct defrag_target_range, list); ASSERT(last->start + last->len <= cur); if (last->start + last->len == cur) { /* Mergeable, enlarge the last entry */ @@ -1099,7 +1101,7 @@ add: /* Allocate new defrag_target_range */ new = kmalloc(sizeof(*new), GFP_NOFS); if (!new) { - free_extent_map(em); + btrfs_free_extent_map(em); ret = -ENOMEM; break; } @@ -1108,8 +1110,8 @@ add: list_add_tail(&new->list, target_list); next: - cur = extent_map_end(em); - free_extent_map(em); + cur = btrfs_extent_map_end(em); + btrfs_free_extent_map(em); } if (ret < 0) { struct defrag_target_range *entry; @@ -1162,27 +1164,31 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, struct extent_changeset *data_reserved = NULL; const u64 start = target->start; const u64 len = target->len; - unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; - unsigned long start_index = start >> PAGE_SHIFT; - unsigned long first_index = folios[0]->index; int ret = 0; - int i; - - ASSERT(last_index - first_index + 1 <= nr_pages); ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len); if (ret < 0) return ret; - clear_extent_bit(&inode->io_tree, start, start + len - 1, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, cached_state); - set_extent_bit(&inode->io_tree, start, start + len - 1, - EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state); - - /* Update the page status */ - for (i = start_index - first_index; i <= last_index - first_index; i++) { - folio_clear_checked(folios[i]); - btrfs_folio_clamp_set_dirty(fs_info, folios[i], start, len); + btrfs_clear_extent_bit(&inode->io_tree, start, start + len - 1, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, cached_state); + btrfs_set_extent_bit(&inode->io_tree, start, start + len - 1, + EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state); + + /* + * Update the page status. + * Due to possible large folios, we have to check all folios one by one. 
+ */ + for (int i = 0; i < nr_pages && folios[i]; i++) { + struct folio *folio = folios[i]; + + if (!folio) + break; + if (start >= folio_pos(folio) + folio_size(folio) || + start + len <= folio_pos(folio)) + continue; + btrfs_folio_clamp_clear_checked(fs_info, folio, start, len); + btrfs_folio_clamp_set_dirty(fs_info, folio, start, len); } btrfs_delalloc_release_extents(inode, len); extent_changeset_free(data_reserved); @@ -1200,11 +1206,10 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, LIST_HEAD(target_list); struct folio **folios; const u32 sectorsize = inode->root->fs_info->sectorsize; - u64 last_index = (start + len - 1) >> PAGE_SHIFT; - u64 start_index = start >> PAGE_SHIFT; - unsigned int nr_pages = last_index - start_index + 1; + u64 cur = start; + const unsigned int nr_pages = ((start + len - 1) >> PAGE_SHIFT) - + (start >> PAGE_SHIFT) + 1; int ret = 0; - int i; ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); @@ -1214,21 +1219,25 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, return -ENOMEM; /* Prepare all pages */ - for (i = 0; i < nr_pages; i++) { - folios[i] = defrag_prepare_one_folio(inode, start_index + i); + for (int i = 0; cur < start + len && i < nr_pages; i++) { + folios[i] = defrag_prepare_one_folio(inode, cur >> PAGE_SHIFT); if (IS_ERR(folios[i])) { ret = PTR_ERR(folios[i]); - nr_pages = i; + folios[i] = NULL; goto free_folios; } + cur = folio_pos(folios[i]) + folio_size(folios[i]); } - for (i = 0; i < nr_pages; i++) + for (int i = 0; i < nr_pages; i++) { + if (!folios[i]) + break; folio_wait_writeback(folios[i]); + } + /* We should get at least one folio. */ + ASSERT(folios[0]); /* Lock the pages range */ - lock_extent(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, - &cached_state); + btrfs_lock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state); /* * Now we have a consistent view about the extent map, re-check * which range really needs to be defragged. @@ -1254,11 +1263,11 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, kfree(entry); } unlock_extent: - unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, - &cached_state); + btrfs_unlock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state); free_folios: - for (i = 0; i < nr_pages; i++) { + for (int i = 0; i < nr_pages; i++) { + if (!folios[i]) + break; folio_unlock(folios[i]); folio_put(folios[i]); } @@ -1352,17 +1361,18 @@ out: * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without * defragging all the range). 
 */ -int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, +int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long sectors_defragged = 0; - u64 isize = i_size_read(inode); + u64 isize = i_size_read(&inode->vfs_inode); u64 cur; u64 last_byte; bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS); int compress_type = BTRFS_COMPRESS_ZLIB; + int compress_level = 0; int ret = 0; u32 extent_thresh = range->extent_thresh; pgoff_t start_index;
@@ -1376,10 +1386,21 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, return -EINVAL; if (do_compress) { - if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) - return -EINVAL; - if (range->compress_type) - compress_type = range->compress_type; + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL) { + if (range->compress.type >= BTRFS_NR_COMPRESS_TYPES) + return -EINVAL; + if (range->compress.type) { + compress_type = range->compress.type; + compress_level = range->compress.level; + if (!btrfs_compress_level_valid(compress_type, compress_level)) + return -EINVAL; + } + } else { + if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) + return -EINVAL; + if (range->compress_type) + compress_type = range->compress_type; + } } if (extent_thresh == 0)
@@ -1402,8 +1423,8 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, * defrag range can be written sequentially. */ start_index = cur >> PAGE_SHIFT; - if (start_index < inode->i_mapping->writeback_index) - inode->i_mapping->writeback_index = start_index; + if (start_index < inode->vfs_inode.i_mapping->writeback_index) + inode->vfs_inode.i_mapping->writeback_index = start_index; while (cur < last_byte) { const unsigned long prev_sectors_defragged = sectors_defragged;
@@ -1420,27 +1441,29 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; cluster_end = min(cluster_end, last_byte); - btrfs_inode_lock(BTRFS_I(inode), 0); - if (IS_SWAPFILE(inode)) { + btrfs_inode_lock(inode, 0); + if (IS_SWAPFILE(&inode->vfs_inode)) { ret = -ETXTBSY; - btrfs_inode_unlock(BTRFS_I(inode), 0); + btrfs_inode_unlock(inode, 0); break; } - if (!(inode->i_sb->s_flags & SB_ACTIVE)) { - btrfs_inode_unlock(BTRFS_I(inode), 0); + if (!(inode->vfs_inode.i_sb->s_flags & SB_ACTIVE)) { + btrfs_inode_unlock(inode, 0); break; } - if (do_compress) - BTRFS_I(inode)->defrag_compress = compress_type; - ret = defrag_one_cluster(BTRFS_I(inode), ra, cur, + if (do_compress) { + inode->defrag_compress = compress_type; + inode->defrag_compress_level = compress_level; + } + ret = defrag_one_cluster(inode, ra, cur, cluster_end + 1 - cur, extent_thresh, newer_than, do_compress, &sectors_defragged, max_to_defrag, &last_scanned); if (sectors_defragged > prev_sectors_defragged) - balance_dirty_pages_ratelimited(inode->i_mapping); + balance_dirty_pages_ratelimited(inode->vfs_inode.i_mapping); - btrfs_inode_unlock(BTRFS_I(inode), 0); + btrfs_inode_unlock(inode, 0); if (ret < 0) break; cur = max(cluster_end + 1, last_scanned);
@@ -1462,10 +1485,10 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, * need to be written back immediately. 
*/ if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { - filemap_flush(inode->i_mapping); + filemap_flush(inode->vfs_inode.i_mapping); if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_flush(inode->i_mapping); + &inode->runtime_flags)) + filemap_flush(inode->vfs_inode.i_mapping); } if (range->compress_type == BTRFS_COMPRESS_LZO) btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); @@ -1474,9 +1497,9 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, ret = sectors_defragged; } if (do_compress) { - btrfs_inode_lock(BTRFS_I(inode), 0); - BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; - btrfs_inode_unlock(BTRFS_I(inode), 0); + btrfs_inode_lock(inode, 0); + inode->defrag_compress = BTRFS_COMPRESS_NONE; + btrfs_inode_unlock(inode, 0); } return ret; } diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h index 6b7596c4f0dc..a7f917a38dbf 100644 --- a/fs/btrfs/defrag.h +++ b/fs/btrfs/defrag.h @@ -6,14 +6,14 @@ #include <linux/types.h> #include <linux/compiler_types.h> -struct inode; struct file_ra_state; +struct btrfs_inode; struct btrfs_fs_info; struct btrfs_root; struct btrfs_trans_handle; struct btrfs_ioctl_defrag_range_args; -int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, +int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag); int __init btrfs_auto_defrag_init(void); diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 88e900e5a43d..288e1776c02d 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -111,6 +111,18 @@ * making error handling and cleanup easier. */ +static inline struct btrfs_space_info *data_sinfo_for_inode(const struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + if (btrfs_is_zoned(fs_info) && btrfs_is_data_reloc_root(inode->root)) { + ASSERT(fs_info->data_sinfo->sub_group[0]->subgroup_id == + BTRFS_SUB_GROUP_DATA_RELOC); + return fs_info->data_sinfo->sub_group[0]; + } + return fs_info->data_sinfo; +} + int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes) { struct btrfs_root *root = inode->root; @@ -123,7 +135,7 @@ int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes) if (btrfs_is_free_space_inode(inode)) flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE; - return btrfs_reserve_data_bytes(fs_info, bytes, flush); + return btrfs_reserve_data_bytes(data_sinfo_for_inode(inode), bytes, flush); } int btrfs_check_data_free_space(struct btrfs_inode *inode, @@ -144,14 +156,14 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode, else if (btrfs_is_free_space_inode(inode)) flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE; - ret = btrfs_reserve_data_bytes(fs_info, len, flush); + ret = btrfs_reserve_data_bytes(data_sinfo_for_inode(inode), len, flush); if (ret < 0) return ret; /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); if (ret < 0) { - btrfs_free_reserved_data_space_noquota(fs_info, len); + btrfs_free_reserved_data_space_noquota(inode, len); extent_changeset_free(*reserved); *reserved = NULL; } else { @@ -168,15 +180,13 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode, * which we can't sleep and is sure it won't affect qgroup reserved space. * Like clear_bit_hook(). 
*/ -void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, - u64 len) +void btrfs_free_reserved_data_space_noquota(struct btrfs_inode *inode, u64 len) { - struct btrfs_space_info *data_sinfo; + struct btrfs_fs_info *fs_info = inode->root->fs_info; ASSERT(IS_ALIGNED(len, fs_info->sectorsize)); - data_sinfo = fs_info->data_sinfo; - btrfs_space_info_free_bytes_may_use(data_sinfo, len); + btrfs_space_info_free_bytes_may_use(data_sinfo_for_inode(inode), len); } /* @@ -196,7 +206,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, round_down(start, fs_info->sectorsize); start = round_down(start, fs_info->sectorsize); - btrfs_free_reserved_data_space_noquota(fs_info, len); + btrfs_free_reserved_data_space_noquota(inode, len); btrfs_qgroup_free_data(inode, reserved, start, len, NULL); } @@ -439,6 +449,29 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) btrfs_inode_rsv_release(inode, true); } +/* Shrink a previously reserved extent to a new length. */ +void btrfs_delalloc_shrink_extents(struct btrfs_inode *inode, u64 reserved_len, u64 new_len) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 reserved_num_extents = count_max_extents(fs_info, reserved_len); + const u32 new_num_extents = count_max_extents(fs_info, new_len); + const int diff_num_extents = new_num_extents - reserved_num_extents; + + ASSERT(new_len <= reserved_len); + if (new_num_extents == reserved_num_extents) + return; + + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, diff_num_extents); + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + + if (btrfs_is_testing(fs_info)) + return; + + btrfs_inode_rsv_release(inode, true); +} + /* * Reserve data and metadata space for delalloc * diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h index 3f32953c0a80..6119c0d3f883 100644 --- a/fs/btrfs/delalloc-space.h +++ b/fs/btrfs/delalloc-space.h @@ -18,8 +18,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, void btrfs_delalloc_release_space(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, bool qgroup_free); -void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, - u64 len); +void btrfs_free_reserved_data_space_noquota(struct btrfs_inode *inode, u64 len); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, bool qgroup_free); int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, @@ -27,5 +26,6 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, u64 disk_num_bytes, bool noflush); void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); +void btrfs_delalloc_shrink_extents(struct btrfs_inode *inode, u64 reserved_len, u64 new_len); #endif /* BTRFS_DELALLOC_SPACE_H */ diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0b4933c6a889..c7cc24a5dd5e 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -119,7 +119,12 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( return NULL; } -/* Will return either the node or PTR_ERR(-ENOMEM) */ +/* + * Look up an existing delayed node associated with @btrfs_inode or create a new + * one and insert it to the delayed nodes of the root. + * + * Return the delayed node, or error pointer on failure. 
+ */ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( struct btrfs_inode *btrfs_inode) { @@ -211,17 +216,13 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, static struct btrfs_delayed_node *btrfs_first_delayed_node( struct btrfs_delayed_root *delayed_root) { - struct list_head *p; - struct btrfs_delayed_node *node = NULL; + struct btrfs_delayed_node *node; spin_lock(&delayed_root->lock); - if (list_empty(&delayed_root->node_list)) - goto out; - - p = delayed_root->node_list.next; - node = list_entry(p, struct btrfs_delayed_node, n_list); - refcount_inc(&node->refs); -out: + node = list_first_entry_or_null(&delayed_root->node_list, + struct btrfs_delayed_node, n_list); + if (node) + refcount_inc(&node->refs); spin_unlock(&delayed_root->lock); return node; @@ -293,18 +294,15 @@ static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node) static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( struct btrfs_delayed_root *delayed_root) { - struct list_head *p; - struct btrfs_delayed_node *node = NULL; + struct btrfs_delayed_node *node; spin_lock(&delayed_root->lock); - if (list_empty(&delayed_root->prepare_list)) - goto out; - - p = delayed_root->prepare_list.next; - list_del_init(p); - node = list_entry(p, struct btrfs_delayed_node, p_list); - refcount_inc(&node->refs); -out: + node = list_first_entry_or_null(&delayed_root->prepare_list, + struct btrfs_delayed_node, p_list); + if (node) { + list_del_init(&node->p_list); + refcount_inc(&node->refs); + } spin_unlock(&delayed_root->lock); return node; @@ -454,40 +452,25 @@ static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item( struct btrfs_delayed_node *delayed_node) { - struct rb_node *p; - struct btrfs_delayed_item *item = NULL; + struct rb_node *p = rb_first_cached(&delayed_node->ins_root); - p = rb_first_cached(&delayed_node->ins_root); - if (p) - item = rb_entry(p, struct btrfs_delayed_item, rb_node); - - return item; + return rb_entry_safe(p, struct btrfs_delayed_item, rb_node); } static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item( struct btrfs_delayed_node *delayed_node) { - struct rb_node *p; - struct btrfs_delayed_item *item = NULL; + struct rb_node *p = rb_first_cached(&delayed_node->del_root); - p = rb_first_cached(&delayed_node->del_root); - if (p) - item = rb_entry(p, struct btrfs_delayed_item, rb_node); - - return item; + return rb_entry_safe(p, struct btrfs_delayed_item, rb_node); } static struct btrfs_delayed_item *__btrfs_next_delayed_item( struct btrfs_delayed_item *item) { - struct rb_node *p; - struct btrfs_delayed_item *next = NULL; + struct rb_node *p = rb_next(&item->rb_node); - p = rb_next(&item->rb_node); - if (p) - next = rb_entry(p, struct btrfs_delayed_item, rb_node); - - return next; + return rb_entry_safe(p, struct btrfs_delayed_item, rb_node); } static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, @@ -1211,7 +1194,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_block_rsv *block_rsv; int ret; @@ -1238,7 +1221,6 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node); btrfs_release_delayed_node(delayed_node); - 
btrfs_free_path(path); trans->block_rsv = block_rsv; return ret; @@ -1398,17 +1380,17 @@ void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info) WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root)); } -static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) +static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) { int val = atomic_read(&delayed_root->items_seq); if (val < seq || val >= seq + BTRFS_DELAYED_BATCH) - return 1; + return true; if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) - return 1; + return true; - return 0; + return false; } void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) @@ -1817,53 +1799,53 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, static void fill_stack_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item, - struct inode *inode) + struct btrfs_inode *inode) { + struct inode *vfs_inode = &inode->vfs_inode; u64 flags; - btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode)); - btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode)); - btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size); - btrfs_set_stack_inode_mode(inode_item, inode->i_mode); - btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink); - btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode)); - btrfs_set_stack_inode_generation(inode_item, - BTRFS_I(inode)->generation); + btrfs_set_stack_inode_uid(inode_item, i_uid_read(vfs_inode)); + btrfs_set_stack_inode_gid(inode_item, i_gid_read(vfs_inode)); + btrfs_set_stack_inode_size(inode_item, inode->disk_i_size); + btrfs_set_stack_inode_mode(inode_item, vfs_inode->i_mode); + btrfs_set_stack_inode_nlink(inode_item, vfs_inode->i_nlink); + btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(vfs_inode)); + btrfs_set_stack_inode_generation(inode_item, inode->generation); btrfs_set_stack_inode_sequence(inode_item, - inode_peek_iversion(inode)); + inode_peek_iversion(vfs_inode)); btrfs_set_stack_inode_transid(inode_item, trans->transid); - btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); - flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, - BTRFS_I(inode)->ro_flags); + btrfs_set_stack_inode_rdev(inode_item, vfs_inode->i_rdev); + flags = btrfs_inode_combine_flags(inode->flags, inode->ro_flags); btrfs_set_stack_inode_flags(inode_item, flags); btrfs_set_stack_inode_block_group(inode_item, 0); btrfs_set_stack_timespec_sec(&inode_item->atime, - inode_get_atime_sec(inode)); + inode_get_atime_sec(vfs_inode)); btrfs_set_stack_timespec_nsec(&inode_item->atime, - inode_get_atime_nsec(inode)); + inode_get_atime_nsec(vfs_inode)); btrfs_set_stack_timespec_sec(&inode_item->mtime, - inode_get_mtime_sec(inode)); + inode_get_mtime_sec(vfs_inode)); btrfs_set_stack_timespec_nsec(&inode_item->mtime, - inode_get_mtime_nsec(inode)); + inode_get_mtime_nsec(vfs_inode)); btrfs_set_stack_timespec_sec(&inode_item->ctime, - inode_get_ctime_sec(inode)); + inode_get_ctime_sec(vfs_inode)); btrfs_set_stack_timespec_nsec(&inode_item->ctime, - inode_get_ctime_nsec(inode)); + inode_get_ctime_nsec(vfs_inode)); - btrfs_set_stack_timespec_sec(&inode_item->otime, BTRFS_I(inode)->i_otime_sec); - btrfs_set_stack_timespec_nsec(&inode_item->otime, BTRFS_I(inode)->i_otime_nsec); + btrfs_set_stack_timespec_sec(&inode_item->otime, inode->i_otime_sec); + btrfs_set_stack_timespec_nsec(&inode_item->otime, inode->i_otime_nsec); } -int btrfs_fill_inode(struct inode *inode, u32 *rdev) +int btrfs_fill_inode(struct btrfs_inode *inode, u32 
*rdev) { - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_delayed_node *delayed_node; struct btrfs_inode_item *inode_item; + struct inode *vfs_inode = &inode->vfs_inode; - delayed_node = btrfs_get_delayed_node(BTRFS_I(inode)); + delayed_node = btrfs_get_delayed_node(inode); if (!delayed_node) return -ENOENT; @@ -1876,39 +1858,38 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) inode_item = &delayed_node->inode_item; - i_uid_write(inode, btrfs_stack_inode_uid(inode_item)); - i_gid_write(inode, btrfs_stack_inode_gid(inode_item)); - btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item)); - btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, - round_up(i_size_read(inode), fs_info->sectorsize)); - inode->i_mode = btrfs_stack_inode_mode(inode_item); - set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); - inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); - BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); - BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item); - - inode_set_iversion_queried(inode, - btrfs_stack_inode_sequence(inode_item)); - inode->i_rdev = 0; + i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item)); + i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item)); + btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); + btrfs_inode_set_file_extent_range(inode, 0, + round_up(i_size_read(vfs_inode), fs_info->sectorsize)); + vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item); + set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item)); + inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item)); + inode->generation = btrfs_stack_inode_generation(inode_item); + inode->last_trans = btrfs_stack_inode_transid(inode_item); + + inode_set_iversion_queried(vfs_inode, btrfs_stack_inode_sequence(inode_item)); + vfs_inode->i_rdev = 0; *rdev = btrfs_stack_inode_rdev(inode_item); btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item), - &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); + &inode->flags, &inode->ro_flags); - inode_set_atime(inode, btrfs_stack_timespec_sec(&inode_item->atime), + inode_set_atime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->atime), btrfs_stack_timespec_nsec(&inode_item->atime)); - inode_set_mtime(inode, btrfs_stack_timespec_sec(&inode_item->mtime), + inode_set_mtime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->mtime), btrfs_stack_timespec_nsec(&inode_item->mtime)); - inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime), + inode_set_ctime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->ctime), btrfs_stack_timespec_nsec(&inode_item->ctime)); - BTRFS_I(inode)->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime); - BTRFS_I(inode)->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime); + inode->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime); + inode->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime); - inode->i_generation = BTRFS_I(inode)->generation; - if (S_ISDIR(inode->i_mode)) - BTRFS_I(inode)->index_cnt = (u64)-1; + vfs_inode->i_generation = inode->generation; + if (S_ISDIR(vfs_inode->i_mode)) + inode->index_cnt = (u64)-1; mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node); @@ -1928,8 +1909,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, mutex_lock(&delayed_node->mutex); if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { - fill_stack_inode_item(trans, 
&delayed_node->inode_item, - &inode->vfs_inode); + fill_stack_inode_item(trans, &delayed_node->inode_item, inode); goto release_node; } @@ -1937,7 +1917,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, if (ret) goto release_node; - fill_stack_inode_item(trans, &delayed_node->inode_item, &inode->vfs_inode); + fill_stack_inode_item(trans, &delayed_node->inode_item, inode); set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count++; atomic_inc(&root->fs_info->delayed_root->items); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index f4d9feac0d0e..c4b4ba122beb 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -133,7 +133,7 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode); int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); -int btrfs_fill_inode(struct inode *inode, u32 *rdev); +int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev); int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode); /* Used for drop dead root */ diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 98c5b61dabe8..739c9e29aaa3 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -331,12 +331,9 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, struct btrfs_delayed_ref_node *ins) { struct rb_node *node = &ins->ref_node; - struct rb_node *exist; + struct rb_node *exist = rb_find_add_cached(node, root, cmp_refs_node); - exist = rb_find_add_cached(node, root, cmp_refs_node); - if (exist) - return rb_entry(exist, struct btrfs_delayed_ref_node, ref_node); - return NULL; + return rb_entry_safe(exist, struct btrfs_delayed_ref_node, ref_node); } static struct btrfs_delayed_ref_head *find_first_ref_head( @@ -1339,7 +1336,7 @@ int __init btrfs_delayed_ref_init(void) { btrfs_delayed_ref_head_cachep = KMEM_CACHE(btrfs_delayed_ref_head, 0); if (!btrfs_delayed_ref_head_cachep) - goto fail; + return -ENOMEM; btrfs_delayed_ref_node_cachep = KMEM_CACHE(btrfs_delayed_ref_node, 0); if (!btrfs_delayed_ref_node_cachep) diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index a35067cebb97..78cc23837610 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -14,6 +14,8 @@ #include <linux/spinlock.h> #include <linux/slab.h> #include <uapi/linux/btrfs_tree.h> +#include "fs.h" +#include "messages.h" struct btrfs_trans_handle; struct btrfs_fs_info; @@ -260,7 +262,6 @@ enum btrfs_ref_type { BTRFS_REF_NOT_SET, BTRFS_REF_DATA, BTRFS_REF_METADATA, - BTRFS_REF_LAST, } __packed; struct btrfs_ref { diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index f86fbea0b3de..2decb9fff445 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -76,7 +76,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) struct extent_buffer *eb; int slot; int ret = 0; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); int item_size; struct btrfs_dev_replace_item *ptr; u64 src_devid; @@ -85,10 +85,8 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) return 0; path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; key.objectid = 0; key.type = BTRFS_DEV_REPLACE_KEY; @@ -103,10 +101,8 @@ no_valid_dev_replace_entry_found: if (btrfs_find_device(fs_info->fs_devices, &args)) { btrfs_err(fs_info, "found replace target device without a valid replace item"); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } - ret = 0; dev_replace->replace_state = 
BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; dev_replace->cont_reading_from_srcdev_mode = @@ -123,7 +119,7 @@ no_valid_dev_replace_entry_found: dev_replace->tgtdev = NULL; dev_replace->is_valid = 0; dev_replace->item_needs_writeback = 0; - goto out; + return 0; } slot = path->slots[0]; eb = path->nodes[0]; @@ -226,8 +222,6 @@ no_valid_dev_replace_entry_found: break; } -out: - btrfs_free_path(path); return ret; } @@ -346,7 +340,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_root *dev_root = fs_info->dev_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *eb; struct btrfs_dev_replace_item *ptr; @@ -365,16 +359,15 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) key.offset = 0; path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { btrfs_warn(fs_info, "error %d while searching for dev_replace item!", ret); - goto out; + return ret; } if (ret == 0 && @@ -395,7 +388,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) btrfs_warn(fs_info, "delete too small dev_replace item failed %d!", ret); - goto out; + return ret; } ret = 1; } @@ -408,7 +401,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) if (ret < 0) { btrfs_warn(fs_info, "insert dev_replace item failed %d!", ret); - goto out; + return ret; } } @@ -440,8 +433,6 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) dev_replace->cursor_right); dev_replace->item_needs_writeback = 0; up_write(&dev_replace->rwsem); -out: - btrfs_free_path(path); return ret; } @@ -646,7 +637,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: - ASSERT(0); + DEBUG_WARN("unexpected STARTED ot SUSPENDED dev-replace state"); ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; up_write(&dev_replace->rwsem); goto leave; @@ -803,17 +794,17 @@ static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, lockdep_assert_held(&srcdev->fs_info->chunk_mutex); - while (find_first_extent_bit(&srcdev->alloc_state, start, - &found_start, &found_end, - CHUNK_ALLOCATED, &cached_state)) { - ret = set_extent_bit(&tgtdev->alloc_state, found_start, - found_end, CHUNK_ALLOCATED, NULL); + while (btrfs_find_first_extent_bit(&srcdev->alloc_state, start, + &found_start, &found_end, + CHUNK_ALLOCATED, &cached_state)) { + ret = btrfs_set_extent_bit(&tgtdev->alloc_state, found_start, + found_end, CHUNK_ALLOCATED, NULL); if (ret) break; start = found_end + 1; } - free_extent_state(cached_state); + btrfs_free_extent_state(cached_state); return ret; } @@ -1274,16 +1265,16 @@ static int btrfs_dev_replace_kthread(void *data) return 0; } -int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) +bool __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) { if (!dev_replace->is_valid) - return 0; + return false; switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: - return 0; + return false; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: /* @@ -1298,7 +1289,7 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) */ break; } - 
return 1; + return true; } void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount) diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 23e480efe5e6..b35cecf388f2 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -25,7 +25,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); -int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); +bool __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, struct btrfs_block_group *cache, u64 physical); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index ccf91de29f80..b29cc31a7c4a 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -236,7 +236,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, int data_size; struct extent_buffer *leaf; int slot; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) @@ -251,20 +251,17 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, if (IS_ERR(di)) { ret = PTR_ERR(di); /* Nothing found, we're safe */ - if (ret == -ENOENT) { - ret = 0; - goto out; - } + if (ret == -ENOENT) + return 0; if (ret < 0) - goto out; + return ret; } /* we found an item, look for our name in the item */ if (di) { /* our exact name was found */ - ret = -EEXIST; - goto out; + return -EEXIST; } /* See if there is room in the item to insert this name. */ @@ -273,14 +270,11 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, slot = path->slots[0]; if (data_size + btrfs_item_size(leaf, slot) + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) { - ret = -EOVERFLOW; - } else { - /* plenty of insertion room */ - ret = 0; + return -EOVERFLOW; } -out: - btrfs_free_path(path); - return ret; + + /* Plenty of insertion room. */ + return 0; } /* diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h index 28d69970bc70..8462579a95f4 100644 --- a/fs/btrfs/dir-item.h +++ b/fs/btrfs/dir-item.h @@ -10,6 +10,7 @@ struct fscrypt_str; struct btrfs_fs_info; struct btrfs_key; struct btrfs_path; +struct btrfs_inode; struct btrfs_root; struct btrfs_trans_handle; diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index 8567af46e16f..fe9a4bd7e6e6 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -42,21 +42,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, /* Direct lock must be taken before the extent lock. 
*/ if (nowait) { - if (!try_lock_dio_extent(io_tree, lockstart, lockend, cached_state)) + if (!btrfs_try_lock_dio_extent(io_tree, lockstart, lockend, cached_state)) return -EAGAIN; } else { - lock_dio_extent(io_tree, lockstart, lockend, cached_state); + btrfs_lock_dio_extent(io_tree, lockstart, lockend, cached_state); } while (1) { if (nowait) { - if (!try_lock_extent(io_tree, lockstart, lockend, - cached_state)) { + if (!btrfs_try_lock_extent(io_tree, lockstart, lockend, + cached_state)) { ret = -EAGAIN; break; } } else { - lock_extent(io_tree, lockstart, lockend, cached_state); + btrfs_lock_extent(io_tree, lockstart, lockend, cached_state); } /* * We're concerned with the entire range that we're going to be @@ -78,7 +78,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, lockstart, lockend))) break; - unlock_extent(io_tree, lockstart, lockend, cached_state); + btrfs_unlock_extent(io_tree, lockstart, lockend, cached_state); if (ordered) { if (nowait) { @@ -131,7 +131,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, } if (ret) - unlock_dio_extent(io_tree, lockstart, lockend, cached_state); + btrfs_unlock_dio_extent(io_tree, lockstart, lockend, cached_state); return ret; } @@ -151,11 +151,11 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, } ordered = btrfs_alloc_ordered_extent(inode, start, file_extent, - (1 << type) | - (1 << BTRFS_ORDERED_DIRECT)); + (1U << type) | + (1U << BTRFS_ORDERED_DIRECT)); if (IS_ERR(ordered)) { if (em) { - free_extent_map(em); + btrfs_free_extent_map(em); btrfs_drop_extent_map_range(inode, start, start + file_extent->num_bytes - 1, false); } @@ -204,8 +204,7 @@ again: BTRFS_ORDERED_REGULAR); btrfs_dec_block_group_reservations(fs_info, ins.objectid); if (IS_ERR(em)) - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, - 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); return em; } @@ -246,9 +245,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, else type = BTRFS_ORDERED_NOCOW; len = min(len, em->len - (start - em->start)); - block_start = extent_map_block_start(em) + (start - em->start); + block_start = btrfs_extent_map_block_start(em) + (start - em->start); - if (can_nocow_extent(inode, start, &len, &file_extent, false) == 1) { + if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent, + false) == 1) { bg = btrfs_inc_nocow_writers(fs_info, block_start); if (bg) can_nocow = true; @@ -264,7 +264,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, nowait); if (ret < 0) { /* Our caller expects us to free the input extent map. */ - free_extent_map(em); + btrfs_free_extent_map(em); *map = NULL; btrfs_dec_nocow_writers(bg); if (nowait && (ret == -ENOSPC || ret == -EDQUOT)) @@ -277,7 +277,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, &file_extent, type); btrfs_dec_nocow_writers(bg); if (type == BTRFS_ORDERED_PREALLOC) { - free_extent_map(em); + btrfs_free_extent_map(em); *map = em2; em = em2; } @@ -290,7 +290,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, dio_data->nocow_done = true; } else { /* Our caller expects us to free the input extent map. 
*/ - free_extent_map(em); + btrfs_free_extent_map(em); *map = NULL; if (nowait) { @@ -439,8 +439,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, start, data_alloc_len, false); if (!ret) dio_data->data_space_reserved = true; - else if (ret && !(BTRFS_I(inode)->flags & - (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) + else if (!(BTRFS_I(inode)->flags & + (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) goto err; } @@ -473,8 +473,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, * to buffered IO. Don't blame me, this is the price we pay for using * the generic code. */ - if (extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) { - free_extent_map(em); + if (btrfs_extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) { + btrfs_free_extent_map(em); /* * If we are in a NOWAIT context, return -EAGAIN in order to * fallback to buffered IO. This is not only because we can @@ -515,7 +515,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, * after we have submitted bios for all the extents in the range. */ if ((flags & IOMAP_NOWAIT) && len < length) { - free_extent_map(em); + btrfs_free_extent_map(em); ret = -EAGAIN; goto unlock_err; } @@ -557,13 +557,13 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; } else { - iomap->addr = extent_map_block_start(em) + (start - em->start); + iomap->addr = btrfs_extent_map_block_start(em) + (start - em->start); iomap->type = IOMAP_MAPPED; } iomap->offset = start; iomap->bdev = fs_info->fs_devices->latest_dev->bdev; iomap->length = len; - free_extent_map(em); + btrfs_free_extent_map(em); /* * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed, @@ -574,13 +574,13 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (write) unlock_bits |= EXTENT_DIO_LOCKED; - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - unlock_bits, &cached_state); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + unlock_bits, &cached_state); /* We didn't use everything, unlock the dio extent for the remainder. */ if (!write && (start + len) < lockend) - unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len, - lockend, NULL); + btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len, + lockend, NULL); return 0; @@ -590,8 +590,8 @@ unlock_err: * to update this, be explicit that we expect EXTENT_LOCKED and * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing. 
*/ - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state); err: if (dio_data->data_space_reserved) { btrfs_free_reserved_data_space(BTRFS_I(inode), @@ -614,8 +614,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, if (!write && (iomap->type == IOMAP_HOLE)) { /* If reading from a hole, unlock and return */ - unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1, NULL); + btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, NULL); return 0; } @@ -626,8 +626,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, btrfs_finish_ordered_extent(dio_data->ordered, NULL, pos, length, false); else - unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1, NULL); + btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, NULL); ret = -ENOTBLK; } if (write) { @@ -659,8 +659,8 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio) dip->file_offset, dip->bytes, !bio->bi_status); } else { - unlock_dio_extent(&inode->io_tree, dip->file_offset, - dip->file_offset + dip->bytes - 1, NULL); + btrfs_unlock_dio_extent(&inode->io_tree, dip->file_offset, + dip->file_offset + dip->bytes - 1, NULL); } bbio->bio.bi_private = bbio->private; @@ -691,9 +691,9 @@ static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, * a pre-existing one. */ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { - ret = split_extent_map(bbio->inode, bbio->file_offset, - ordered->num_bytes, len, - ordered->disk_bytenr); + ret = btrfs_split_extent_map(bbio->inode, bbio->file_offset, + ordered->num_bytes, len, + ordered->disk_bytenr); if (ret) return ret; } @@ -855,6 +855,22 @@ relock: btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto buffered; } + /* + * We can't control the folios being passed in, applications can write + * to them while a direct IO write is in progress. This means the + * content might change after we calculated the data checksum. + * Therefore we can end up storing a checksum that doesn't match the + * persisted data. + * + * To be extra safe and avoid false data checksum mismatch, if the + * inode requires data checksum, just fallback to buffered IO. + * For buffered IO we have full control of page cache and can ensure + * no one is modifying the content during writeback. + */ + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + goto buffered; + } /* * The iov_iter can be mapped to the same file range we are writing to. 
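The comment added to the direct-io write path above explains why direct writes now fall back to buffered IO on checksummed inodes: the pages backing the iov_iter stay writable by userspace while the write is in flight, so the data that was checksummed at submission may not be the data that reaches disk. Below is a minimal userspace sketch of that race. It is illustrative only and not part of the patch; the file name, buffer size and iteration count are arbitrary. Build with cc -pthread.

/*
 * Illustrative only: another thread scribbles over a buffer that is
 * currently being written with O_DIRECT, so what lands on disk can be
 * a mix of old and new bytes, unlike a buffered write where the page
 * cache content is stable during writeback.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define BUF_SIZE 4096

static char *buf;

static void *scribble(void *arg)
{
	(void)arg;
	/* Dirty the buffer while the O_DIRECT write may be in flight. */
	for (int i = 0; i < 1000; i++)
		memset(buf, 'B', BUF_SIZE);
	return NULL;
}

int main(void)
{
	pthread_t t;
	int fd = open("dio-testfile", O_CREAT | O_WRONLY | O_DIRECT, 0644);

	if (fd < 0)
		return 1;
	if (posix_memalign((void **)&buf, BUF_SIZE, BUF_SIZE))
		return 1;
	memset(buf, 'A', BUF_SIZE);

	pthread_create(&t, NULL, scribble, NULL);
	/* The persisted block may contain any mix of 'A' and 'B' bytes. */
	pwrite(fd, buf, BUF_SIZE, 0);
	pthread_join(t, NULL);
	close(fd);
	return 0;
}
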
diff --git a/fs/btrfs/direct-io.h b/fs/btrfs/direct-io.h index 3dc3ea926afe..df5d45ee6de7 100644 --- a/fs/btrfs/direct-io.h +++ b/fs/btrfs/direct-io.h @@ -5,6 +5,8 @@ #include <linux/types.h> +struct kiocb; + int __init btrfs_init_dio(void); void __cold btrfs_destroy_dio(void); diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index e815d165cccc..89fe85778115 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -94,8 +94,6 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { lockdep_assert_held(&discard_ctl->lock); - if (!btrfs_run_discard_work(discard_ctl)) - return; if (list_empty(&block_group->discard_list) || block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) { @@ -118,6 +116,9 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, if (!btrfs_is_block_group_data_only(block_group)) return; + if (!btrfs_run_discard_work(discard_ctl)) + return; + spin_lock(&discard_ctl->lock); __add_to_discard_list(discard_ctl, block_group); spin_unlock(&discard_ctl->lock); @@ -167,13 +168,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, block_group->discard_eligible_time = 0; queued = !list_empty(&block_group->discard_list); list_del_init(&block_group->discard_list); - /* - * If the block group is currently running in the discard workfn, we - * don't want to deref it, since it's still being used by the workfn. - * The workfn will notice this case and deref the block group when it is - * finished. - */ - if (queued && !running) + if (queued) btrfs_put_block_group(block_group); spin_unlock(&discard_ctl->lock); @@ -250,6 +245,20 @@ again: block_group->used != 0) { if (btrfs_is_block_group_data_only(block_group)) { __add_to_discard_list(discard_ctl, block_group); + /* + * The block group must have been moved to other + * discard list even if discard was disabled in + * the meantime or a transaction abort happened, + * otherwise we can end up in an infinite loop, + * always jumping into the 'again' label and + * keep getting this block group over and over + * in case there are no other block groups in + * the discard lists. 
+ */ + ASSERT(block_group->discard_index != + BTRFS_DISCARD_INDEX_UNUSED, + "discard_index=%d", + block_group->discard_index); } else { list_del_init(&block_group->discard_list); btrfs_put_block_group(block_group); @@ -260,9 +269,10 @@ again: block_group->discard_cursor = block_group->start; block_group->discard_state = BTRFS_DISCARD_EXTENTS; } - discard_ctl->block_group = block_group; } if (block_group) { + btrfs_get_block_group(block_group); + discard_ctl->block_group = block_group; *discard_state = block_group->discard_state; *discard_index = block_group->discard_index; } @@ -493,9 +503,20 @@ static void btrfs_discard_workfn(struct work_struct *work) block_group = peek_discard_list(discard_ctl, &discard_state, &discard_index, now); - if (!block_group || !btrfs_run_discard_work(discard_ctl)) + if (!block_group) + return; + if (!btrfs_run_discard_work(discard_ctl)) { + spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); + discard_ctl->block_group = NULL; + spin_unlock(&discard_ctl->lock); return; + } if (now < block_group->discard_eligible_time) { + spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); + discard_ctl->block_group = NULL; + spin_unlock(&discard_ctl->lock); btrfs_discard_schedule_work(discard_ctl, false); return; } @@ -547,15 +568,7 @@ static void btrfs_discard_workfn(struct work_struct *work) spin_lock(&discard_ctl->lock); discard_ctl->prev_discard = trimmed; discard_ctl->prev_discard_time = now; - /* - * If the block group was removed from the discard list while it was - * running in this workfn, then we didn't deref it, since this function - * still owned that reference. But we set the discard_ctl->block_group - * back to NULL, so we can use that condition to know that now we need - * to deref the block_group. 
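The discard refcounting above simplifies to one rule: peek_discard_list() takes its own reference before publishing discard_ctl->block_group, and whichever path clears that pointer drops the reference, so list removal no longer needs to special-case a running workfn. A condensed, illustrative view (not a literal excerpt):

/* peek_discard_list(): publish the block group with its own reference held. */
spin_lock(&discard_ctl->lock);
btrfs_get_block_group(block_group);
discard_ctl->block_group = block_group;
spin_unlock(&discard_ctl->lock);

/* btrfs_discard_workfn() and its early-exit paths: drop it when clearing. */
spin_lock(&discard_ctl->lock);
btrfs_put_block_group(block_group);
discard_ctl->block_group = NULL;
spin_unlock(&discard_ctl->lock);
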
- */ - if (discard_ctl->block_group == NULL) - btrfs_put_block_group(block_group); + btrfs_put_block_group(block_group); discard_ctl->block_group = NULL; __btrfs_discard_schedule_work(discard_ctl, now, false); spin_unlock(&discard_ctl->lock); diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h index dddb0f9101ba..2c5e85394092 100644 --- a/fs/btrfs/discard.h +++ b/fs/btrfs/discard.h @@ -3,6 +3,7 @@ #ifndef BTRFS_DISCARD_H #define BTRFS_DISCARD_H +#include <linux/types.h> #include <linux/sizes.h> struct btrfs_fs_info; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f09db62e61a1..1beb9458f622 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -182,22 +182,22 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios = num_extent_folios(eb); int ret = 0; if (sb_rdonly(fs_info->sb)) return -EROFS; - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; u64 start = max_t(u64, eb->start, folio_pos(folio)); u64 end = min_t(u64, eb->start + eb->len, folio_pos(folio) + eb->folio_size); u32 len = end - start; + phys_addr_t paddr = PFN_PHYS(folio_pfn(folio)) + + offset_in_folio(folio, start); - ret = btrfs_repair_io_failure(fs_info, 0, start, len, - start, folio, offset_in_folio(folio, start), - mirror_num); + ret = btrfs_repair_io_failure(fs_info, 0, start, len, start, + paddr, mirror_num); if (ret) break; } @@ -225,7 +225,6 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, ASSERT(check); while (1) { - clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = read_extent_buffer_pages(eb, mirror_num, check); if (!ret) break; @@ -257,7 +256,7 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, /* * Checksum a dirty tree block before IO. */ -blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) +int btree_csum_one_bio(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; @@ -268,9 +267,9 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) /* Btree blocks are always contiguous on disk. 
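btrfs_repair_io_failure() now takes a physical address instead of a folio plus offset pair; assuming the usual folio helpers, the per-folio conversion done in the loop above amounts to the following (the helper name is hypothetical):

/* Map a logical byte address inside a folio to its physical address. */
static phys_addr_t eb_byte_to_paddr(struct folio *folio, u64 start)
{
        return PFN_PHYS(folio_pfn(folio)) + offset_in_folio(folio, start);
}
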
*/ if (WARN_ON_ONCE(bbio->file_offset != eb->start)) - return BLK_STS_IOERR; + return -EIO; if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len)) - return BLK_STS_IOERR; + return -EIO; /* * If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't @@ -279,14 +278,13 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) */ if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) { memzero_extent_buffer(eb, 0, eb->len); - return BLK_STS_OK; + return 0; } if (WARN_ON_ONCE(found_start != eb->start)) - return BLK_STS_IOERR; - if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0], - eb->start, eb->len))) - return BLK_STS_IOERR; + return -EIO; + if (WARN_ON(!btrfs_meta_folio_test_uptodate(eb->folios[0], eb))) + return -EIO; ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, offsetof(struct btrfs_header, fsid), @@ -314,7 +312,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) goto error; } write_extent_buffer(eb, result, 0, fs_info->csum_size); - return BLK_STS_OK; + return 0; error: btrfs_print_tree(eb, 0); @@ -328,7 +326,7 @@ error: */ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) || btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID); - return errno_to_blk_status(ret); + return ret; } static bool check_tree_block_fsid(struct extent_buffer *eb) @@ -454,15 +452,9 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, goto out; } - /* - * If this is a leaf block and it is corrupt, set the corrupt bit so - * that we don't try and read the other copies of this block, just - * return -EIO. - */ - if (found_level == 0 && btrfs_check_leaf(eb)) { - set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); + /* If this is a leaf block and it is corrupt, just return -EIO. */ + if (found_level == 0 && btrfs_check_leaf(eb)) ret = -EIO; - } if (found_level > 0 && btrfs_check_node(eb)) ret = -EIO; @@ -643,11 +635,16 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, } -static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, - u64 objectid) +static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, + u64 objectid, gfp_t flags) { + struct btrfs_root *root; bool dummy = btrfs_is_testing(fs_info); + root = kzalloc(sizeof(*root), flags); + if (!root) + return NULL; + memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); @@ -700,10 +697,10 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, btrfs_set_root_last_log_commit(root, 0); root->anon_dev = 0; if (!dummy) { - extent_io_tree_init(fs_info, &root->dirty_log_pages, - IO_TREE_ROOT_DIRTY_LOG_PAGES); - extent_io_tree_init(fs_info, &root->log_csum_range, - IO_TREE_LOG_CSUM_RANGE); + btrfs_extent_io_tree_init(fs_info, &root->dirty_log_pages, + IO_TREE_ROOT_DIRTY_LOG_PAGES); + btrfs_extent_io_tree_init(fs_info, &root->log_csum_range, + IO_TREE_LOG_CSUM_RANGE); } spin_lock_init(&root->root_item_lock); @@ -714,14 +711,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, list_add_tail(&root->leak_list, &fs_info->allocated_roots); spin_unlock(&fs_info->fs_roots_radix_lock); #endif -} -static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, - u64 objectid, gfp_t flags) -{ - struct btrfs_root *root = kzalloc(sizeof(*root), flags); - if (root) - __setup_root(root, fs_info, objectid); return root; } @@ -1089,21 +1079,22 @@ struct btrfs_root *btrfs_read_tree_root(struct 
btrfs_root *tree_root, const struct btrfs_key *key) { struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) return ERR_PTR(-ENOMEM); root = read_tree_root_path(tree_root, path, key); - btrfs_free_path(path); return root; } /* - * Initialize subvolume root in-memory structure + * Initialize subvolume root in-memory structure. * * @anon_dev: anonymous device to attach to the root, if zero, allocate new + * + * In case of failure the caller is responsible to call btrfs_free_fs_root() */ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) { @@ -1127,7 +1118,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) if (!anon_dev) { ret = get_anon_bdev(&root->anon_dev); if (ret) - goto fail; + return ret; } else { root->anon_dev = anon_dev; } @@ -1137,7 +1128,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) ret = btrfs_init_root_free_objectid(root); if (ret) { mutex_unlock(&root->objectid_mutex); - goto fail; + return ret; } ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); @@ -1145,9 +1136,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) mutex_unlock(&root->objectid_mutex); return 0; -fail: - /* The caller is responsible to call btrfs_free_fs_root */ - return ret; } static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, @@ -1568,7 +1556,7 @@ static int transaction_kthread(void *arg) do { cannot_commit = false; - delay = msecs_to_jiffies(fs_info->commit_interval * 1000); + delay = secs_to_jiffies(fs_info->commit_interval); mutex_lock(&fs_info->transaction_kthread_mutex); spin_lock(&fs_info->trans_lock); @@ -1583,9 +1571,9 @@ static int transaction_kthread(void *arg) cur->state < TRANS_STATE_COMMIT_PREP && delta < fs_info->commit_interval) { spin_unlock(&fs_info->trans_lock); - delay -= msecs_to_jiffies((delta - 1) * 1000); + delay -= secs_to_jiffies(delta - 1); delay = min(delay, - msecs_to_jiffies(fs_info->commit_interval * 1000)); + secs_to_jiffies(fs_info->commit_interval)); goto sleep; } transid = cur->transid; @@ -1867,8 +1855,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) int i; while (!list_empty(&fs_info->dead_roots)) { - gang[0] = list_entry(fs_info->dead_roots.next, - struct btrfs_root, root_list); + gang[0] = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); list_del(&gang[0]->root_list); if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) @@ -1931,9 +1919,9 @@ static int btrfs_init_btree_inode(struct super_block *sb) inode->i_mapping->a_ops = &btree_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, - IO_TREE_BTREE_INODE_IO); - extent_map_tree_init(&BTRFS_I(inode)->extent_tree); + btrfs_extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, + IO_TREE_BTREE_INODE_IO); + btrfs_extent_map_tree_init(&BTRFS_I(inode)->extent_tree); BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); @@ -2006,7 +1994,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan", ordered_flags); fs_info->discard_ctl.discard_workers = - alloc_ordered_workqueue("btrfs_discard", WQ_FREEZABLE); + alloc_ordered_workqueue("btrfs-discard", WQ_FREEZABLE); if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->flush_workers && @@ -2200,8 +2188,8 @@ static int load_global_roots_objectid(struct 
btrfs_root *tree_root, static int load_global_roots(struct btrfs_root *tree_root) { - struct btrfs_path *path; - int ret = 0; + BTRFS_PATH_AUTO_FREE(path); + int ret; path = btrfs_alloc_path(); if (!path) @@ -2210,18 +2198,17 @@ static int load_global_roots(struct btrfs_root *tree_root) ret = load_global_roots_objectid(tree_root, path, BTRFS_EXTENT_TREE_OBJECTID, "extent"); if (ret) - goto out; + return ret; ret = load_global_roots_objectid(tree_root, path, BTRFS_CSUM_TREE_OBJECTID, "csum"); if (ret) - goto out; + return ret; if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE)) - goto out; + return ret; ret = load_global_roots_objectid(tree_root, path, BTRFS_FREE_SPACE_TREE_OBJECTID, "free space"); -out: - btrfs_free_path(path); + return ret; } @@ -2447,21 +2434,27 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, * Check sectorsize and nodesize first, other check will need it. * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. */ - if (!is_power_of_2(sectorsize) || sectorsize < 4096 || + if (!is_power_of_2(sectorsize) || sectorsize < BTRFS_MIN_BLOCKSIZE || sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); ret = -EINVAL; } /* - * We only support at most two sectorsizes: 4K and PAGE_SIZE. + * We only support at most 3 sectorsizes: 4K, PAGE_SIZE, MIN_BLOCKSIZE. + * + * For 4K page sized systems with non-debug builds, all 3 matches (4K). + * For 4K page sized systems with debug builds, there are two block sizes + * supported. (4K and 2K) * * We can support 16K sectorsize with 64K page size without problem, * but such sectorsize/pagesize combination doesn't make much sense. * 4K will be our future standard, PAGE_SIZE is supported from the very * beginning. */ - if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) { + if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && + sectorsize != PAGE_SIZE && + sectorsize != BTRFS_MIN_BLOCKSIZE)) { btrfs_err(fs_info, "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); @@ -2561,6 +2554,9 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, ret = -EINVAL; } + if (ret) + return ret; + ret = validate_sys_chunk_array(fs_info, sb); /* @@ -2765,10 +2761,21 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) return ret; } +/* + * Lockdep gets confused between our buffer_tree which requires IRQ locking because + * we modify marks in the IRQ context, and our delayed inode xarray which doesn't + * have these requirements. Use a class key so lockdep doesn't get them mixed up. + */ +static struct lock_class_key buffer_xa_class; + void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); - INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); + + /* Use the same flags as mapping->i_pages. 
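The updated sector size rule above can be read as a single predicate; a sketch (the helper name is hypothetical, and BTRFS_MIN_BLOCKSIZE is taken to be the smaller debug-only block size mentioned in the comment):

/* Illustrative condensation of the sector size checks in btrfs_validate_super(). */
static bool sectorsize_is_supported(u64 sectorsize)
{
        if (!is_power_of_2(sectorsize) || sectorsize < BTRFS_MIN_BLOCKSIZE ||
            sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE)
                return false;
        if (sectorsize > PAGE_SIZE)
                return false;
        return sectorsize == SZ_4K || sectorsize == PAGE_SIZE ||
               sectorsize == BTRFS_MIN_BLOCKSIZE;
}
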
*/ + xa_init_flags(&fs_info->buffer_tree, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); + lockdep_set_class(&fs_info->buffer_tree.xa_lock, &buffer_xa_class); + INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); @@ -2780,7 +2787,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->super_lock); - spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); spin_lock_init(&fs_info->treelog_bg_lock); spin_lock_init(&fs_info->zone_active_bgs_lock); @@ -2825,6 +2831,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) BTRFS_BLOCK_RSV_GLOBAL); btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); + btrfs_init_block_rsv(&fs_info->treelog_rsv, BTRFS_BLOCK_RSV_TREELOG); btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); btrfs_init_block_rsv(&fs_info->delayed_block_rsv, BTRFS_BLOCK_RSV_DELOPS); @@ -2858,8 +2865,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) rwlock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT_CACHED; - extent_io_tree_init(fs_info, &fs_info->excluded_extents, - IO_TREE_FS_EXCLUDED_EXTENTS); + btrfs_extent_io_tree_init(fs_info, &fs_info->excluded_extents, + IO_TREE_FS_EXCLUDED_EXTENTS); mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); @@ -3311,7 +3318,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device /* * Read super block and check the signature bytes only */ - disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev); + disk_super = btrfs_read_disk_super(fs_devices->latest_dev->bdev, 0, false); if (IS_ERR(disk_super)) { ret = PTR_ERR(disk_super); goto fail_alloc; @@ -3390,7 +3397,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); - fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; fs_info->fs_devices->fs_info = fs_info; @@ -3416,11 +3422,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize); - if (sectorsize < PAGE_SIZE) - btrfs_warn(fs_info, - "read-write for sector size %u with page size %lu is experimental", - sectorsize, PAGE_SIZE); - ret = btrfs_init_workqueues(fs_info); if (ret) goto fail_sb_buffer; @@ -3712,85 +3713,6 @@ static void btrfs_end_super_write(struct bio *bio) bio_put(bio); } -struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, - int copy_num, bool drop_cache) -{ - struct btrfs_super_block *super; - struct page *page; - u64 bytenr, bytenr_orig; - struct address_space *mapping = bdev->bd_mapping; - int ret; - - bytenr_orig = btrfs_sb_offset(copy_num); - ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr); - if (ret == -ENOENT) - return ERR_PTR(-EINVAL); - else if (ret) - return ERR_PTR(ret); - - if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev)) - return ERR_PTR(-EINVAL); - - if (drop_cache) { - /* This should only be called with the primary sb. 
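The buffer_radix to buffer_tree (xarray) switch relies on two details: the internal lock must be IRQ safe because marks are updated from IRQ context, and the xarray gets a private lockdep class so it is not conflated with the delayed inode xarray. A generic sketch of the pattern (names are hypothetical):

#include <linux/xarray.h>

static struct lock_class_key example_xa_class;  /* hypothetical class key */

static void init_irq_safe_xarray(struct xarray *xa)
{
        /* Same flags as mapping->i_pages: IRQ-safe lock, memcg-accounted nodes. */
        xa_init_flags(xa, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
        lockdep_set_class(&xa->xa_lock, &example_xa_class);
}
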
*/ - ASSERT(copy_num == 0); - - /* - * Drop the page of the primary superblock, so later read will - * always read from the device. - */ - invalidate_inode_pages2_range(mapping, - bytenr >> PAGE_SHIFT, - (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT); - } - - page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); - if (IS_ERR(page)) - return ERR_CAST(page); - - super = page_address(page); - if (btrfs_super_magic(super) != BTRFS_MAGIC) { - btrfs_release_disk_super(super); - return ERR_PTR(-ENODATA); - } - - if (btrfs_super_bytenr(super) != bytenr_orig) { - btrfs_release_disk_super(super); - return ERR_PTR(-EINVAL); - } - - return super; -} - - -struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev) -{ - struct btrfs_super_block *super, *latest = NULL; - int i; - u64 transid = 0; - - /* we would like to check all the supers, but that would make - * a btrfs mount succeed after a mkfs from a different FS. - * So, we need to add a special mount option to scan for - * later supers, using BTRFS_SUPER_MIRROR_MAX instead - */ - for (i = 0; i < 1; i++) { - super = btrfs_read_dev_one_super(bdev, i, false); - if (IS_ERR(super)) - continue; - - if (!latest || btrfs_super_generation(super) > transid) { - if (latest) - btrfs_release_disk_super(super); - - latest = super; - transid = btrfs_super_generation(super); - } - } - - return super; -} - /* * Write superblock @sb to the @device. Do not wait for completion, all the * folios we use for writing are locked. @@ -3830,8 +3752,8 @@ static int write_dev_supers(struct btrfs_device *device, continue; } else if (ret < 0) { btrfs_err(device->fs_info, - "couldn't get super block location for mirror %d", - i); + "couldn't get super block location for mirror %d error %d", + i, ret); atomic_inc(&device->sb_write_errors); continue; } @@ -3850,12 +3772,11 @@ static int write_dev_supers(struct btrfs_device *device, GFP_NOFS); if (IS_ERR(folio)) { btrfs_err(device->fs_info, - "couldn't get super block page for bytenr %llu", - bytenr); + "couldn't get super block page for bytenr %llu error %ld", + bytenr, PTR_ERR(folio)); atomic_inc(&device->sb_write_errors); continue; } - ASSERT(folio_order(folio) == 0); offset = offset_in_folio(folio, bytenr); disk_super = folio_address(folio) + offset; @@ -3928,7 +3849,6 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) /* If the folio has been removed, then we know it completed. */ if (IS_ERR(folio)) continue; - ASSERT(folio_order(folio) == 0); /* Folio will be unlocked once the write completes. */ folio_wait_locked(folio); @@ -4248,8 +4168,9 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) u64 found_end; found = true; - while (find_first_extent_bit(&trans->dirty_pages, cur, - &found_start, &found_end, EXTENT_DIRTY, &cached)) { + while (btrfs_find_first_extent_bit(&trans->dirty_pages, cur, + &found_start, &found_end, + EXTENT_DIRTY, &cached)) { dirty_bytes += found_end + 1 - found_start; cur = found_end + 1; } @@ -4326,6 +4247,14 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_cleanup_defrag_inodes(fs_info); /* + * Handle the error fs first, as it will flush and wait for all ordered + * extents. This will generate delayed iputs, thus we want to handle + * it first. + */ + if (unlikely(BTRFS_FS_ERROR(fs_info))) + btrfs_error_commit_super(fs_info); + + /* * Wait for any fixup workers to complete. 
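btrfs_read_dev_super() and btrfs_read_dev_one_super() are folded into btrfs_read_disk_super(); based on the call site above, which reads copy 0 without dropping the page cache, scanning all mirrors would look roughly like this sketch (mount itself only reads the primary copy):

struct btrfs_super_block *super;

for (int i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
        super = btrfs_read_disk_super(bdev, i, false);
        if (IS_ERR(super))
                continue;
        /* ... compare btrfs_super_generation(super) to keep the newest copy ... */
        btrfs_release_disk_super(super);
}
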
* If we don't wait for them here and they are still running by the time * we call kthread_stop() against the cleaner kthread further below, we @@ -4346,6 +4275,31 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_flush_workqueue(fs_info->delalloc_workers); /* + * We can have ordered extents getting their last reference dropped from + * the fs_info->workers queue because for async writes for data bios we + * queue a work for that queue, at btrfs_wq_submit_bio(), that runs + * run_one_async_done() which calls btrfs_bio_end_io() in case the bio + * has an error, and that later function can do the final + * btrfs_put_ordered_extent() on the ordered extent attached to the bio, + * which adds a delayed iput for the inode. So we must flush the queue + * so that we don't have delayed iputs after committing the current + * transaction below and stopping the cleaner and transaction kthreads. + */ + btrfs_flush_workqueue(fs_info->workers); + + /* + * When finishing a compressed write bio we schedule a work queue item + * to finish an ordered extent - btrfs_finish_compressed_write_work() + * calls btrfs_finish_ordered_extent() which in turns does a call to + * btrfs_queue_ordered_fn(), and that queues the ordered extent + * completion either in the endio_write_workers work queue or in the + * fs_info->endio_freespace_worker work queue. We flush those queues + * below, so before we flush them we must flush this queue for the + * workers of compressed writes. + */ + flush_workqueue(fs_info->compressed_write_workers); + + /* * After we parked the cleaner kthread, ordered extents may have * completed and created new delayed iputs. If one of the async reclaim * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we @@ -4369,6 +4323,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) /* Ordered extents for free space inodes. */ btrfs_flush_workqueue(fs_info->endio_freespace_worker); btrfs_run_delayed_iputs(fs_info); + /* There should be no more workload to generate new delayed iputs. 
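The comments above describe a specific ordering of flushes before the final btrfs_run_delayed_iputs() call; condensed into one sequence (illustrative, not a literal excerpt of close_ctree()):

btrfs_flush_workqueue(fs_info->delalloc_workers);
btrfs_flush_workqueue(fs_info->workers);              /* async data bio completion */
flush_workqueue(fs_info->compressed_write_workers);   /* compressed write completion */
btrfs_flush_workqueue(fs_info->endio_write_workers);
btrfs_flush_workqueue(fs_info->endio_freespace_worker);
btrfs_run_delayed_iputs(fs_info);
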
*/ + set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state); cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); @@ -4403,9 +4359,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "commit super ret %d", ret); } - if (BTRFS_FS_ERROR(fs_info)) - btrfs_error_commit_super(fs_info); - kthread_stop(fs_info->transaction_kthread); kthread_stop(fs_info->cleaner_kthread); @@ -4413,7 +4366,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags); if (btrfs_check_quota_leak(fs_info)) { - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN("qgroup reserved space leaked"); btrfs_err(fs_info, "qgroup reserved space leaked"); } @@ -4528,10 +4481,6 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) /* cleanup FS via transaction */ btrfs_cleanup_transaction(fs_info); - mutex_lock(&fs_info->cleaner_mutex); - btrfs_run_delayed_iputs(fs_info); - mutex_unlock(&fs_info->cleaner_mutex); - down_write(&fs_info->cleanup_work_sem); up_write(&fs_info->cleanup_work_sem); } @@ -4674,9 +4623,9 @@ static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, u64 start = 0; u64 end; - while (find_first_extent_bit(dirty_pages, start, &start, &end, - mark, NULL)) { - clear_extent_bits(dirty_pages, start, end, mark); + while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end, + mark, NULL)) { + btrfs_clear_extent_bits(dirty_pages, start, end, mark); while (start <= end) { eb = find_extent_buffer(fs_info, start); start += fs_info->nodesize; @@ -4709,14 +4658,14 @@ static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, * the same extent range. */ mutex_lock(&fs_info->unused_bg_unpin_mutex); - if (!find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, &cached_state)) { + if (!btrfs_find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, &cached_state)) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } - clear_extent_dirty(unpin, start, end, &cached_state); - free_extent_state(cached_state); + btrfs_clear_extent_dirty(unpin, start, end, &cached_state); + btrfs_free_extent_state(cached_state); btrfs_error_unpin_extent_range(fs_info, start, end); mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); @@ -4902,7 +4851,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) int btrfs_init_root_free_objectid(struct btrfs_root *root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; struct extent_buffer *l; struct btrfs_key search_key; @@ -4918,14 +4867,13 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) search_key.offset = (u64)-1; ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) - goto error; + return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. 
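Several functions in this patch, btrfs_init_root_free_objectid() included, switch to BTRFS_PATH_AUTO_FREE(); assuming it declares a NULL-initialized path with a cleanup handler that calls btrfs_free_path() on scope exit, error paths no longer need an explicit free. A minimal sketch:

static int lookup_example(struct btrfs_root *root, const struct btrfs_key *key)
{
        BTRFS_PATH_AUTO_FREE(path);

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* Any return below releases the path automatically. */
        return btrfs_search_slot(NULL, root, key, path, 0, 0);
}
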
*/ - ret = -EUCLEAN; - goto error; + return -EUCLEAN; } if (path->slots[0] > 0) { slot = path->slots[0] - 1; @@ -4936,10 +4884,8 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) } else { root->free_objectid = BTRFS_FIRST_FREE_OBJECTID; } - ret = 0; -error: - btrfs_free_path(path); - return ret; + + return 0; } int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 587842991b24..864a55a96226 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -58,9 +58,6 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, const struct btrfs_super_block *sb, int mirror_num); int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount); int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors); -struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev); -struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, - int copy_num, bool drop_cache); int btrfs_commit_super(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, const struct btrfs_key *key); @@ -114,7 +111,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int btrfs_read_extent_buffer(struct extent_buffer *buf, const struct btrfs_tree_parent_check *check); -blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio); +int btree_csum_one_bio(struct btrfs_bio *bbio); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index e2b22bea348a..7fc8a3200b40 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -75,7 +75,7 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, { struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root; - struct inode *inode; + struct btrfs_inode *inode; if (objectid < BTRFS_FIRST_FREE_OBJECTID) return ERR_PTR(-ESTALE); @@ -89,12 +89,12 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, if (IS_ERR(inode)) return ERR_CAST(inode); - if (generation != 0 && generation != inode->i_generation) { - iput(inode); + if (generation != 0 && generation != inode->vfs_inode.i_generation) { + iput(&inode->vfs_inode); return ERR_PTR(-ESTALE); } - return d_obtain_alias(inode); + return d_obtain_alias(&inode->vfs_inode); } static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, @@ -145,9 +145,10 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, struct dentry *btrfs_get_parent(struct dentry *child) { - struct inode *dir = d_inode(child); - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode *dir = BTRFS_I(d_inode(child)); + struct btrfs_inode *inode; + struct btrfs_root *root = dir->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_root_ref *ref; @@ -159,13 +160,13 @@ struct dentry *btrfs_get_parent(struct dentry *child) if (!path) return ERR_PTR(-ENOMEM); - if (btrfs_ino(BTRFS_I(dir)) == BTRFS_FIRST_FREE_OBJECTID) { + if (btrfs_ino(dir) == BTRFS_FIRST_FREE_OBJECTID) { key.objectid = btrfs_root_id(root); key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = (u64)-1; root = fs_info->tree_root; } else { - key.objectid = btrfs_ino(BTRFS_I(dir)); + key.objectid = btrfs_ino(dir); key.type = BTRFS_INODE_REF_KEY; key.offset = 
(u64)-1; } @@ -210,7 +211,11 @@ struct dentry *btrfs_get_parent(struct dentry *child) found_key.offset, 0); } - return d_obtain_alias(btrfs_iget(key.objectid, root)); + inode = btrfs_iget(key.objectid, root); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return d_obtain_alias(&inode->vfs_inode); fail: btrfs_free_path(path); return ERR_PTR(ret); @@ -219,11 +224,11 @@ fail: static int btrfs_get_name(struct dentry *parent, char *name, struct dentry *child) { - struct inode *inode = d_inode(child); - struct inode *dir = d_inode(parent); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_path *path; - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode *inode = BTRFS_I(d_inode(child)); + struct btrfs_inode *dir = BTRFS_I(d_inode(parent)); + struct btrfs_root *root = dir->root; + struct btrfs_fs_info *fs_info = root->fs_info; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_inode_ref *iref; struct btrfs_root_ref *rref; struct extent_buffer *leaf; @@ -233,37 +238,34 @@ static int btrfs_get_name(struct dentry *parent, char *name, int ret; u64 ino; - if (!S_ISDIR(dir->i_mode)) + if (!S_ISDIR(dir->vfs_inode.i_mode)) return -EINVAL; - ino = btrfs_ino(BTRFS_I(inode)); + ino = btrfs_ino(inode); path = btrfs_alloc_path(); if (!path) return -ENOMEM; if (ino == BTRFS_FIRST_FREE_OBJECTID) { - key.objectid = btrfs_root_id(BTRFS_I(inode)->root); + key.objectid = btrfs_root_id(inode->root); key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = (u64)-1; root = fs_info->tree_root; } else { key.objectid = ino; - key.offset = btrfs_ino(BTRFS_I(dir)); key.type = BTRFS_INODE_REF_KEY; + key.offset = btrfs_ino(dir); } ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { - btrfs_free_path(path); return ret; } else if (ret > 0) { - if (ino == BTRFS_FIRST_FREE_OBJECTID) { + if (ino == BTRFS_FIRST_FREE_OBJECTID) path->slots[0]--; - } else { - btrfs_free_path(path); + else return -ENOENT; - } } leaf = path->nodes[0]; @@ -280,7 +282,6 @@ static int btrfs_get_name(struct dentry *parent, char *name, } read_extent_buffer(leaf, name, name_ptr, name_len); - btrfs_free_path(path); /* * have to add the null termination to make sure that reconnect_path diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 6d08c100b01d..b1b96eb5f64e 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -42,7 +42,7 @@ static inline void btrfs_extent_state_leak_debug_check(void) struct extent_state *state; while (!list_empty(&states)) { - state = list_entry(states.next, struct extent_state, leak_list); + state = list_first_entry(&states, struct extent_state, leak_list); pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", state->start, state->end, state->state, extent_state_in_tree(state), @@ -59,13 +59,12 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, struct extent_io_tree *tree, u64 start, u64 end) { - const struct btrfs_inode *inode; + const struct btrfs_inode *inode = tree->inode; u64 isize; if (tree->owner != IO_TREE_INODE_IO) return; - inode = extent_io_tree_to_inode_const(tree); isize = i_size_read(&inode->vfs_inode); if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { btrfs_debug_rl(inode->root->fs_info, @@ -80,25 +79,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) #endif - -/* - * The only tree allowed to set the inode is IO_TREE_INODE_IO. 
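Much of the export.c churn comes from passing struct btrfs_inode instead of struct inode; because the VFS inode is embedded, the two views convert both ways without extra lookups, which is all the BTRFS_I() and &inode->vfs_inode changes above do. A minimal sketch:

static void inode_views_example(struct inode *vfs)
{
        struct btrfs_inode *bi = BTRFS_I(vfs);  /* VFS inode -> btrfs inode */

        /* btrfs inode -> VFS inode: same object, just the embedded member. */
        WARN_ON(&bi->vfs_inode != vfs);
}
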
- */ -static bool is_inode_io_tree(const struct extent_io_tree *tree) -{ - return tree->owner == IO_TREE_INODE_IO; -} - -/* Return the inode if it's valid for the given tree, otherwise NULL. */ -struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree) -{ - if (tree->owner == IO_TREE_INODE_IO) - return tree->inode; - return NULL; -} - /* Read-only access to the inode. */ -const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree) +const struct btrfs_inode *btrfs_extent_io_tree_to_inode(const struct extent_io_tree *tree) { if (tree->owner == IO_TREE_INODE_IO) return tree->inode; @@ -106,15 +88,15 @@ const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_t } /* For read-only access to fs_info. */ -const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree) +const struct btrfs_fs_info *btrfs_extent_io_tree_to_fs_info(const struct extent_io_tree *tree) { if (tree->owner == IO_TREE_INODE_IO) return tree->inode->root->fs_info; return tree->fs_info; } -void extent_io_tree_init(struct btrfs_fs_info *fs_info, - struct extent_io_tree *tree, unsigned int owner) +void btrfs_extent_io_tree_init(struct btrfs_fs_info *fs_info, + struct extent_io_tree *tree, unsigned int owner) { tree->state = RB_ROOT; spin_lock_init(&tree->lock); @@ -129,7 +111,7 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, * aren't any waiters on any extent state record (EXTENT_LOCK_BITS are never * set on any extent state when calling this function). */ -void extent_io_tree_release(struct extent_io_tree *tree) +void btrfs_extent_io_tree_release(struct extent_io_tree *tree) { struct rb_root root; struct extent_state *state; @@ -148,7 +130,7 @@ void extent_io_tree_release(struct extent_io_tree *tree) * (see wait_extent_bit()). */ ASSERT(!waitqueue_active(&state->wq)); - free_extent_state(state); + btrfs_free_extent_state(state); cond_resched_lock(&tree->lock); } /* @@ -176,7 +158,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) btrfs_leak_debug_add_state(state); refcount_set(&state->refs, 1); init_waitqueue_head(&state->wq); - trace_alloc_extent_state(state, mask, _RET_IP_); + trace_btrfs_alloc_extent_state(state, mask, _RET_IP_); return state; } @@ -188,14 +170,14 @@ static struct extent_state *alloc_extent_state_atomic(struct extent_state *preal return prealloc; } -void free_extent_state(struct extent_state *state) +void btrfs_free_extent_state(struct extent_state *state) { if (!state) return; if (refcount_dec_and_test(&state->refs)) { WARN_ON(extent_state_in_tree(state)); btrfs_leak_debug_del_state(state); - trace_free_extent_state(state, _RET_IP_); + trace_btrfs_free_extent_state(state, _RET_IP_); kmem_cache_free(extent_state_cache, state); } } @@ -222,38 +204,34 @@ static inline struct extent_state *next_state(struct extent_state *state) { struct rb_node *next = rb_next(&state->rb_node); - if (next) - return rb_entry(next, struct extent_state, rb_node); - else - return NULL; + return rb_entry_safe(next, struct extent_state, rb_node); } static inline struct extent_state *prev_state(struct extent_state *state) { struct rb_node *next = rb_prev(&state->rb_node); - if (next) - return rb_entry(next, struct extent_state, rb_node); - else - return NULL; + return rb_entry_safe(next, struct extent_state, rb_node); } /* - * Search @tree for an entry that contains @offset. Such entry would have - * entry->start <= offset && entry->end >= offset. 
+ * Search @tree for an entry that contains @offset or if none exists for the + * first entry that starts and ends after that offset. * * @tree: the tree to search - * @offset: offset that should fall within an entry in @tree + * @offset: search offset * @node_ret: pointer where new node should be anchored (used when inserting an * entry in the tree) * @parent_ret: points to entry which would have been the parent of the entry, * containing @offset * - * Return a pointer to the entry that contains @offset byte address and don't change - * @node_ret and @parent_ret. + * Return a pointer to the entry that contains @offset byte address. + * + * If no such entry exists, return the first entry that starts and ends after + * @offset if one exists, otherwise NULL. * - * If no such entry exists, return pointer to entry that ends before @offset - * and fill parameters @node_ret and @parent_ret, ie. does not return NULL. + * If the returned entry starts at @offset, then @node_ret and @parent_ret + * aren't changed. */ static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree, u64 offset, @@ -282,7 +260,11 @@ static inline struct extent_state *tree_search_for_insert(struct extent_io_tree if (parent_ret) *parent_ret = prev; - /* Search neighbors until we find the first one past the end */ + /* + * Return either the current entry if it contains offset (it ends after + * or at offset) or the first entry that starts and ends after offset if + * one exists, or NULL. + */ while (entry && offset > entry->end) entry = next_state(entry); @@ -346,12 +328,12 @@ static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 return tree_search_for_insert(tree, offset, NULL, NULL); } -static void extent_io_tree_panic(const struct extent_io_tree *tree, - const struct extent_state *state, - const char *opname, - int err) +static void __cold extent_io_tree_panic(const struct extent_io_tree *tree, + const struct extent_state *state, + const char *opname, + int err) { - btrfs_panic(extent_io_tree_to_fs_info(tree), err, + btrfs_panic(btrfs_extent_io_tree_to_fs_info(tree), err, "extent io tree error on %s state start %llu end %llu", opname, state->start, state->end); } @@ -362,13 +344,12 @@ static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *s prev = prev_state(state); if (prev && prev->end == state->start - 1 && prev->state == state->state) { - if (is_inode_io_tree(tree)) - btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree), - state, prev); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_merge_delalloc_extent(tree->inode, state, prev); state->start = prev->start; rb_erase(&prev->rb_node, &tree->state); RB_CLEAR_NODE(&prev->rb_node); - free_extent_state(prev); + btrfs_free_extent_state(prev); } } @@ -378,13 +359,12 @@ static void merge_next_state(struct extent_io_tree *tree, struct extent_state *s next = next_state(state); if (next && next->start == state->end + 1 && next->state == state->state) { - if (is_inode_io_tree(tree)) - btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree), - state, next); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_merge_delalloc_extent(tree->inode, state, next); state->end = next->end; rb_erase(&next->rb_node, &tree->state); RB_CLEAR_NODE(&next->rb_node); - free_extent_state(next); + btrfs_free_extent_state(next); } } @@ -413,8 +393,8 @@ static void set_state_bits(struct extent_io_tree *tree, u32 bits_to_set = bits & ~EXTENT_CTLBITS; int ret; - if (is_inode_io_tree(tree)) - 
btrfs_set_delalloc_extent(extent_io_tree_to_inode(tree), state, bits); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_set_delalloc_extent(tree->inode, state, bits); ret = add_extent_changeset(state, bits_to_set, changeset, 1); BUG_ON(ret < 0); @@ -459,10 +439,9 @@ static struct extent_state *insert_state(struct extent_io_tree *tree, if (state->end < entry->start) { if (try_merge && end == entry->start && state->state == entry->state) { - if (is_inode_io_tree(tree)) - btrfs_merge_delalloc_extent( - extent_io_tree_to_inode(tree), - state, entry); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_merge_delalloc_extent(tree->inode, + state, entry); entry->start = state->start; merge_prev_state(tree, entry); state->state = 0; @@ -472,10 +451,9 @@ static struct extent_state *insert_state(struct extent_io_tree *tree, } else if (state->end > entry->end) { if (try_merge && entry->end == start && state->state == entry->state) { - if (is_inode_io_tree(tree)) - btrfs_merge_delalloc_extent( - extent_io_tree_to_inode(tree), - state, entry); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_merge_delalloc_extent(tree->inode, + state, entry); entry->end = state->end; merge_next_state(tree, entry); state->state = 0; @@ -527,9 +505,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct rb_node *parent = NULL; struct rb_node **node; - if (is_inode_io_tree(tree)) - btrfs_split_delalloc_extent(extent_io_tree_to_inode(tree), orig, - split); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_split_delalloc_extent(tree->inode, orig, split); prealloc->start = orig->start; prealloc->end = split - 1; @@ -549,7 +526,7 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, } else if (prealloc->end > entry->end) { node = &(*node)->rb_right; } else { - free_extent_state(prealloc); + btrfs_free_extent_state(prealloc); return -EEXIST; } } @@ -561,6 +538,18 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, } /* + * Use this during tree iteration to avoid doing next node searches when it's + * not needed (the current record ends at or after the target range's end). + */ +static inline struct extent_state *next_search_state(struct extent_state *state, u64 end) +{ + if (state->end < end) + return next_state(state); + + return NULL; +} + +/* * Utility function to clear some bits in an extent state struct. It will * optionally wake up anyone waiting on this state (wake == 1). 
* @@ -569,16 +558,15 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, */ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, - u32 bits, int wake, + u32 bits, int wake, u64 end, struct extent_changeset *changeset) { struct extent_state *next; u32 bits_to_clear = bits & ~EXTENT_CTLBITS; int ret; - if (is_inode_io_tree(tree)) - btrfs_clear_delalloc_extent(extent_io_tree_to_inode(tree), state, - bits); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_clear_delalloc_extent(tree->inode, state, bits); ret = add_extent_changeset(state, bits_to_clear, changeset, 0); BUG_ON(ret < 0); @@ -586,17 +574,17 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, if (wake) wake_up(&state->wq); if (state->state == 0) { - next = next_state(state); + next = next_search_state(state, end); if (extent_state_in_tree(state)) { rb_erase(&state->rb_node, &tree->state); RB_CLEAR_NODE(&state->rb_node); - free_extent_state(state); + btrfs_free_extent_state(state); } else { WARN_ON(1); } } else { merge_state(tree, state); - next = next_state(state); + next = next_search_state(state, end); } return next; } @@ -620,18 +608,18 @@ static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask) * * This takes the tree lock, and returns 0 on success and < 0 on error. */ -int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached_state, - struct extent_changeset *changeset) +int btrfs_clear_extent_bit_changeset(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state, + struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *cached; struct extent_state *prealloc = NULL; u64 last_end; - int err; - int clear = 0; - int wake; - int delete = (bits & EXTENT_CLEAR_ALL_BITS); + int ret = 0; + bool clear; + bool wake; + const bool delete = (bits & EXTENT_CLEAR_ALL_BITS); gfp_t mask; set_gfp_mask_from_bits(&bits, &mask); @@ -644,9 +632,8 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (bits & EXTENT_DELALLOC) bits |= EXTENT_NORESERVE; - wake = ((bits & EXTENT_LOCK_BITS) ? 1 : 0); - if (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)) - clear = 1; + wake = (bits & EXTENT_LOCK_BITS); + clear = (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)); again: if (!prealloc) { /* @@ -676,7 +663,7 @@ again: goto hit_next; } if (clear) - free_extent_state(cached); + btrfs_free_extent_state(cached); } /* This search will find the extents that end after our range starts. */ @@ -691,7 +678,7 @@ hit_next: /* The state doesn't have the wanted bits, go ahead. */ if (!(state->state & bits)) { - state = next_state(state); + state = next_search_state(state, end); goto next; } @@ -714,18 +701,24 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) goto search_again; - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, state, "split", err); - + ret = split_state(tree, state, prealloc, start); prealloc = NULL; - if (err) + if (ret) { + extent_io_tree_panic(tree, state, "split", ret); goto out; + } if (state->end <= end) { - state = clear_state_bit(tree, state, bits, wake, changeset); + state = clear_state_bit(tree, state, bits, wake, end, + changeset); goto next; } - goto search_again; + if (need_resched()) + goto search_again; + /* + * Fallthrough and try atomic extent state allocation if needed. 
+ * If it fails we'll jump to 'search_again' retry the allocation + * in non-atomic mode and start the search again. + */ } /* * | ---- desired range ---- | @@ -736,30 +729,31 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) goto search_again; - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, state, "split", err); + ret = split_state(tree, state, prealloc, end + 1); + if (ret) { + extent_io_tree_panic(tree, state, "split", ret); + prealloc = NULL; + goto out; + } if (wake) wake_up(&state->wq); - clear_state_bit(tree, prealloc, bits, wake, changeset); + clear_state_bit(tree, prealloc, bits, wake, end, changeset); prealloc = NULL; goto out; } - state = clear_state_bit(tree, state, bits, wake, changeset); + state = clear_state_bit(tree, state, bits, wake, end, changeset); next: - if (last_end == (u64)-1) + if (last_end >= end) goto out; start = last_end + 1; - if (start <= end && state && !need_resched()) + if (state && !need_resched()) goto hit_next; search_again: - if (start > end) - goto out; spin_unlock(&tree->lock); if (gfpflags_allow_blocking(mask)) cond_resched(); @@ -767,10 +761,9 @@ search_again: out: spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); + btrfs_free_extent_state(prealloc); - return 0; + return ret; } @@ -820,7 +813,7 @@ process_node: schedule(); spin_lock(&tree->lock); finish_wait(&state->wq, &wait); - free_extent_state(state); + btrfs_free_extent_state(state); goto again; } start = state->end + 1; @@ -838,7 +831,7 @@ out: if (cached_state && *cached_state) { state = *cached_state; *cached_state = NULL; - free_extent_state(state); + btrfs_free_extent_state(state); } spin_unlock(&tree->lock); } @@ -877,7 +870,7 @@ static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *t */ state = tree_search(tree, start); while (state) { - if (state->end >= start && (state->state & bits)) + if (state->state & bits) return state; state = next_state(state); } @@ -892,9 +885,9 @@ static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *t * Return true if we find something, and update @start_ret and @end_ret. * Return false if we found nothing. */ -bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits, - struct extent_state **cached_state) +bool btrfs_find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits, + struct extent_state **cached_state) { struct extent_state *state; bool ret = false; @@ -914,13 +907,13 @@ bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, * again. If we haven't found any, clear as well since * it's now useless. */ - free_extent_state(*cached_state); + btrfs_free_extent_state(*cached_state); *cached_state = NULL; if (state) goto got_it; goto out; } - free_extent_state(*cached_state); + btrfs_free_extent_state(*cached_state); *cached_state = NULL; } @@ -952,14 +945,17 @@ out: * contiguous area for given bits. We will search to the first bit we find, and * then walk down the tree until we find a non-contiguous area. The area * returned will be the full contiguous area with the bits set. + * + * Returns true if we found a range with the given bits set, in which case + * @start_ret and @end_ret are updated, or false if no range was found. 
*/ -int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits) +bool btrfs_find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits) { struct extent_state *state; - int ret = 1; + bool ret = false; - ASSERT(!btrfs_fs_incompat(extent_io_tree_to_fs_info(tree), NO_HOLES)); + ASSERT(!btrfs_fs_incompat(btrfs_extent_io_tree_to_fs_info(tree), NO_HOLES)); spin_lock(&tree->lock); state = find_first_extent_bit_state(tree, start, bits); @@ -971,7 +967,7 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, break; *end_ret = state->end; } - ret = 0; + ret = true; } spin_unlock(&tree->lock); return ret; @@ -1046,11 +1042,11 @@ out: * * [start, end] is inclusive This takes the tree lock. */ -static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, u64 *failed_start, - struct extent_state **failed_state, - struct extent_state **cached_state, - struct extent_changeset *changeset) +static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, u64 *failed_start, + struct extent_state **failed_state, + struct extent_state **cached_state, + struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -1129,12 +1125,11 @@ hit_next: set_state_bits(tree, state, bits, changeset); cache_state(state, cached_state); merge_state(tree, state); - if (last_end == (u64)-1) + if (last_end >= end) goto out; start = last_end + 1; state = next_state(state); - if (start < end && state && state->start == start && - !need_resched()) + if (state && state->start == start && !need_resched()) goto hit_next; goto search_again; } @@ -1186,12 +1181,11 @@ hit_next: set_state_bits(tree, state, bits, changeset); cache_state(state, cached_state); merge_state(tree, state); - if (last_end == (u64)-1) + if (last_end >= end) goto out; start = last_end + 1; state = next_state(state); - if (start < end && state && state->start == start && - !need_resched()) + if (state && state->start == start && !need_resched()) goto hit_next; } goto search_again; @@ -1204,14 +1198,8 @@ hit_next: * extent we found. */ if (state->start > start) { - u64 this_end; struct extent_state *inserted_state; - if (end < last_start) - this_end = end; - else - this_end = last_start - 1; - prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) goto search_again; @@ -1221,17 +1209,38 @@ hit_next: * extent. */ prealloc->start = start; - prealloc->end = this_end; + if (end < last_start) + prealloc->end = end; + else + prealloc->end = last_start - 1; + inserted_state = insert_state(tree, prealloc, bits, changeset); if (IS_ERR(inserted_state)) { ret = PTR_ERR(inserted_state); extent_io_tree_panic(tree, prealloc, "insert", ret); + goto out; } cache_state(inserted_state, cached_state); if (inserted_state == prealloc) prealloc = NULL; - start = this_end + 1; + start = inserted_state->end + 1; + + /* Beyond target range, stop. */ + if (start > end) + goto out; + + if (need_resched()) + goto search_again; + + state = next_search_state(inserted_state, end); + /* + * If there's a next state, whether contiguous or not, we don't + * need to unlock and start search agian. If it's not contiguous + * we will end up here and try to allocate a prealloc state and insert. 
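btrfs_find_contiguous_extent_bit() now reports success as a bool instead of a 0-on-success int, which inverts the polarity at call sites; an assumed caller-side sketch:

u64 range_start = start;
u64 range_end;

if (btrfs_find_contiguous_extent_bit(tree, start, &range_start, &range_end, bits)) {
        /* [range_start, range_end] is the full contiguous area with the bits set. */
} else {
        /* No extent state with the given bits at or after @start. */
}
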
+ */ + if (state) + goto hit_next; goto search_again; } /* @@ -1252,8 +1261,11 @@ hit_next: if (!prealloc) goto search_again; ret = split_state(tree, state, prealloc, end + 1); - if (ret) + if (ret) { extent_io_tree_panic(tree, state, "split", ret); + prealloc = NULL; + goto out; + } set_state_bits(tree, prealloc, bits, changeset); cache_state(prealloc, cached_state); @@ -1272,18 +1284,16 @@ search_again: out: spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); + btrfs_free_extent_state(prealloc); return ret; } -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached_state) +int btrfs_set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state) { - return __set_extent_bit(tree, start, end, bits, NULL, NULL, - cached_state, NULL); + return set_extent_bit(tree, start, end, bits, NULL, NULL, cached_state, NULL); } /* @@ -1304,9 +1314,9 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, * * All allocations are done with GFP_NOFS. */ -int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, u32 clear_bits, - struct extent_state **cached_state) +int btrfs_convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, u32 clear_bits, + struct extent_state **cached_state) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -1374,12 +1384,11 @@ hit_next: if (state->start == start && state->end <= end) { set_state_bits(tree, state, bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, clear_bits, 0, NULL); - if (last_end == (u64)-1) + state = clear_state_bit(tree, state, clear_bits, 0, end, NULL); + if (last_end >= end) goto out; start = last_end + 1; - if (start < end && state && state->start == start && - !need_resched()) + if (state && state->start == start && !need_resched()) goto hit_next; goto search_again; } @@ -1406,20 +1415,19 @@ hit_next: goto out; } ret = split_state(tree, state, prealloc, start); - if (ret) - extent_io_tree_panic(tree, state, "split", ret); prealloc = NULL; - if (ret) + if (ret) { + extent_io_tree_panic(tree, state, "split", ret); goto out; + } if (state->end <= end) { set_state_bits(tree, state, bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, clear_bits, 0, NULL); - if (last_end == (u64)-1) + state = clear_state_bit(tree, state, clear_bits, 0, end, NULL); + if (last_end >= end) goto out; start = last_end + 1; - if (start < end && state && state->start == start && - !need_resched()) + if (state && state->start == start && !need_resched()) goto hit_next; } goto search_again; @@ -1432,14 +1440,8 @@ hit_next: * extent we found. */ if (state->start > start) { - u64 this_end; struct extent_state *inserted_state; - if (end < last_start) - this_end = end; - else - this_end = last_start - 1; - prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) { ret = -ENOMEM; @@ -1451,16 +1453,37 @@ hit_next: * extent. 
*/ prealloc->start = start; - prealloc->end = this_end; + if (end < last_start) + prealloc->end = end; + else + prealloc->end = last_start - 1; + inserted_state = insert_state(tree, prealloc, bits, NULL); if (IS_ERR(inserted_state)) { ret = PTR_ERR(inserted_state); extent_io_tree_panic(tree, prealloc, "insert", ret); + goto out; } cache_state(inserted_state, cached_state); if (inserted_state == prealloc) prealloc = NULL; - start = this_end + 1; + start = inserted_state->end + 1; + + /* Beyond target range, stop. */ + if (start > end) + goto out; + + if (need_resched()) + goto search_again; + + state = next_search_state(inserted_state, end); + /* + * If there's a next state, whether contiguous or not, we don't + * need to unlock and start search again. If it's not contiguous + * we will end up here and try to allocate a prealloc state and insert. + */ + if (state) + goto hit_next; goto search_again; } /* @@ -1477,12 +1500,15 @@ hit_next: } ret = split_state(tree, state, prealloc, end + 1); - if (ret) + if (ret) { extent_io_tree_panic(tree, state, "split", ret); + prealloc = NULL; + goto out; + } set_state_bits(tree, prealloc, bits, NULL); cache_state(prealloc, cached_state); - clear_state_bit(tree, prealloc, clear_bits, 0, NULL); + clear_state_bit(tree, prealloc, clear_bits, 0, end, NULL); prealloc = NULL; goto out; } @@ -1497,8 +1523,7 @@ search_again: out: spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); + btrfs_free_extent_state(prealloc); return ret; } @@ -1518,8 +1543,8 @@ out: * spans (last_range_end, end of device]. In this case it's up to the caller to * trim @end_ret to the appropriate size. */ -void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits) +void btrfs_find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits) { struct extent_state *state; struct extent_state *prev = NULL, *next = NULL; @@ -1636,10 +1661,10 @@ out: * all given bits set. If the returned number of bytes is greater than zero * then @start is updated with the offset of the first byte with the bits set. */ -u64 count_range_bits(struct extent_io_tree *tree, - u64 *start, u64 search_end, u64 max_bytes, - u32 bits, int contig, - struct extent_state **cached_state) +u64 btrfs_count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, u64 max_bytes, + u32 bits, int contig, + struct extent_state **cached_state) { struct extent_state *state = NULL; struct extent_state *cached; @@ -1710,7 +1735,7 @@ search: } if (cached_state) { - free_extent_state(*cached_state); + btrfs_free_extent_state(*cached_state); *cached_state = state; if (state) refcount_inc(&state->refs); @@ -1724,16 +1749,16 @@ search: /* * Check if the single @bit exists in the given range. 
*/ -bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit) +bool btrfs_test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit) { - struct extent_state *state = NULL; + struct extent_state *state; bool bitset = false; ASSERT(is_power_of_2(bit)); spin_lock(&tree->lock); state = tree_search(tree, start); - while (state && start <= end) { + while (state) { if (state->start > end) break; @@ -1742,9 +1767,7 @@ bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 break; } - /* If state->end is (u64)-1, start will overflow to 0 */ - start = state->end + 1; - if (start > end || start == 0) + if (state->end >= end) break; state = next_state(state); } @@ -1752,16 +1775,51 @@ bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 return bitset; } +void btrfs_get_range_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 *bits, + struct extent_state **cached_state) +{ + struct extent_state *state; + + /* + * The cached state is currently mandatory and not used to start the + * search, only to cache the first state record found in the range. + */ + ASSERT(cached_state != NULL); + ASSERT(*cached_state == NULL); + + *bits = 0; + + spin_lock(&tree->lock); + state = tree_search(tree, start); + if (state && state->start < end) { + *cached_state = state; + refcount_inc(&state->refs); + } + while (state) { + if (state->start > end) + break; + + *bits |= state->state; + + if (state->end >= end) + break; + + state = next_state(state); + } + spin_unlock(&tree->lock); +} + /* * Check if the whole range [@start,@end) contains the single @bit set. */ -bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, - struct extent_state *cached) +bool btrfs_test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, + struct extent_state *cached) { - struct extent_state *state = NULL; + struct extent_state *state; bool bitset = true; ASSERT(is_power_of_2(bit)); + ASSERT(start < end); spin_lock(&tree->lock); if (cached && extent_state_in_tree(cached) && cached->start <= start && @@ -1769,30 +1827,22 @@ bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, state = cached; else state = tree_search(tree, start); - while (state && start <= end) { + while (state) { if (state->start > start) { bitset = false; break; } - if (state->start > end) - break; - if ((state->state & bit) == 0) { bitset = false; break; } - if (state->end == (u64)-1) + if (state->end >= end) break; - /* - * Last entry (if state->end is (u64)-1 and overflow happens), - * or next entry starts after the range. - */ + /* Next state must start where this one ends. 
*/ start = state->end + 1; - if (start > end || start == 0) - break; state = next_state(state); } @@ -1804,8 +1854,8 @@ bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, } /* Wrappers around set/clear extent bit */ -int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset) +int btrfs_set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset) { /* * We don't support EXTENT_LOCK_BITS yet, as current changeset will @@ -1814,11 +1864,11 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ ASSERT(!(bits & EXTENT_LOCK_BITS)); - return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset); + return set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset); } -int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset) +int btrfs_clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset) { /* * Don't support EXTENT_LOCK_BITS case, same reason as @@ -1826,20 +1876,21 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ ASSERT(!(bits & EXTENT_LOCK_BITS)); - return __clear_extent_bit(tree, start, end, bits, NULL, changeset); + return btrfs_clear_extent_bit_changeset(tree, start, end, bits, NULL, changeset); } -bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached) +bool btrfs_try_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached) { int err; u64 failed_start; - err = __set_extent_bit(tree, start, end, bits, &failed_start, - NULL, cached, NULL); + err = set_extent_bit(tree, start, end, bits, &failed_start, NULL, + cached, NULL); if (err == -EEXIST) { if (failed_start > start) - clear_extent_bit(tree, start, failed_start - 1, bits, cached); + btrfs_clear_extent_bit(tree, start, failed_start - 1, + bits, cached); return 0; } return 1; @@ -1849,35 +1900,54 @@ bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits * Either insert or lock state struct between start and end use mask to tell * us if waiting is desired. */ -int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached_state) +int btrfs_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached_state) { struct extent_state *failed_state = NULL; int err; u64 failed_start; - err = __set_extent_bit(tree, start, end, bits, &failed_start, - &failed_state, cached_state, NULL); + err = set_extent_bit(tree, start, end, bits, &failed_start, + &failed_state, cached_state, NULL); while (err == -EEXIST) { if (failed_start != start) - clear_extent_bit(tree, start, failed_start - 1, - bits, cached_state); + btrfs_clear_extent_bit(tree, start, failed_start - 1, + bits, cached_state); wait_extent_bit(tree, failed_start, end, bits, &failed_state); - err = __set_extent_bit(tree, start, end, bits, - &failed_start, &failed_state, - cached_state, NULL); + err = set_extent_bit(tree, start, end, bits, &failed_start, + &failed_state, cached_state, NULL); } return err; } -void __cold extent_state_free_cachep(void) +/* + * Get the extent state that follows the given extent state. 
+ * This is meant to be used in a context where we know no other tasks can + * concurrently modify the tree. + */ +struct extent_state *btrfs_next_extent_state(struct extent_io_tree *tree, + struct extent_state *state) +{ + struct extent_state *next; + + spin_lock(&tree->lock); + ASSERT(extent_state_in_tree(state)); + next = next_state(state); + if (next) + refcount_inc(&next->refs); + spin_unlock(&tree->lock); + + return next; +} + +void __cold btrfs_extent_state_free_cachep(void) { btrfs_extent_state_leak_debug_check(); kmem_cache_destroy(extent_state_cache); } -int __init extent_state_init_cachep(void) +int __init btrfs_extent_state_init_cachep(void) { extent_state_cache = kmem_cache_create("btrfs_extent_state", sizeof(struct extent_state), 0, 0, diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 6ffef1cd37c1..0a18ca9c59c3 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -17,7 +17,6 @@ struct btrfs_inode; /* Bits for the extent state */ enum { ENUM_BIT(EXTENT_DIRTY), - ENUM_BIT(EXTENT_UPTODATE), ENUM_BIT(EXTENT_LOCKED), ENUM_BIT(EXTENT_DIO_LOCKED), ENUM_BIT(EXTENT_NEW), @@ -39,6 +38,11 @@ enum { */ ENUM_BIT(EXTENT_DELALLOC_NEW), /* + * Mark that a range is being locked for finishing an ordered extent. + * Used together with EXTENT_LOCKED. + */ + ENUM_BIT(EXTENT_FINISHING_ORDERED), + /* * When an ordered extent successfully completes for a region marked as * a new delalloc range, use this flag when clearing a new delalloc * range to indicate that the VFS' inode number of bytes should be @@ -130,117 +134,116 @@ struct extent_state { #endif }; -struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree); -const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree); -const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree); +const struct btrfs_inode *btrfs_extent_io_tree_to_inode(const struct extent_io_tree *tree); +const struct btrfs_fs_info *btrfs_extent_io_tree_to_fs_info(const struct extent_io_tree *tree); -void extent_io_tree_init(struct btrfs_fs_info *fs_info, - struct extent_io_tree *tree, unsigned int owner); -void extent_io_tree_release(struct extent_io_tree *tree); -int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached); -bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached); +void btrfs_extent_io_tree_init(struct btrfs_fs_info *fs_info, + struct extent_io_tree *tree, unsigned int owner); +void btrfs_extent_io_tree_release(struct extent_io_tree *tree); +int btrfs_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached); +bool btrfs_try_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached); -static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached) +static inline int btrfs_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) { - return __lock_extent(tree, start, end, EXTENT_LOCKED, cached); + return btrfs_lock_extent_bits(tree, start, end, EXTENT_LOCKED, cached); } -static inline bool try_lock_extent(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) +static inline bool btrfs_try_lock_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) { - return __try_lock_extent(tree, start, end, 
EXTENT_LOCKED, cached); + return btrfs_try_lock_extent_bits(tree, start, end, EXTENT_LOCKED, cached); } -int __init extent_state_init_cachep(void); -void __cold extent_state_free_cachep(void); - -u64 count_range_bits(struct extent_io_tree *tree, - u64 *start, u64 search_end, - u64 max_bytes, u32 bits, int contig, - struct extent_state **cached_state); - -void free_extent_state(struct extent_state *state); -bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, - struct extent_state *cached_state); -bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit); -int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset); -int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached, - struct extent_changeset *changeset); - -static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start, - u64 end, u32 bits, - struct extent_state **cached) -{ - return __clear_extent_bit(tree, start, end, bits, cached, NULL); -} +int __init btrfs_extent_state_init_cachep(void); +void __cold btrfs_extent_state_free_cachep(void); + +u64 btrfs_count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, + u64 max_bytes, u32 bits, int contig, + struct extent_state **cached_state); -static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached) +void btrfs_free_extent_state(struct extent_state *state); +bool btrfs_test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, + struct extent_state *cached_state); +bool btrfs_test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit); +void btrfs_get_range_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 *bits, + struct extent_state **cached_state); +int btrfs_clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset); +int btrfs_clear_extent_bit_changeset(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached, + struct extent_changeset *changeset); + +static inline int btrfs_clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 end, u32 bits, + struct extent_state **cached) { - return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, NULL); + return btrfs_clear_extent_bit_changeset(tree, start, end, bits, cached, NULL); } -static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, u32 bits) +static inline int btrfs_unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) { - return clear_extent_bit(tree, start, end, bits, NULL); + return btrfs_clear_extent_bit_changeset(tree, start, end, EXTENT_LOCKED, + cached, NULL); } -int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset); -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached_state); - -static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state) +static inline int btrfs_clear_extent_bits(struct extent_io_tree *tree, u64 start, + u64 end, u32 bits) { - return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, - cached_state, NULL); + return btrfs_clear_extent_bit(tree, start, end, bits, NULL); } -static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 
start, - u64 end, struct extent_state **cached) +int btrfs_set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset); +int btrfs_set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state); + +static inline int btrfs_clear_extent_dirty(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) { - return clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, cached); + return btrfs_clear_extent_bit(tree, start, end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, cached); } -int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, u32 clear_bits, - struct extent_state **cached_state); - -bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits, - struct extent_state **cached_state); -void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits); -int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits); +int btrfs_convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, u32 clear_bits, + struct extent_state **cached_state); + +bool btrfs_find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits, + struct extent_state **cached_state); +void btrfs_find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits); +bool btrfs_find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits); bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); -static inline int lock_dio_extent(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) +static inline int btrfs_lock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) { - return __lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached); + return btrfs_lock_extent_bits(tree, start, end, EXTENT_DIO_LOCKED, cached); } -static inline bool try_lock_dio_extent(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) +static inline bool btrfs_try_lock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) { - return __try_lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached); + return btrfs_try_lock_extent_bits(tree, start, end, EXTENT_DIO_LOCKED, cached); } -static inline int unlock_dio_extent(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) +static inline int btrfs_unlock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) { - return __clear_extent_bit(tree, start, end, EXTENT_DIO_LOCKED, cached, NULL); + return btrfs_clear_extent_bit_changeset(tree, start, end, EXTENT_DIO_LOCKED, + cached, NULL); } +struct extent_state *btrfs_next_extent_state(struct extent_io_tree *tree, + struct extent_state *state); + #endif /* BTRFS_EXTENT_IO_TREE_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3014a1a23efd..cb6128778a83 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -70,20 +70,17 @@ static int block_group_bits(struct btrfs_block_group *cache, u64 bits) int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, 
u64 len) { struct btrfs_root *root = btrfs_extent_root(fs_info, start); - int ret; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = start; - key.offset = len; key.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - btrfs_free_path(path); - return ret; + key.offset = len; + return btrfs_search_slot(NULL, root, &key, path, 0, 0); } /* @@ -103,7 +100,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; u64 num_refs; u64 extent_flags; @@ -125,16 +122,16 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, search_again: key.objectid = bytenr; - key.offset = offset; if (metadata) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = offset; extent_root = btrfs_extent_root(fs_info, bytenr); ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) - goto out_free; + return ret; if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) { if (path->slots[0]) { @@ -159,7 +156,7 @@ search_again: "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); - goto out_free; + return ret; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -170,7 +167,7 @@ search_again: "unexpected zero reference count for extent item (%llu %u %llu)", key.objectid, key.type, key.offset); btrfs_abort_transaction(trans, ret); - goto out_free; + return ret; } extent_flags = btrfs_extent_flags(leaf, ei); owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]); @@ -216,8 +213,7 @@ search_again: *flags = extent_flags; if (owning_root) *owning_root = owner; -out_free: - btrfs_free_path(path); + return ret; } @@ -413,15 +409,15 @@ static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, btrfs_extent_data_ref_offset(leaf, ref)); } -static int match_extent_data_ref(struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref, - u64 root_objectid, u64 owner, u64 offset) +static bool match_extent_data_ref(struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref, + u64 root_objectid, u64 owner, u64 offset) { if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || btrfs_extent_data_ref_objectid(leaf, ref) != owner || btrfs_extent_data_ref_offset(leaf, ref) != offset) - return 0; - return 1; + return false; + return true; } static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, @@ -1487,7 +1483,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_extent_item *item; struct btrfs_key key; @@ -1508,7 +1504,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, node->parent, node->ref_root, owner, offset, refs_to_add, extent_op); if ((ret < 0 && ret != -EAGAIN) || !ret) - goto out; + return ret; /* * Ok we had -EAGAIN which means we didn't have space to insert and @@ -1533,8 +1529,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (ret) btrfs_abort_transaction(trans, ret); -out: - btrfs_free_path(path); + return ret; } @@ -1631,7 +1626,7 @@ static int 
run_delayed_extent_op(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_extent_item *ei; struct extent_buffer *leaf; u32 item_size; @@ -1662,7 +1657,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, again: ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - goto out; + return ret; } else if (ret > 0) { if (metadata) { if (path->slots[0] > 0) { @@ -1679,8 +1674,8 @@ again: metadata = 0; key.objectid = head->bytenr; - key.offset = head->num_bytes; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = head->num_bytes; goto again; } } else { @@ -1688,7 +1683,7 @@ again: btrfs_err(fs_info, "missing extent item for extent %llu num_bytes %llu level %d", head->bytenr, head->num_bytes, head->level); - goto out; + return ret; } } @@ -1701,13 +1696,12 @@ again: "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); - goto out; + return ret; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); __run_delayed_extent_op(extent_op, leaf, ei); -out: - btrfs_free_path(path); + return ret; } @@ -2012,7 +2006,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; if (min_bytes == 0) { - max_count = delayed_refs->num_heads_ready; + /* + * We may be subject to a harmless race if some task is + * concurrently adding or removing a delayed ref, so silence + * KCSAN and similar tools. + */ + max_count = data_race(delayed_refs->num_heads_ready); min_bytes = U64_MAX; } @@ -2348,8 +2347,8 @@ static noinline int check_committed_ref(struct btrfs_inode *inode, int ret; key.objectid = bytenr; - key.offset = (u64)-1; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) @@ -2604,8 +2603,8 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - set_extent_bit(&trans->transaction->pinned_extents, bytenr, - bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); + btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr, + bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); return 0; } @@ -2824,34 +2823,63 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *block_group, *tmp; struct list_head *deleted_bgs; - struct extent_io_tree *unpin; + struct extent_io_tree *unpin = &trans->transaction->pinned_extents; + struct extent_state *cached_state = NULL; u64 start; u64 end; + int unpin_error = 0; int ret; - unpin = &trans->transaction->pinned_extents; - - while (!TRANS_ABORTED(trans)) { - struct extent_state *cached_state = NULL; + mutex_lock(&fs_info->unused_bg_unpin_mutex); + btrfs_find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, &cached_state); - mutex_lock(&fs_info->unused_bg_unpin_mutex); - if (!find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, &cached_state)) { - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - break; - } + while (!TRANS_ABORTED(trans) && cached_state) { + struct extent_state *next_state; if (btrfs_test_opt(fs_info, DISCARD_SYNC)) ret = btrfs_discard_extent(fs_info, start, end + 1 - start, NULL); - clear_extent_dirty(unpin, start, end, &cached_state); + next_state = btrfs_next_extent_state(unpin, cached_state); + 
btrfs_clear_extent_dirty(unpin, start, end, &cached_state); ret = unpin_extent_range(fs_info, start, end, true); - BUG_ON(ret); - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - free_extent_state(cached_state); - cond_resched(); + /* + * If we get an error unpinning an extent range, store the first + * error to return later after trying to unpin all ranges and do + * the sync discards. Our caller will abort the transaction + * (which already wrote new superblocks) and on the next mount + * the space will be available as it was pinned by in-memory + * only structures in this phase. + */ + if (ret) { + btrfs_err_rl(fs_info, +"failed to unpin extent range [%llu, %llu] when committing transaction %llu: %s (%d)", + start, end, trans->transid, + btrfs_decode_error(ret), ret); + if (!unpin_error) + unpin_error = ret; + } + + btrfs_free_extent_state(cached_state); + + if (need_resched()) { + btrfs_free_extent_state(next_state); + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + cond_resched(); + cached_state = NULL; + mutex_lock(&fs_info->unused_bg_unpin_mutex); + btrfs_find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, &cached_state); + } else { + cached_state = next_state; + if (cached_state) { + start = cached_state->start; + end = cached_state->end; + } + } } + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + btrfs_free_extent_state(cached_state); if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) { btrfs_discard_calc_delay(&fs_info->discard_ctl); @@ -2865,16 +2893,20 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) */ deleted_bgs = &trans->transaction->deleted_bgs; list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { - u64 trimmed = 0; - ret = -EROFS; if (!TRANS_ABORTED(trans)) - ret = btrfs_discard_extent(fs_info, - block_group->start, - block_group->length, - &trimmed); + ret = btrfs_discard_extent(fs_info, block_group->start, + block_group->length, NULL); + /* + * Not strictly necessary to lock, as the block_group should be + * read-only from btrfs_delete_unused_bgs(). + */ + ASSERT(block_group->ro); + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); + spin_unlock(&fs_info->unused_bgs_lock); + btrfs_unfreeze_block_group(block_group); btrfs_put_block_group(block_group); @@ -2886,7 +2918,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) } } - return 0; + return unpin_error; } /* @@ -3481,17 +3513,11 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans, WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(bg, buf->start, buf->len); - btrfs_free_reserved_bytes(bg, buf->len, 0); + btrfs_free_reserved_bytes(bg, buf->len, false); btrfs_put_block_group(bg); trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); out: - - /* - * Deleting the buffer, clear the corrupt flag since it doesn't - * matter anymore. 
- */ - clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); return 0; } @@ -4109,6 +4135,7 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info, static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, struct btrfs_key *ins, struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info, bool full_search) { struct btrfs_root *root = fs_info->chunk_root; @@ -4163,7 +4190,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return ret; } - ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, + ret = btrfs_chunk_alloc(trans, space_info, ffe_ctl->flags, CHUNK_ALLOC_FORCE_FOR_EXTENT); /* Do not bail out on ENOSPC since we can do more. */ @@ -4380,11 +4407,22 @@ static noinline int find_free_extent(struct btrfs_root *root, ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(root, ffe_ctl); + trace_btrfs_find_free_extent(root, ffe_ctl); space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); + if (btrfs_is_zoned(fs_info) && space_info) { + /* Use dedicated sub-space_info for dedicated block group users. */ + if (ffe_ctl->for_data_reloc) { + space_info = space_info->sub_group[0]; + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); + } else if (ffe_ctl->for_treelog) { + space_info = space_info->sub_group[0]; + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_TREELOG); + } + } if (!space_info) { - btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags); + btrfs_err(fs_info, "no space info for %llu, tree-log %d, relocation %d", + ffe_ctl->flags, ffe_ctl->for_treelog, ffe_ctl->for_data_reloc); return -ENOSPC; } @@ -4406,6 +4444,7 @@ static noinline int find_free_extent(struct btrfs_root *root, * picked out then we don't care that the block group is cached. */ if (block_group && block_group_bits(block_group, ffe_ctl->flags) && + block_group->space_info == space_info && block_group->cached != BTRFS_CACHE_NO) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || @@ -4431,7 +4470,7 @@ static noinline int find_free_extent(struct btrfs_root *root, } } search: - trace_find_free_extent_search_loop(root, ffe_ctl); + trace_btrfs_find_free_extent_search_loop(root, ffe_ctl); ffe_ctl->have_caching_bg = false; if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || ffe_ctl->index == 0) @@ -4483,7 +4522,7 @@ search: } have_block_group: - trace_find_free_extent_have_block_group(root, ffe_ctl, block_group); + trace_btrfs_find_free_extent_have_block_group(root, ffe_ctl, block_group); ffe_ctl->cached = btrfs_block_group_done(block_group); if (unlikely(!ffe_ctl->cached)) { ffe_ctl->have_caching_bg = true; @@ -4576,7 +4615,8 @@ loop: } up_read(&space_info->groups_sem); - ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search); + ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, space_info, + full_search); if (ret > 0) goto search; @@ -4698,8 +4738,8 @@ again: return ret; } -int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len, int delalloc) +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, + bool is_delalloc) { struct btrfs_block_group *cache; @@ -4711,7 +4751,7 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, } btrfs_add_free_space(cache, start, len); - btrfs_free_reserved_bytes(cache, len, delalloc); + btrfs_free_reserved_bytes(cache, len, is_delalloc); trace_btrfs_reserved_extent_free(fs_info, start, len); btrfs_put_block_group(cache); @@ -5069,17 +5109,17 @@ btrfs_init_new_buffer(struct 
btrfs_trans_handle *trans, struct btrfs_root *root, * EXTENT bit to differentiate dirty pages. */ if (buf->log_index == 0) - set_extent_bit(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, - EXTENT_DIRTY, NULL); + btrfs_set_extent_bit(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, + EXTENT_DIRTY, NULL); else - set_extent_bit(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, - EXTENT_NEW, NULL); + btrfs_set_extent_bit(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, + EXTENT_NEW, NULL); } else { buf->log_index = -1; - set_extent_bit(&trans->transaction->dirty_pages, buf->start, - buf->start + buf->len - 1, EXTENT_DIRTY, NULL); + btrfs_set_extent_bit(&trans->transaction->dirty_pages, buf->start, + buf->start + buf->len - 1, EXTENT_DIRTY, NULL); } /* this returns a buffer locked for blocking */ return buf; @@ -5185,7 +5225,7 @@ out_free_buf: btrfs_tree_unlock(buf); free_extent_buffer(buf); out_free_reserved: - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, false); out_unuse: btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize); return ERR_PTR(ret); @@ -5465,7 +5505,7 @@ static int check_ref_exists(struct btrfs_trans_handle *trans, { struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *head; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_extent_inline_ref *iref; int ret; bool exists = false; @@ -5482,7 +5522,6 @@ again: * If we get 0 then we found our reference, return 1, else * return the error if it's not -ENOENT; */ - btrfs_free_path(path); return (ret < 0 ) ? ret : 1; } @@ -5517,7 +5556,6 @@ again: mutex_unlock(&head->mutex); out: spin_unlock(&delayed_refs->lock); - btrfs_free_path(path); return exists ? 
1 : 0; } @@ -6285,7 +6323,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *parent) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct walk_control *wc; int level; int parent_level; @@ -6298,10 +6336,8 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return -ENOMEM; wc = kzalloc(sizeof(*wc), GFP_NOFS); - if (!wc) { - btrfs_free_path(path); + if (!wc) return -ENOMEM; - } btrfs_assert_tree_write_locked(parent); parent_level = btrfs_header_level(parent); @@ -6338,7 +6374,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, } kfree(wc); - btrfs_free_path(path); return ret; } @@ -6400,13 +6435,13 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) if (ret) break; - find_first_clear_extent_bit(&device->alloc_state, start, - &start, &end, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_find_first_clear_extent_bit(&device->alloc_state, start, + &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); /* Check if there are any CHUNK_* bits left */ if (start > device->total_bytes) { - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); btrfs_warn_in_rcu(fs_info, "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu", start, end - start + 1, @@ -6439,8 +6474,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) ret = btrfs_issue_discard(device->bdev, start, len, &bytes); if (!ret) - set_extent_bit(&device->alloc_state, start, - start + bytes - 1, CHUNK_TRIMMED, NULL); + btrfs_set_extent_bit(&device->alloc_state, start, + start + bytes - 1, CHUNK_TRIMMED, NULL); mutex_unlock(&fs_info->chunk_mutex); if (ret) diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index cfa52264f678..72914074c304 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -4,7 +4,6 @@ #define BTRFS_EXTENT_TREE_H #include <linux/types.h> -#include "misc.h" #include "block-group.h" #include "locking.h" @@ -150,8 +149,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, int slot); -int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len, int delalloc); +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, + bool is_delalloc); int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, const struct extent_buffer *eb); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b2fae67f8fa3..849199768664 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -96,6 +96,8 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) */ struct btrfs_bio_ctrl { struct btrfs_bio *bbio; + /* Last byte contained in bbio + 1 . 
*/ + loff_t next_file_offset; enum btrfs_compression_type compress_type; u32 len_to_oe_boundary; blk_opf_t opf; @@ -221,22 +223,17 @@ static void __process_folios_contig(struct address_space *mapping, } static noinline void unlock_delalloc_folio(const struct inode *inode, - const struct folio *locked_folio, + struct folio *locked_folio, u64 start, u64 end) { - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - ASSERT(locked_folio); - if (index == locked_folio->index && end_index == index) - return; __process_folios_contig(inode->i_mapping, locked_folio, start, end, PAGE_UNLOCK); } static noinline int lock_delalloc_folios(struct inode *inode, - const struct folio *locked_folio, + struct folio *locked_folio, u64 start, u64 end) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); @@ -246,9 +243,6 @@ static noinline int lock_delalloc_folios(struct inode *inode, u64 processed_end = start; struct folio_batch fbatch; - if (index == locked_folio->index && index == end_index) - return 0; - folio_batch_init(&fbatch); while (index <= end_index) { unsigned int found_folios, i; @@ -340,7 +334,7 @@ again: /* @delalloc_end can be -1, never go beyond @orig_end */ *end = min(delalloc_end, orig_end); - free_extent_state(cached_state); + btrfs_free_extent_state(cached_state); return false; } @@ -366,7 +360,7 @@ again: /* some of the folios are gone, lets avoid looping by * shortening the size of the delalloc range we're searching */ - free_extent_state(cached_state); + btrfs_free_extent_state(cached_state); cached_state = NULL; if (!loops) { max_bytes = PAGE_SIZE; @@ -379,13 +373,13 @@ again: } /* step three, lock the state bits for the whole range */ - lock_extent(tree, delalloc_start, delalloc_end, &cached_state); + btrfs_lock_extent(tree, delalloc_start, delalloc_end, &cached_state); /* then test to make sure it is all still delalloc */ - ret = test_range_bit(tree, delalloc_start, delalloc_end, - EXTENT_DELALLOC, cached_state); + ret = btrfs_test_range_bit(tree, delalloc_start, delalloc_end, + EXTENT_DELALLOC, cached_state); - unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); + btrfs_unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); if (!ret) { unlock_delalloc_folio(inode, locked_folio, delalloc_start, delalloc_end); @@ -403,7 +397,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached, u32 clear_bits, unsigned long page_ops) { - clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached); + btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached); __process_folios_contig(inode->vfs_inode.i_mapping, locked_folio, start, end, page_ops); @@ -425,14 +419,14 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); ASSERT(folio_pos(folio) <= start && - start + len <= folio_pos(folio) + PAGE_SIZE); + start + len <= folio_pos(folio) + folio_size(folio)); if (uptodate && btrfs_verify_folio(folio, start, len)) btrfs_folio_set_uptodate(fs_info, folio, start, len); else btrfs_folio_clear_uptodate(fs_info, folio, start, len); - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_is_subpage(fs_info, folio)) folio_unlock(folio); else btrfs_folio_end_lock(fs_info, folio, start, len); @@ -462,9 +456,6 @@ static void end_bbio_data_write(struct btrfs_bio *bbio) u64 start = folio_pos(folio) + fi.offset; u32 len = fi.length; - /* Only order 0 (single page) folios are allowed 
for data. */ - ASSERT(folio_order(folio) == 0); - /* Our read/write should always be sector aligned. */ if (!IS_ALIGNED(fi.offset, sectorsize)) btrfs_err(fs_info, @@ -488,11 +479,11 @@ static void end_bbio_data_write(struct btrfs_bio *bbio) static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) { ASSERT(folio_test_locked(folio)); - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_is_subpage(fs_info, folio)) return; ASSERT(folio_test_private(folio)); - btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE); + btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), folio_size(folio)); } /* @@ -512,43 +503,22 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; struct folio_iter fi; - const u32 sectorsize = fs_info->sectorsize; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_folio_all(fi, &bbio->bio) { bool uptodate = !bio->bi_status; struct folio *folio = fi.folio; struct inode *inode = folio->mapping->host; - u64 start; - u64 end; - u32 len; + u64 start = folio_pos(folio) + fi.offset; btrfs_debug(fs_info, "%s: bi_sector=%llu, err=%d, mirror=%u", __func__, bio->bi_iter.bi_sector, bio->bi_status, bbio->mirror_num); - /* - * We always issue full-sector reads, but if some block in a - * folio fails to read, blk_update_request() will advance - * bv_offset and adjust bv_len to compensate. Print a warning - * for unaligned offsets, and an error if they don't add up to - * a full sector. - */ - if (!IS_ALIGNED(fi.offset, sectorsize)) - btrfs_err(fs_info, - "partial page read in btrfs with offset %zu and length %zu", - fi.offset, fi.length); - else if (!IS_ALIGNED(fi.offset + fi.length, sectorsize)) - btrfs_info(fs_info, - "incomplete page read with offset %zu and length %zu", - fi.offset, fi.length); - - start = folio_pos(folio) + fi.offset; - end = start + fi.length - 1; - len = fi.length; if (likely(uptodate)) { + u64 end = start + fi.length - 1; loff_t i_size = i_size_read(inode); /* @@ -573,7 +543,7 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) } /* Update page status and unlock. */ - end_folio_read(folio, uptodate, start, len); + end_folio_read(folio, uptodate, start, fi.length); } bio_put(bio); } @@ -664,13 +634,10 @@ static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail) } static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, - struct folio *folio, u64 disk_bytenr, - unsigned int pg_offset) + u64 disk_bytenr, loff_t file_offset) { struct bio *bio = &bio_ctrl->bbio->bio; - struct bio_vec *bvec = bio_last_bvec_all(bio); const sector_t sector = disk_bytenr >> SECTOR_SHIFT; - struct folio *bv_folio = page_folio(bvec->bv_page); if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { /* @@ -681,19 +648,11 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, } /* - * The contig check requires the following conditions to be met: - * - * 1) The folios are belonging to the same inode - * This is implied by the call chain. - * - * 2) The range has adjacent logical bytenr - * - * 3) The range has adjacent file offset - * This is required for the usage of btrfs_bio->file_offset. + * To merge into a bio both the disk sector and the logical offset in + * the file need to be contiguous. 
*/ - return bio_end_sector(bio) == sector && - folio_pos(bv_folio) + bvec->bv_offset + bvec->bv_len == - folio_pos(folio) + pg_offset; + return bio_ctrl->next_file_offset == file_offset && + bio_end_sector(bio) == sector; } static void alloc_new_bio(struct btrfs_inode *inode, @@ -711,6 +670,7 @@ static void alloc_new_bio(struct btrfs_inode *inode, bbio->file_offset = file_offset; bio_ctrl->bbio = bbio; bio_ctrl->len_to_oe_boundary = U32_MAX; + bio_ctrl->next_file_offset = file_offset; /* Limit data write bios to the ordered boundary. */ if (bio_ctrl->wbc) { @@ -752,22 +712,21 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, size_t size, unsigned long pg_offset) { struct btrfs_inode *inode = folio_to_inode(folio); + loff_t file_offset = folio_pos(folio) + pg_offset; - ASSERT(pg_offset + size <= PAGE_SIZE); + ASSERT(pg_offset + size <= folio_size(folio)); ASSERT(bio_ctrl->end_io_func); if (bio_ctrl->bbio && - !btrfs_bio_is_contig(bio_ctrl, folio, disk_bytenr, pg_offset)) + !btrfs_bio_is_contig(bio_ctrl, disk_bytenr, file_offset)) submit_one_bio(bio_ctrl); do { u32 len = size; /* Allocate new bio if needed */ - if (!bio_ctrl->bbio) { - alloc_new_bio(inode, bio_ctrl, disk_bytenr, - folio_pos(folio) + pg_offset); - } + if (!bio_ctrl->bbio) + alloc_new_bio(inode, bio_ctrl, disk_bytenr, file_offset); /* Cap to the current ordered extent boundary if there is one. */ if (len > bio_ctrl->len_to_oe_boundary) { @@ -781,14 +740,15 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, submit_one_bio(bio_ctrl); continue; } + bio_ctrl->next_file_offset += len; if (bio_ctrl->wbc) - wbc_account_cgroup_owner(bio_ctrl->wbc, folio, - len); + wbc_account_cgroup_owner(bio_ctrl->wbc, folio, len); size -= len; pg_offset += len; disk_bytenr += len; + file_offset += len; /* * len_to_oe_boundary defaults to U32_MAX, which isn't folio or @@ -836,7 +796,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb, if (folio->mapping) lockdep_assert_held(&folio->mapping->i_private_lock); - if (fs_info->nodesize >= PAGE_SIZE) { + if (!btrfs_meta_is_subpage(fs_info)) { if (!folio_test_private(folio)) folio_attach_private(folio, eb); else @@ -870,7 +830,7 @@ int set_folio_extent_mapped(struct folio *folio) fs_info = folio_to_fs_info(folio); - if (btrfs_is_subpage(fs_info, folio->mapping)) + if (btrfs_is_subpage(fs_info, folio)) return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE); @@ -887,8 +847,8 @@ void clear_folio_extent_mapped(struct folio *folio) return; fs_info = folio_to_fs_info(folio); - if (btrfs_is_subpage(fs_info, folio->mapping)) - return btrfs_detach_subpage(fs_info, folio); + if (btrfs_is_subpage(fs_info, folio)) + return btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); folio_detach_private(folio); } @@ -903,13 +863,13 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode, if (*em_cached) { em = *em_cached; - if (extent_map_in_tree(em) && start >= em->start && - start < extent_map_end(em)) { + if (btrfs_extent_map_in_tree(em) && start >= em->start && + start < btrfs_extent_map_end(em)) { refcount_inc(&em->refs); return em; } - free_extent_map(em); + btrfs_free_extent_map(em); *em_cached = NULL; } @@ -935,16 +895,12 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 start = folio_pos(folio); - const u64 end = start + PAGE_SIZE - 1; - u64 cur 
= start; + const u64 end = start + folio_size(folio) - 1; u64 extent_offset; u64 last_byte = i_size_read(inode); - u64 block_start; struct extent_map *em; int ret = 0; - size_t pg_offset = 0; - size_t iosize; - size_t blocksize = fs_info->sectorsize; + const size_t blocksize = fs_info->sectorsize; ret = set_folio_extent_mapped(folio); if (ret < 0) { @@ -955,45 +911,49 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (folio_contains(folio, last_byte >> PAGE_SHIFT)) { size_t zero_offset = offset_in_folio(folio, last_byte); - if (zero_offset) { - iosize = folio_size(folio) - zero_offset; - folio_zero_range(folio, zero_offset, iosize); - } + if (zero_offset) + folio_zero_range(folio, zero_offset, + folio_size(folio) - zero_offset); } bio_ctrl->end_io_func = end_bbio_data_read; begin_folio_read(fs_info, folio); - while (cur <= end) { + for (u64 cur = start; cur <= end; cur += blocksize) { enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; + unsigned long pg_offset = offset_in_folio(folio, cur); bool force_bio_submit = false; u64 disk_bytenr; + u64 block_start; ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { - iosize = folio_size(folio) - pg_offset; - folio_zero_range(folio, pg_offset, iosize); - end_folio_read(folio, true, cur, iosize); + folio_zero_range(folio, pg_offset, end - cur + 1); + end_folio_read(folio, true, cur, end - cur + 1); break; } + if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { + end_folio_read(folio, true, cur, blocksize); + continue; + } em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { end_folio_read(folio, false, cur, end + 1 - cur); return PTR_ERR(em); } extent_offset = cur - em->start; - BUG_ON(extent_map_end(em) <= cur); + BUG_ON(btrfs_extent_map_end(em) <= cur); BUG_ON(end < cur); - compress_type = extent_map_compression(em); + compress_type = btrfs_extent_map_compression(em); - iosize = min(extent_map_end(em) - cur, end - cur + 1); - iosize = ALIGN(iosize, blocksize); if (compress_type != BTRFS_COMPRESS_NONE) disk_bytenr = em->disk_bytenr; else - disk_bytenr = extent_map_block_start(em) + extent_offset; - block_start = extent_map_block_start(em); + disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset; + if (em->flags & EXTENT_FLAG_PREALLOC) block_start = EXTENT_MAP_HOLE; + else + block_start = btrfs_extent_map_block_start(em); /* * If we have a file range that points to a compressed extent @@ -1037,23 +997,18 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (prev_em_start) *prev_em_start = em->start; - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - folio_zero_range(folio, pg_offset, iosize); - - end_folio_read(folio, true, cur, iosize); - cur = cur + iosize; - pg_offset += iosize; + folio_zero_range(folio, pg_offset, blocksize); + end_folio_read(folio, true, cur, blocksize); continue; } /* the get_extent function already copied into the folio */ if (block_start == EXTENT_MAP_INLINE) { - end_folio_read(folio, true, cur, iosize); - cur = cur + iosize; - pg_offset += iosize; + end_folio_read(folio, true, cur, blocksize); continue; } @@ -1064,15 +1019,190 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (force_bio_submit) submit_one_bio(bio_ctrl); - submit_extent_folio(bio_ctrl, disk_bytenr, folio, iosize, + submit_extent_folio(bio_ctrl, disk_bytenr, folio, 
blocksize, pg_offset); - cur = cur + iosize; - pg_offset += iosize; } - return 0; } +/* + * Check if we can skip waiting for the @ordered extent covering the block at @fileoff. + * + * @fileoff: Both input and output. + * Input as the file offset where the check should start at. + * Output as where the next check should start at, + * if the function returns true. + * + * Return true if we can skip to @fileoff. The caller needs to check the new + * @fileoff value to make sure it covers the full range, before skipping the + * full OE. + * + * Return false if we must wait for the ordered extent. + */ +static bool can_skip_one_ordered_range(struct btrfs_inode *inode, + struct btrfs_ordered_extent *ordered, + u64 *fileoff) +{ + const struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct folio *folio; + const u32 blocksize = fs_info->sectorsize; + u64 cur = *fileoff; + bool ret; + + folio = filemap_get_folio(inode->vfs_inode.i_mapping, cur >> PAGE_SHIFT); + + /* + * We should have locked the folio(s) for range [start, end], thus + * there must be a folio and it must be locked. + */ + ASSERT(!IS_ERR(folio)); + ASSERT(folio_test_locked(folio)); + + /* + * There are several cases for the folio and OE combination: + * + * 1) Folio has no private flag + * The OE has all its IO done but not yet finished, and folio got + * invalidated. + * + * Here we have to wait for the OE to finish, as it may contain the + * to-be-inserted data checksum. + * Without the data checksum inserted into the csum tree, read will + * just fail with missing csum. + */ + if (!folio_test_private(folio)) { + ret = false; + goto out; + } + + /* + * 2) The first block is DIRTY. + * + * This means the OE is created by some other folios whose file pos is + * before this one. And since we are holding the folio lock, the writeback + * of this folio cannot start. + * + * We must skip the whole OE, because it will never start until we + * finished our folio read and unlocked the folio. + */ + if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) { + u64 range_len = min(folio_pos(folio) + folio_size(folio), + ordered->file_offset + ordered->num_bytes) - cur; + + ret = true; + /* + * At least inside the folio, all the remaining blocks should + * also be dirty. + */ + ASSERT(btrfs_folio_test_dirty(fs_info, folio, cur, range_len)); + *fileoff = ordered->file_offset + ordered->num_bytes; + goto out; + } + + /* + * 3) The first block is uptodate. + * + * At least the first block can be skipped, but we are still not fully + * sure. E.g. if the OE has some other folios in the range that cannot + * be skipped. + * So we return true and update @fileoff to the OE/folio boundary. + */ + if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { + u64 range_len = min(folio_pos(folio) + folio_size(folio), + ordered->file_offset + ordered->num_bytes) - cur; + + /* + * The whole range to the OE end or folio boundary should also + * be uptodate. + */ + ASSERT(btrfs_folio_test_uptodate(fs_info, folio, cur, range_len)); + ret = true; + *fileoff = cur + range_len; + goto out; + } + + /* + * 4) The first block is not uptodate. + * + * This means the folio is invalidated after the writeback was finished, + * but by some other operations (e.g. block aligned buffered write) the + * folio is inserted into filemap. + * Very much the same as case 1). 
+ */ + ret = false; +out: + folio_put(folio); + return ret; +} + +static bool can_skip_ordered_extent(struct btrfs_inode *inode, + struct btrfs_ordered_extent *ordered, + u64 start, u64 end) +{ + const u64 range_end = min(end, ordered->file_offset + ordered->num_bytes - 1); + u64 cur = max(start, ordered->file_offset); + + while (cur < range_end) { + bool can_skip; + + can_skip = can_skip_one_ordered_range(inode, ordered, &cur); + if (!can_skip) + return false; + } + return true; +} + +/* + * Locking helper to make sure we get a stable view of extent maps for the + * involved range. + * + * This is for folio read paths (read and readahead), thus the involved range + * should have all the folios locked. + */ +static void lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state) +{ + u64 cur_pos; + + /* Caller must provide a valid @cached_state. */ + ASSERT(cached_state); + + /* The range must at least be page aligned, as all read paths are folio based. */ + ASSERT(IS_ALIGNED(start, PAGE_SIZE)); + ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE)); + +again: + btrfs_lock_extent(&inode->io_tree, start, end, cached_state); + cur_pos = start; + while (cur_pos < end) { + struct btrfs_ordered_extent *ordered; + + ordered = btrfs_lookup_ordered_range(inode, cur_pos, + end - cur_pos + 1); + /* + * No ordered extents in the range, and we hold the extent lock, + * no one can modify the extent maps in the range, we're safe to return. + */ + if (!ordered) + break; + + /* Check if we can skip waiting for the whole OE. */ + if (can_skip_ordered_extent(inode, ordered, start, end)) { + cur_pos = min(ordered->file_offset + ordered->num_bytes, + end + 1); + btrfs_put_ordered_extent(ordered); + continue; + } + + /* Now wait for the OE to finish. */ + btrfs_unlock_extent(&inode->io_tree, start, end, cached_state); + btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start); + btrfs_put_ordered_extent(ordered); + /* We have unlocked the whole range, restart from the beginning. 
*/ + goto again; + } +} + int btrfs_read_folio(struct file *file, struct folio *folio) { struct btrfs_inode *inode = folio_to_inode(folio); @@ -1083,11 +1213,11 @@ int btrfs_read_folio(struct file *file, struct folio *folio) struct extent_map *em_cached = NULL; int ret; - btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); + lock_extents_for_read(inode, start, end, &cached_state); ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); - unlock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); - free_extent_map(em_cached); + btrfs_free_extent_map(em_cached); /* * If btrfs_do_readpage() failed we will want to submit the assembled @@ -1105,7 +1235,7 @@ static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bit unsigned int start_bit; unsigned int nbits; - ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE); + ASSERT(start >= folio_start && start + len <= folio_start + folio_size(folio)); start_bit = (start - folio_start) >> fs_info->sectorsize_bits; nbits = len >> fs_info->sectorsize_bits; ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits)); @@ -1118,12 +1248,12 @@ static bool find_next_delalloc_bitmap(struct folio *folio, { struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); const u64 folio_start = folio_pos(folio); - const unsigned int bitmap_size = fs_info->sectors_per_page; + const unsigned int bitmap_size = btrfs_blocks_per_folio(fs_info, folio); unsigned int start_bit; unsigned int first_zero; unsigned int first_set; - ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE); + ASSERT(start >= folio_start && start < folio_start + folio_size(folio)); start_bit = (start - folio_start) >> fs_info->sectorsize_bits; first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit); @@ -1157,9 +1287,10 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, { struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode); struct writeback_control *wbc = bio_ctrl->wbc; - const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping); + const bool is_subpage = btrfs_is_subpage(fs_info, folio); const u64 page_start = folio_pos(folio); const u64 page_end = page_start + folio_size(folio) - 1; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); unsigned long delalloc_bitmap = 0; /* * Save the last found delalloc end. As the delalloc end can go beyond @@ -1184,14 +1315,14 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, int bit; /* Save the dirty bitmap as our submission bitmap will be a subset of it. 
*/ - if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { - ASSERT(fs_info->sectors_per_page > 1); + if (btrfs_is_subpage(fs_info, folio)) { + ASSERT(blocks_per_folio > 1); btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap); } else { bio_ctrl->submit_bitmap = 1; } - for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { u64 start = page_start + (bit << fs_info->sectorsize_bits); btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize); @@ -1264,7 +1395,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, btrfs_root_id(inode->root), btrfs_ino(inode), folio_pos(folio), - fs_info->sectors_per_page, + blocks_per_folio, &bio_ctrl->submit_bitmap, found_start, found_len, ret); } else { @@ -1272,8 +1403,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, * We've hit an error during previous delalloc range, * have to cleanup the remaining locked ranges. */ - unlock_extent(&inode->io_tree, found_start, - found_start + found_len - 1, NULL); + btrfs_unlock_extent(&inode->io_tree, found_start, + found_start + found_len - 1, NULL); unlock_delalloc_folio(&inode->vfs_inode, folio, found_start, found_start + found_len - 1); @@ -1309,7 +1440,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, unsigned int bitmap_size = min( (last_finished_delalloc_end - page_start) >> fs_info->sectorsize_bits, - fs_info->sectors_per_page); + blocks_per_folio); for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size) btrfs_mark_ordered_io_finished(inode, folio, @@ -1324,7 +1455,7 @@ out: delalloc_end = page_end; /* * delalloc_end is already one less than the total length, so - * we don't subtract one from PAGE_SIZE + * we don't subtract one from PAGE_SIZE. */ delalloc_to_write += DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE); @@ -1333,7 +1464,7 @@ out: * If all ranges are submitted asynchronously, we just need to account * for them here. 
*/ - if (bitmap_empty(&bio_ctrl->submit_bitmap, fs_info->sectors_per_page)) { + if (bitmap_empty(&bio_ctrl->submit_bitmap, blocks_per_folio)) { wbc->nr_to_write -= delalloc_to_write; return 1; } @@ -1379,19 +1510,19 @@ static int submit_one_sector(struct btrfs_inode *inode, return PTR_ERR(em); extent_offset = filepos - em->start; - em_end = extent_map_end(em); + em_end = btrfs_extent_map_end(em); ASSERT(filepos <= em_end); ASSERT(IS_ALIGNED(em->start, sectorsize)); ASSERT(IS_ALIGNED(em->len, sectorsize)); - block_start = extent_map_block_start(em); - disk_bytenr = extent_map_block_start(em) + extent_offset; + block_start = btrfs_extent_map_block_start(em); + disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset; - ASSERT(!extent_map_is_compressed(em)); + ASSERT(!btrfs_extent_map_is_compressed(em)); ASSERT(block_start != EXTENT_MAP_HOLE); ASSERT(block_start != EXTENT_MAP_INLINE); - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; /* @@ -1434,6 +1565,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, bool submitted_io = false; bool error = false; const u64 folio_start = folio_pos(folio); + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); u64 cur; int bit; int ret = 0; @@ -1442,21 +1574,23 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, start + len <= folio_start + folio_size(folio)); ret = btrfs_writepage_cow_fixup(folio); - if (ret) { + if (ret == -EAGAIN) { /* Fixup worker will requeue */ folio_redirty_for_writepage(bio_ctrl->wbc, folio); folio_unlock(folio); return 1; } + if (ret < 0) + return ret; for (cur = start; cur < start + len; cur += fs_info->sectorsize) set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap, - fs_info->sectors_per_page); + blocks_per_folio); bio_ctrl->end_io_func = end_bbio_data_write; - for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits); if (cur >= i_size) { @@ -1530,6 +1664,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl size_t pg_offset; loff_t i_size = i_size_read(&inode->vfs_inode); unsigned long end_index = i_size >> PAGE_SHIFT; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc); @@ -1543,7 +1678,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl return 0; } - if (folio->index == end_index) + if (folio_contains(folio, end_index)) folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset); /* @@ -1551,6 +1686,30 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl * The proper bitmap can only be initialized until writepage_delalloc(). */ bio_ctrl->submit_bitmap = (unsigned long)-1; + + /* + * If the page is dirty but without private set, it's marked dirty + * without informing the fs. + * Nowadays that is a bug, since the introduction of + * pin_user_pages*(). + * + * So here we check if the page has private set to rule out such + * case. + * But we also have a long history of relying on the COW fixup, + * so here we only enable this check for experimental builds until + * we're sure it's safe. 
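The conversions above replace the page-sized fs_info->sectors_per_page with btrfs_blocks_per_folio(), so the delalloc and submit bitmaps now cover whole (possibly large) folios. A minimal, userspace-style sketch of the index math behind set_delalloc_bitmap() and find_next_delalloc_bitmap(); the function name and plain integer parameters are illustrative, not kernel API:

#include <assert.h>

/*
 * Map a byte range inside a folio to a bit range in the per-folio block
 * bitmap, mirroring start_bit/nbits in set_delalloc_bitmap().  With 4K
 * blocks, sectorsize_bits would be 12.
 */
struct block_range {
	unsigned int start_bit;
	unsigned int nbits;
};

static struct block_range bytes_to_block_bits(unsigned long long folio_start,
					      unsigned long long start,
					      unsigned long long len,
					      unsigned int sectorsize_bits)
{
	struct block_range r;

	assert(start >= folio_start);
	r.start_bit = (start - folio_start) >> sectorsize_bits;
	r.nbits = len >> sectorsize_bits;
	return r;
}

For example, a 12K delalloc range starting 8K into a folio maps to start_bit = 2, nbits = 3 with 4K blocks.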
+ */ + if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && + unlikely(!folio_test_private(folio))) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_err_rl(fs_info, + "root %lld ino %llu folio %llu is marked dirty without notifying the fs", + inode->root->root_key.objectid, + btrfs_ino(inode), folio_pos(folio)); + ret = -EUCLEAN; + goto done; + } + ret = set_folio_extent_mapped(folio); if (ret < 0) goto done; @@ -1562,14 +1721,14 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl goto done; ret = extent_writepage_io(inode, folio, folio_pos(folio), - PAGE_SIZE, bio_ctrl, i_size); + folio_size(folio), bio_ctrl, i_size); if (ret == 1) return 0; if (ret < 0) btrfs_err_rl(fs_info, "failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d", btrfs_root_id(inode->root), btrfs_ino(inode), - folio_pos(folio), fs_info->sectors_per_page, + folio_pos(folio), blocks_per_folio, &bio_ctrl->submit_bitmap, ret); bio_ctrl->wbc->nr_to_write--; @@ -1615,8 +1774,18 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e */ spin_lock(&eb->refs_lock); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits); + unsigned long flags; + set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); spin_unlock(&eb->refs_lock); + + xas_lock_irqsave(&xas, flags); + xas_load(&xas); + xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); + xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); + xas_unlock_irqrestore(&xas, flags); + btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, @@ -1702,45 +1871,166 @@ static void set_btree_ioerr(struct extent_buffer *eb) } } +static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits); + unsigned long flags; + + xas_lock_irqsave(&xas, flags); + xas_load(&xas); + xas_set_mark(&xas, mark); + xas_unlock_irqrestore(&xas, flags); +} + +static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits); + unsigned long flags; + + xas_lock_irqsave(&xas, flags); + xas_load(&xas); + xas_clear_mark(&xas, mark); + xas_unlock_irqrestore(&xas, flags); +} + +static void buffer_tree_tag_for_writeback(struct btrfs_fs_info *fs_info, + unsigned long start, unsigned long end) +{ + XA_STATE(xas, &fs_info->buffer_tree, start); + unsigned int tagged = 0; + void *eb; + + xas_lock_irq(&xas); + xas_for_each_marked(&xas, eb, end, PAGECACHE_TAG_DIRTY) { + xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); + if (++tagged % XA_CHECK_SCHED) + continue; + xas_pause(&xas); + xas_unlock_irq(&xas); + cond_resched(); + xas_lock_irq(&xas); + } + xas_unlock_irq(&xas); +} + +struct eb_batch { + unsigned int nr; + unsigned int cur; + struct extent_buffer *ebs[PAGEVEC_SIZE]; +}; + +static inline bool eb_batch_add(struct eb_batch *batch, struct extent_buffer *eb) +{ + batch->ebs[batch->nr++] = eb; + return (batch->nr < PAGEVEC_SIZE); +} + +static inline void eb_batch_init(struct eb_batch *batch) +{ + batch->nr = 0; + batch->cur = 0; +} + +static inline struct extent_buffer *eb_batch_next(struct eb_batch *batch) +{ + if (batch->cur >= batch->nr) + return NULL; + return batch->ebs[batch->cur++]; +} + +static inline void eb_batch_release(struct eb_batch 
*batch) +{ + for (unsigned int i = 0; i < batch->nr; i++) + free_extent_buffer(batch->ebs[i]); + eb_batch_init(batch); +} + +static inline struct extent_buffer *find_get_eb(struct xa_state *xas, unsigned long max, + xa_mark_t mark) +{ + struct extent_buffer *eb; + +retry: + eb = xas_find_marked(xas, max, mark); + + if (xas_retry(xas, eb)) + goto retry; + + if (!eb) + return NULL; + + if (!atomic_inc_not_zero(&eb->refs)) { + xas_reset(xas); + goto retry; + } + + if (unlikely(eb != xas_reload(xas))) { + free_extent_buffer(eb); + xas_reset(xas); + goto retry; + } + + return eb; +} + +static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info, + unsigned long *start, + unsigned long end, xa_mark_t tag, + struct eb_batch *batch) +{ + XA_STATE(xas, &fs_info->buffer_tree, *start); + struct extent_buffer *eb; + + rcu_read_lock(); + while ((eb = find_get_eb(&xas, end, tag)) != NULL) { + if (!eb_batch_add(batch, eb)) { + *start = ((eb->start + eb->len) >> fs_info->sectorsize_bits); + goto out; + } + } + if (end == ULONG_MAX) + *start = ULONG_MAX; + else + *start = end + 1; +out: + rcu_read_unlock(); + + return batch->nr; +} + /* * The endio specific version which won't touch any unsafe spinlock in endio * context. */ static struct extent_buffer *find_extent_buffer_nolock( - const struct btrfs_fs_info *fs_info, u64 start) + struct btrfs_fs_info *fs_info, u64 start) { struct extent_buffer *eb; + unsigned long index = (start >> fs_info->sectorsize_bits); rcu_read_lock(); - eb = radix_tree_lookup(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits); - if (eb && atomic_inc_not_zero(&eb->refs)) { - rcu_read_unlock(); - return eb; - } + eb = xa_load(&fs_info->buffer_tree, index); + if (eb && !atomic_inc_not_zero(&eb->refs)) + eb = NULL; rcu_read_unlock(); - return NULL; + return eb; } static void end_bbio_meta_write(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; - struct btrfs_fs_info *fs_info = eb->fs_info; struct folio_iter fi; - u32 bio_offset = 0; if (bbio->bio.bi_status != BLK_STS_OK) set_btree_ioerr(eb); bio_for_each_folio_all(fi, &bbio->bio) { - u64 start = eb->start + bio_offset; - struct folio *folio = fi.folio; - u32 len = fi.length; - - btrfs_folio_clear_writeback(fs_info, folio, start, len); - bio_offset += len; + btrfs_meta_folio_clear_writeback(fi.folio, eb); } + buffer_tree_clear_mark(eb, PAGECACHE_TAG_WRITEBACK); clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); smp_mb__after_atomic(); wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); @@ -1792,199 +2082,56 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, wbc_init_bio(wbc, &bbio->bio); bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; - if (fs_info->nodesize < PAGE_SIZE) { - struct folio *folio = eb->folios[0]; - bool ret; + for (int i = 0; i < num_extent_folios(eb); i++) { + struct folio *folio = eb->folios[i]; + u64 range_start = max_t(u64, eb->start, folio_pos(folio)); + u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio), + eb->start + eb->len) - range_start; folio_lock(folio); - btrfs_subpage_set_writeback(fs_info, folio, eb->start, eb->len); - if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, - eb->len)) { - folio_clear_dirty_for_io(folio); - wbc->nr_to_write--; - } - ret = bio_add_folio(&bbio->bio, folio, eb->len, - eb->start - folio_pos(folio)); - ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio, eb->len); - folio_unlock(folio); - } else { - int num_folios = num_extent_folios(eb); - - for (int i = 0; i < 
num_folios; i++) { - struct folio *folio = eb->folios[i]; - bool ret; - - folio_lock(folio); - folio_clear_dirty_for_io(folio); - folio_start_writeback(folio); - ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); - ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio, eb->folio_size); + btrfs_meta_folio_clear_dirty(folio, eb); + btrfs_meta_folio_set_writeback(folio, eb); + if (!folio_test_dirty(folio)) wbc->nr_to_write -= folio_nr_pages(folio); - folio_unlock(folio); - } + bio_add_folio_nofail(&bbio->bio, folio, range_len, + offset_in_folio(folio, range_start)); + wbc_account_cgroup_owner(wbc, folio, range_len); + folio_unlock(folio); } btrfs_submit_bbio(bbio, 0); } /* - * Submit one subpage btree page. - * - * The main difference to submit_eb_page() is: - * - Page locking - * For subpage, we don't rely on page locking at all. + * Wait for all eb writeback in the given range to finish. * - * - Flush write bio - * We only flush bio if we may be unable to fit current extent buffers into - * current bio. - * - * Return >=0 for the number of submitted extent buffers. - * Return <0 for fatal error. + * @fs_info: The fs_info for this file system. + * @start: The offset of the range to start waiting on writeback. + * @end: The end of the range, inclusive. This is meant to be used in + * conjuction with wait_marked_extents, so this will usually be + * the_next_eb->start - 1. */ -static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc) +void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, + u64 end) { - struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); - int submitted = 0; - u64 folio_start = folio_pos(folio); - int bit_start = 0; - int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; - - /* Lock and write each dirty extent buffers in the range */ - while (bit_start < fs_info->sectors_per_page) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct eb_batch batch; + unsigned long start_index = (start >> fs_info->sectorsize_bits); + unsigned long end_index = (end >> fs_info->sectorsize_bits); + + eb_batch_init(&batch); + while (start_index <= end_index) { struct extent_buffer *eb; - unsigned long flags; - u64 start; + unsigned int nr_ebs; - /* - * Take private lock to ensure the subpage won't be detached - * in the meantime. - */ - spin_lock(&folio->mapping->i_private_lock); - if (!folio_test_private(folio)) { - spin_unlock(&folio->mapping->i_private_lock); + nr_ebs = buffer_tree_get_ebs_tag(fs_info, &start_index, end_index, + PAGECACHE_TAG_WRITEBACK, &batch); + if (!nr_ebs) break; - } - spin_lock_irqsave(&subpage->lock, flags); - if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * fs_info->sectors_per_page, - subpage->bitmaps)) { - spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&folio->mapping->i_private_lock); - bit_start++; - continue; - } - - start = folio_start + bit_start * fs_info->sectorsize; - bit_start += sectors_per_node; - - /* - * Here we just want to grab the eb without touching extra - * spin locks, so call find_extent_buffer_nolock(). - */ - eb = find_extent_buffer_nolock(fs_info, start); - spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&folio->mapping->i_private_lock); - - /* - * The eb has already reached 0 refs thus find_extent_buffer() - * doesn't return it. We don't need to write back such eb - * anyway. 
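write_one_eb() above (and read_extent_buffer_pages_nowait() later in this file) now walks num_extent_folios() and clamps the extent buffer's byte range to each folio before bio_add_folio_nofail(). A small standalone sketch of that clamping, with max_t/min_t spelled out as plain comparisons; the names and fixed-width types are illustrative only:

#include <stdint.h>

/*
 * Clamp [eb_start, eb_start + eb_len) to the folio covering
 * [folio_pos, folio_pos + folio_size), mirroring range_start/range_len
 * in write_one_eb().
 */
static void eb_range_in_folio(uint64_t eb_start, uint32_t eb_len,
			      uint64_t folio_pos, uint64_t folio_size,
			      uint64_t *range_start, uint32_t *range_len)
{
	uint64_t start = eb_start > folio_pos ? eb_start : folio_pos;
	uint64_t eb_end = eb_start + eb_len;
	uint64_t folio_end = folio_pos + folio_size;
	uint64_t end = eb_end < folio_end ? eb_end : folio_end;

	*range_start = start;
	*range_len = end - start;
}

For a nodesize-sized eb fully inside one large folio this yields the whole eb in a single iteration; for a 16K eb backed by four 4K folios, each iteration covers one 4K slice.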
- */ - if (!eb) - continue; - - if (lock_extent_buffer_for_io(eb, wbc)) { - write_one_eb(eb, wbc); - submitted++; - } - free_extent_buffer(eb); - } - return submitted; -} - -/* - * Submit all page(s) of one extent buffer. - * - * @page: the page of one extent buffer - * @eb_context: to determine if we need to submit this page, if current page - * belongs to this eb, we don't need to submit - * - * The caller should pass each page in their bytenr order, and here we use - * @eb_context to determine if we have submitted pages of one extent buffer. - * - * If we have, we just skip until we hit a new page that doesn't belong to - * current @eb_context. - * - * If not, we submit all the page(s) of the extent buffer. - * - * Return >0 if we have submitted the extent buffer successfully. - * Return 0 if we don't need to submit the page, as it's already submitted by - * previous call. - * Return <0 for fatal error. - */ -static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ctx) -{ - struct writeback_control *wbc = ctx->wbc; - struct address_space *mapping = folio->mapping; - struct extent_buffer *eb; - int ret; - - if (!folio_test_private(folio)) - return 0; - - if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) - return submit_eb_subpage(folio, wbc); - - spin_lock(&mapping->i_private_lock); - if (!folio_test_private(folio)) { - spin_unlock(&mapping->i_private_lock); - return 0; - } - - eb = folio_get_private(folio); - - /* - * Shouldn't happen and normally this would be a BUG_ON but no point - * crashing the machine for something we can survive anyway. - */ - if (WARN_ON(!eb)) { - spin_unlock(&mapping->i_private_lock); - return 0; - } - - if (eb == ctx->eb) { - spin_unlock(&mapping->i_private_lock); - return 0; - } - ret = atomic_inc_not_zero(&eb->refs); - spin_unlock(&mapping->i_private_lock); - if (!ret) - return 0; - - ctx->eb = eb; - ret = btrfs_check_meta_write_pointer(eb->fs_info, ctx); - if (ret) { - if (ret == -EBUSY) - ret = 0; - free_extent_buffer(eb); - return ret; - } - - if (!lock_extent_buffer_for_io(eb, wbc)) { - free_extent_buffer(eb); - return 0; - } - /* Implies write in zoned mode. */ - if (ctx->zoned_bg) { - /* Mark the last eb in the block group. */ - btrfs_schedule_zone_finish_bg(ctx->zoned_bg, eb); - ctx->zoned_bg->meta_write_pointer += eb->len; + while ((eb = eb_batch_next(&batch)) != NULL) + wait_on_extent_buffer_writeback(eb); + eb_batch_release(&batch); + cond_resched(); } - write_one_eb(eb, wbc); - free_extent_buffer(eb); - return 1; } int btree_write_cache_pages(struct address_space *mapping, @@ -1995,25 +2142,27 @@ int btree_write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0; int nr_to_write_done = 0; - struct folio_batch fbatch; - unsigned int nr_folios; - pgoff_t index; - pgoff_t end; /* Inclusive */ + struct eb_batch batch; + unsigned int nr_ebs; + unsigned long index; + unsigned long end; int scanned = 0; xa_mark_t tag; - folio_batch_init(&fbatch); + eb_batch_init(&batch); if (wbc->range_cyclic) { - index = mapping->writeback_index; /* Start from prev offset */ + index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->sectorsize_bits); end = -1; + /* * Start from the beginning does not need to cycle over the * range, mark it as scanned. 
*/ scanned = (index == 0); } else { - index = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; + index = (wbc->range_start >> fs_info->sectorsize_bits); + end = (wbc->range_end >> fs_info->sectorsize_bits); + scanned = 1; } if (wbc->sync_mode == WB_SYNC_ALL) @@ -2023,31 +2172,39 @@ int btree_write_cache_pages(struct address_space *mapping, btrfs_zoned_meta_io_lock(fs_info); retry: if (wbc->sync_mode == WB_SYNC_ALL) - tag_pages_for_writeback(mapping, index, end); + buffer_tree_tag_for_writeback(fs_info, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_folios = filemap_get_folios_tag(mapping, &index, end, - tag, &fbatch))) { - unsigned i; + (nr_ebs = buffer_tree_get_ebs_tag(fs_info, &index, end, tag, &batch))) { + struct extent_buffer *eb; - for (i = 0; i < nr_folios; i++) { - struct folio *folio = fbatch.folios[i]; + while ((eb = eb_batch_next(&batch)) != NULL) { + ctx.eb = eb; - ret = submit_eb_page(folio, &ctx); - if (ret == 0) + ret = btrfs_check_meta_write_pointer(eb->fs_info, &ctx); + if (ret) { + if (ret == -EBUSY) + ret = 0; + + if (ret) { + done = 1; + break; + } continue; - if (ret < 0) { - done = 1; - break; } - /* - * the filesystem may choose to bump up nr_to_write. - * We have to make sure to honor the new nr_to_write - * at any time - */ - nr_to_write_done = wbc->nr_to_write <= 0; + if (!lock_extent_buffer_for_io(eb, wbc)) + continue; + + /* Implies write in zoned mode. */ + if (ctx.zoned_bg) { + /* Mark the last eb in the block group. */ + btrfs_schedule_zone_finish_bg(ctx.zoned_bg, eb); + ctx.zoned_bg->meta_write_pointer += eb->len; + } + write_one_eb(eb, wbc); } - folio_batch_release(&fbatch); + nr_to_write_done = (wbc->nr_to_write <= 0); + eb_batch_release(&batch); cond_resched(); } if (!scanned && !done) { @@ -2192,10 +2349,8 @@ retry: done_index = folio_next_index(folio); /* * At this point we hold neither the i_pages lock nor - * the page lock: the page may be truncated or - * invalidated (changing page->mapping to NULL), - * or even swizzled back from swapper_space to - * tmpfs file mapping + * the folio lock: the folio may be truncated or + * invalidated (changing folio->mapping to NULL). */ if (!folio_trylock(folio)) { submit_write_bio(bio_ctrl, 0); @@ -2233,7 +2388,7 @@ retry: * regular submission. */ if (wbc->sync_mode != WB_SYNC_NONE || - btrfs_is_subpage(inode_to_fs_info(inode), mapping)) { + btrfs_is_subpage(inode_to_fs_info(inode), folio)) { if (folio_test_writeback(folio)) submit_write_bio(bio_ctrl, 0); folio_wait_writeback(folio); @@ -2314,8 +2469,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); while (cur <= end) { - u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); - u32 cur_len = cur_end + 1 - cur; + u64 cur_end; + u32 cur_len; struct folio *folio; folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT); @@ -2325,13 +2480,18 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f * code is just in case, but shouldn't actually be run. 
*/ if (IS_ERR(folio)) { + cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); + cur_len = cur_end + 1 - cur; btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, cur, cur_len, false); mapping_set_error(mapping, PTR_ERR(folio)); - cur = cur_end + 1; + cur = cur_end; continue; } + cur_end = min_t(u64, folio_pos(folio) + folio_size(folio) - 1, end); + cur_len = cur_end + 1 - cur; + ASSERT(folio_test_locked(folio)); if (pages_dirty && folio != locked_folio) ASSERT(folio_test_dirty(folio)); @@ -2390,15 +2550,15 @@ void btrfs_readahead(struct readahead_control *rac) struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; - btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); + lock_extents_for_read(inode, start, end, &cached_state); while ((folio = readahead_folio(rac)) != NULL) btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); - unlock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); if (em_cached) - free_extent_map(em_cached); + btrfs_free_extent_map(em_cached); submit_one_bio(&bio_ctrl); } @@ -2422,7 +2582,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, if (start > end) return 0; - lock_extent(tree, start, end, &cached_state); + btrfs_lock_extent(tree, start, end, &cached_state); folio_wait_writeback(folio); /* @@ -2430,46 +2590,54 @@ int extent_invalidate_folio(struct extent_io_tree *tree, * so here we only need to unlock the extent range to free any * existing extent state. */ - unlock_extent(tree, start, end, &cached_state); + btrfs_unlock_extent(tree, start, end, &cached_state); return 0; } /* - * a helper for release_folio, this tests for areas of the page that - * are locked or under IO and drops the related state bits if it is safe - * to drop the page. + * A helper for struct address_space_operations::release_folio, this tests for + * areas of the folio that are locked or under IO and drops the related state + * bits if it is safe to drop the folio. */ static bool try_release_extent_state(struct extent_io_tree *tree, struct folio *folio) { + struct extent_state *cached_state = NULL; u64 start = folio_pos(folio); - u64 end = start + PAGE_SIZE - 1; - bool ret; + u64 end = start + folio_size(folio) - 1; + u32 range_bits; + u32 clear_bits; + bool ret = false; + int ret2; - if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) { - ret = false; - } else { - u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | - EXTENT_DELALLOC_NEW | EXTENT_CTLBITS | - EXTENT_QGROUP_RESERVED); - int ret2; + btrfs_get_range_bits(tree, start, end, &range_bits, &cached_state); - /* - * At this point we can safely clear everything except the - * locked bit, the nodatasum bit and the delalloc new bit. - * The delalloc new bit will be cleared by ordered extent - * completion. - */ - ret2 = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL); + /* + * We can release the folio if it's locked only for ordered extent + * completion, since that doesn't require using the folio. + */ + if ((range_bits & EXTENT_LOCKED) && + !(range_bits & EXTENT_FINISHING_ORDERED)) + goto out; + + clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW | + EXTENT_CTLBITS | EXTENT_QGROUP_RESERVED | + EXTENT_FINISHING_ORDERED); + /* + * At this point we can safely clear everything except the locked, + * nodatasum, delalloc new and finishing ordered bits. The delalloc new + * bit will be cleared by ordered extent completion. 
+ */ + ret2 = btrfs_clear_extent_bit(tree, start, end, clear_bits, &cached_state); + /* + * If clear_extent_bit failed for enomem reasons, we can't allow the + * release to continue. + */ + if (ret2 == 0) + ret = true; +out: + btrfs_free_extent_state(cached_state); - /* if clear_extent_bit failed for enomem reasons, - * we can't allow the release to continue. - */ - if (ret2 < 0) - ret = false; - else - ret = true; - } return ret; } @@ -2481,7 +2649,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree, bool try_release_extent_mapping(struct folio *folio, gfp_t mask) { u64 start = folio_pos(folio); - u64 end = start + PAGE_SIZE - 1; + u64 end = start + folio_size(folio) - 1; struct btrfs_inode *inode = folio_to_inode(folio); struct extent_io_tree *io_tree = &inode->io_tree; @@ -2492,18 +2660,19 @@ bool try_release_extent_mapping(struct folio *folio, gfp_t mask) struct extent_map *em; write_lock(&extent_tree->lock); - em = lookup_extent_mapping(extent_tree, start, len); + em = btrfs_lookup_extent_mapping(extent_tree, start, len); if (!em) { write_unlock(&extent_tree->lock); break; } if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) { write_unlock(&extent_tree->lock); - free_extent_map(em); + btrfs_free_extent_map(em); break; } - if (test_range_bit_exists(io_tree, em->start, - extent_map_end(em) - 1, EXTENT_LOCKED)) + if (btrfs_test_range_bit_exists(io_tree, em->start, + btrfs_extent_map_end(em) - 1, + EXTENT_LOCKED)) goto next; /* * If it's not in the list of modified extents, used by a fast @@ -2530,15 +2699,15 @@ remove_em: * fsync performance for workloads with a data size that exceeds * or is close to the system's memory). */ - remove_extent_mapping(inode, em); + btrfs_remove_extent_mapping(inode, em); /* Once for the inode's extent map tree. */ - free_extent_map(em); + btrfs_free_extent_map(em); next: - start = extent_map_end(em); + start = btrfs_extent_map_end(em); write_unlock(&extent_tree->lock); /* Once for us, for the lookup_extent_mapping() reference. */ - free_extent_map(em); + btrfs_free_extent_map(em); if (need_resched()) { /* @@ -2577,6 +2746,7 @@ static bool folio_range_has_eb(struct folio *folio) static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio) { struct btrfs_fs_info *fs_info = eb->fs_info; + struct address_space *mapping = folio->mapping; const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); /* @@ -2584,21 +2754,20 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * be done under the i_private_lock. */ if (mapped) - spin_lock(&folio->mapping->i_private_lock); + spin_lock(&mapping->i_private_lock); if (!folio_test_private(folio)) { if (mapped) - spin_unlock(&folio->mapping->i_private_lock); + spin_unlock(&mapping->i_private_lock); return; } - if (fs_info->nodesize >= PAGE_SIZE) { + if (!btrfs_meta_is_subpage(fs_info)) { /* - * We do this since we'll remove the pages after we've - * removed the eb from the radix tree, so we could race - * and have this page now attached to the new eb. So - * only clear folio if it's still connected to - * this eb. + * We do this since we'll remove the pages after we've removed + * the eb from the xarray, so we could race and have this page + * now attached to the new eb. So only clear folio if it's + * still connected to this eb. 
*/ if (folio_test_private(folio) && folio_get_private(folio) == eb) { BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); @@ -2608,7 +2777,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo folio_detach_private(folio); } if (mapped) - spin_unlock(&folio->mapping->i_private_lock); + spin_unlock(&mapping->i_private_lock); return; } @@ -2618,7 +2787,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * attached to one dummy eb, no sharing. */ if (!mapped) { - btrfs_detach_subpage(fs_info, folio); + btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); return; } @@ -2629,9 +2798,9 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * page range and no unfinished IO. */ if (!folio_range_has_eb(folio)) - btrfs_detach_subpage(fs_info, folio); + btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); - spin_unlock(&folio->mapping->i_private_lock); + spin_unlock(&mapping->i_private_lock); } /* Release all folios attached to the extent buffer */ @@ -2646,9 +2815,6 @@ static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb) continue; detach_extent_buffer_folio(eb, folio); - - /* One for when we allocated the folio. */ - folio_put(folio); } } @@ -2662,15 +2828,14 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) kmem_cache_free(extent_buffer_cache, eb); } -static struct extent_buffer * -__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, - unsigned long len) +static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) { struct extent_buffer *eb = NULL; eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); eb->start = start; - eb->len = len; + eb->len = fs_info->nodesize; eb->fs_info = fs_info; init_rwsem(&eb->lock); @@ -2679,18 +2844,36 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); - ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE); + ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE); return eb; } +/* + * For use in eb allocation error cleanup paths, as btrfs_release_extent_buffer() + * does not call folio_put(), and we need to set the folios to NULL so that + * btrfs_release_extent_buffer() will not detach them a second time. + */ +static void cleanup_extent_buffer_folios(struct extent_buffer *eb) +{ + const int num_folios = num_extent_folios(eb); + + /* We canont use num_extent_folios() as loop bound as eb->folios changes. */ + for (int i = 0; i < num_folios; i++) { + ASSERT(eb->folios[i]); + detach_extent_buffer_folio(eb, eb->folios[i]); + folio_put(eb->folios[i]); + eb->folios[i] = NULL; + } +} + struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) { struct extent_buffer *new; - int num_folios = num_extent_folios(src); + int num_folios; int ret; - new = __alloc_extent_buffer(src->fs_info, src->start, src->len); + new = __alloc_extent_buffer(src->fs_info, src->start); if (new == NULL) return NULL; @@ -2702,78 +2885,78 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); ret = alloc_eb_folio_array(new, false); - if (ret) { - btrfs_release_extent_buffer(new); - return NULL; - } + if (ret) + goto release_eb; + ASSERT(num_extent_folios(src) == num_extent_folios(new), + "%d != %d", num_extent_folios(src), num_extent_folios(new)); + /* Explicitly use the cached num_extent value from now on. 
*/ + num_folios = num_extent_folios(src); for (int i = 0; i < num_folios; i++) { struct folio *folio = new->folios[i]; ret = attach_extent_buffer_folio(new, folio, NULL); - if (ret < 0) { - btrfs_release_extent_buffer(new); - return NULL; - } + if (ret < 0) + goto cleanup_folios; WARN_ON(folio_test_dirty(folio)); } + for (int i = 0; i < num_folios; i++) + folio_put(new->folios[i]); + copy_extent_buffer_full(new, src); set_extent_buffer_uptodate(new); return new; + +cleanup_folios: + cleanup_extent_buffer_folios(new); +release_eb: + btrfs_release_extent_buffer(new); + return NULL; } -struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len) +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) { struct extent_buffer *eb; - int num_folios = 0; int ret; - eb = __alloc_extent_buffer(fs_info, start, len); + eb = __alloc_extent_buffer(fs_info, start); if (!eb) return NULL; ret = alloc_eb_folio_array(eb, false); if (ret) - goto err; + goto release_eb; - num_folios = num_extent_folios(eb); - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL); if (ret < 0) - goto err; + goto cleanup_folios; } + for (int i = 0; i < num_extent_folios(eb); i++) + folio_put(eb->folios[i]); set_extent_buffer_uptodate(eb); btrfs_set_header_nritems(eb, 0); set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); return eb; -err: - for (int i = 0; i < num_folios; i++) { - if (eb->folios[i]) { - detach_extent_buffer_folio(eb, eb->folios[i]); - folio_put(eb->folios[i]); - } - } - kmem_cache_free(extent_buffer_cache, eb); - return NULL; -} -struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) -{ - return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize); +cleanup_folios: + cleanup_extent_buffer_folios(eb); +release_eb: + btrfs_release_extent_buffer(eb); + return NULL; } static void check_buffer_tree_ref(struct extent_buffer *eb) { int refs; /* - * The TREE_REF bit is first set when the extent_buffer is added - * to the radix tree. It is also reset, if unset, when a new reference - * is created by find_extent_buffer. + * The TREE_REF bit is first set when the extent_buffer is added to the + * xarray. It is also reset, if unset, when a new reference is created + * by find_extent_buffer. * * It is only cleared in two cases: freeing the last non-tree * reference to the extent_buffer when its STALE bit is set or @@ -2785,13 +2968,12 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) * conditions between the calls to check_buffer_tree_ref in those * codepaths and clearing TREE_REF in try_release_extent_buffer. * - * The actual lifetime of the extent_buffer in the radix tree is - * adequately protected by the refcount, but the TREE_REF bit and - * its corresponding reference are not. To protect against this - * class of races, we call check_buffer_tree_ref from the codepaths - * which trigger io. Note that once io is initiated, TREE_REF can no - * longer be cleared, so that is the moment at which any such race is - * best fixed. + * The actual lifetime of the extent_buffer in the xarray is adequately + * protected by the refcount, but the TREE_REF bit and its corresponding + * reference are not. To protect against this class of races, we call + * check_buffer_tree_ref() from the code paths which trigger io. 
Note that + * once io is initiated, TREE_REF can no longer be cleared, so that is + * the moment at which any such race is best fixed. */ refs = atomic_read(&eb->refs); if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) @@ -2805,11 +2987,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) static void mark_extent_buffer_accessed(struct extent_buffer *eb) { - int num_folios= num_extent_folios(eb); - check_buffer_tree_ref(eb); - for (int i = 0; i < num_folios; i++) + for (int i = 0; i < num_extent_folios(eb); i++) folio_mark_accessed(eb->folios[i]); } @@ -2842,10 +3022,10 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, return eb; } -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *eb, *exists = NULL; int ret; @@ -2857,32 +3037,34 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, return ERR_PTR(-ENOMEM); eb->fs_info = fs_info; again: - ret = radix_tree_preload(GFP_NOFS); - if (ret) { - exists = ERR_PTR(ret); - goto free_eb; + xa_lock_irq(&fs_info->buffer_tree); + exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits, + NULL, eb, GFP_NOFS); + if (xa_is_err(exists)) { + ret = xa_err(exists); + xa_unlock_irq(&fs_info->buffer_tree); + btrfs_release_extent_buffer(eb); + return ERR_PTR(ret); } - spin_lock(&fs_info->buffer_lock); - ret = radix_tree_insert(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits, eb); - spin_unlock(&fs_info->buffer_lock); - radix_tree_preload_end(); - if (ret == -EEXIST) { - exists = find_extent_buffer(fs_info, start); - if (exists) - goto free_eb; - else + if (exists) { + if (!atomic_inc_not_zero(&exists->refs)) { + /* The extent buffer is being freed, retry. */ + xa_unlock_irq(&fs_info->buffer_tree); goto again; + } + xa_unlock_irq(&fs_info->buffer_tree); + btrfs_release_extent_buffer(eb); + return exists; } + xa_unlock_irq(&fs_info->buffer_tree); check_buffer_tree_ref(eb); - set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); return eb; -free_eb: - btrfs_release_extent_buffer(eb); - return exists; -} +#else + /* Stub to avoid linker error when compiled with optimizations turned off. */ + return NULL; #endif +} static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, struct folio *folio) @@ -2892,11 +3074,11 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, lockdep_assert_held(&folio->mapping->i_private_lock); /* - * For subpage case, we completely rely on radix tree to ensure we - * don't try to insert two ebs for the same bytenr. So here we always - * return NULL and just continue. + * For subpage case, we completely rely on xarray to ensure we don't try + * to insert two ebs for the same bytenr. So here we always return NULL + * and just continue. 
*/ - if (fs_info->nodesize < PAGE_SIZE) + if (btrfs_meta_is_subpage(fs_info)) return NULL; /* Page not yet attached to an extent buffer */ @@ -2928,10 +3110,9 @@ static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) return true; } - if (fs_info->nodesize < PAGE_SIZE && - offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) { + if (fs_info->nodesize < PAGE_SIZE && !IS_ALIGNED(start, fs_info->nodesize)) { btrfs_err(fs_info, - "tree block crosses page boundary, start %llu nodesize %u", + "tree block is not nodesize aligned, start %llu nodesize %u", start, fs_info->nodesize); return true; } @@ -2967,7 +3148,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, struct btrfs_fs_info *fs_info = eb->fs_info; struct address_space *mapping = fs_info->btree_inode->i_mapping; const unsigned long index = eb->start >> PAGE_SHIFT; - struct folio *existing_folio = NULL; + struct folio *existing_folio; int ret; ASSERT(found_eb_ret); @@ -2976,6 +3157,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, ASSERT(eb->folios[i]); retry: + existing_folio = NULL; ret = filemap_add_folio(mapping, eb->folios[i], index + i, GFP_NOFS | __GFP_NOFAIL); if (!ret) @@ -2983,10 +3165,8 @@ retry: existing_folio = filemap_lock_folio(mapping, index + i); /* The page cache only exists for a very short time, just retry. */ - if (IS_ERR(existing_folio)) { - existing_folio = NULL; + if (IS_ERR(existing_folio)) goto retry; - } /* For now, we should only have single-page folios for btree inode. */ ASSERT(folio_nr_pages(existing_folio) == 1); @@ -2999,7 +3179,7 @@ retry: finish: spin_lock(&mapping->i_private_lock); - if (existing_folio && fs_info->nodesize < PAGE_SIZE) { + if (existing_folio && btrfs_meta_is_subpage(fs_info)) { /* We're going to reuse the existing page, can drop our folio now. */ __free_page(folio_page(eb->folios[i], 0)); eb->folios[i] = existing_folio; @@ -3027,7 +3207,7 @@ finish: /* * To inform we have an extra eb under allocation, so that * detach_extent_buffer_page() won't release the folio private when the - * eb hasn't been inserted into radix tree yet. + * eb hasn't been inserted into the xarray yet. * * The ref will be decreased when the eb releases the page, in * detach_extent_buffer_page(). Thus needs no special handling in the @@ -3041,8 +3221,6 @@ finish: struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level) { - unsigned long len = fs_info->nodesize; - int num_folios; int attached = 0; struct extent_buffer *eb; struct extent_buffer *existing_eb = NULL; @@ -3070,7 +3248,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, if (eb) return eb; - eb = __alloc_extent_buffer(fs_info, start, len); + eb = __alloc_extent_buffer(fs_info, start); if (!eb) return ERR_PTR(-ENOMEM); @@ -3090,8 +3268,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * The memory will be freed by attach_extent_buffer_page() or freed * manually if we exit earlier. */ - if (fs_info->nodesize < PAGE_SIZE) { - prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); + if (btrfs_meta_is_subpage(fs_info)) { + prealloc = btrfs_alloc_subpage(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA); if (IS_ERR(prealloc)) { ret = PTR_ERR(prealloc); goto out; @@ -3106,9 +3284,8 @@ reallocate: goto out; } - num_folios = num_extent_folios(eb); /* Attach all pages to the filemap. 
*/ - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio; ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb); @@ -3137,7 +3314,7 @@ reallocate: * using 0-order folios. */ if (unlikely(ret == -EAGAIN)) { - ASSERT(0); + DEBUG_WARN("folio order mismatch between new eb and filemap"); goto reallocate; } attached++; @@ -3148,7 +3325,7 @@ reallocate: * and free the allocated page. */ folio = eb->folios[i]; - WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len)); + WARN_ON(btrfs_meta_folio_test_dirty(folio, eb)); /* * Check if the current page is physically contiguous with previous eb @@ -3159,15 +3336,14 @@ reallocate: if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0)) page_contig = false; - if (!btrfs_folio_test_uptodate(fs_info, folio, eb->start, eb->len)) + if (!btrfs_meta_folio_test_uptodate(folio, eb)) uptodate = 0; /* * We can't unlock the pages just yet since the extent buffer - * hasn't been properly inserted in the radix tree, this - * opens a race with btree_release_folio which can free a page - * while we are still filling in all pages for the buffer and - * we could crash. + * hasn't been properly inserted into the xarray, this opens a + * race with btree_release_folio() which can free a page while we + * are still filling in all pages for the buffer and we could crash. */ } if (uptodate) @@ -3176,34 +3352,42 @@ reallocate: if (page_contig) eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start); again: - ret = radix_tree_preload(GFP_NOFS); - if (ret) + xa_lock_irq(&fs_info->buffer_tree); + existing_eb = __xa_cmpxchg(&fs_info->buffer_tree, + start >> fs_info->sectorsize_bits, NULL, eb, + GFP_NOFS); + if (xa_is_err(existing_eb)) { + ret = xa_err(existing_eb); + xa_unlock_irq(&fs_info->buffer_tree); goto out; - - spin_lock(&fs_info->buffer_lock); - ret = radix_tree_insert(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits, eb); - spin_unlock(&fs_info->buffer_lock); - radix_tree_preload_end(); - if (ret == -EEXIST) { - ret = 0; - existing_eb = find_extent_buffer(fs_info, start); - if (existing_eb) - goto out; - else + } + if (existing_eb) { + if (!atomic_inc_not_zero(&existing_eb->refs)) { + xa_unlock_irq(&fs_info->buffer_tree); goto again; + } + xa_unlock_irq(&fs_info->buffer_tree); + goto out; } + xa_unlock_irq(&fs_info->buffer_tree); + /* add one reference for the tree */ check_buffer_tree_ref(eb); - set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); /* * Now it's safe to unlock the pages because any calls to * btree_release_folio will correctly detect that a page belongs to a * live buffer and won't free them prematurely. */ - for (int i = 0; i < num_folios; i++) + for (int i = 0; i < num_extent_folios(eb); i++) { folio_unlock(eb->folios[i]); + /* + * A folio that has been added to an address_space mapping + * should not continue holding the refcount from its original + * allocation indefinitely. + */ + folio_put(eb->folios[i]); + } return eb; out: @@ -3217,26 +3401,22 @@ out: * want that to grab this eb, as we're getting ready to free it. So we * have to detach it first and then unlock it. * - * We have to drop our reference and NULL it out here because in the - * subpage case detaching does a btrfs_folio_dec_eb_refs() for our eb. - * Below when we call btrfs_release_extent_buffer() we will call - * detach_extent_buffer_folio() on our remaining pages in the !subpage - * case. 
If we left eb->folios[i] populated in the subpage case we'd - * double put our reference and be super sad. + * Note: the bounds is num_extent_pages() as we need to go through all slots. */ - for (int i = 0; i < attached; i++) { - ASSERT(eb->folios[i]); - detach_extent_buffer_folio(eb, eb->folios[i]); - folio_unlock(eb->folios[i]); - folio_put(eb->folios[i]); + for (int i = 0; i < num_extent_pages(eb); i++) { + struct folio *folio = eb->folios[i]; + + if (i < attached) { + ASSERT(folio); + detach_extent_buffer_folio(eb, folio); + folio_unlock(folio); + } else if (!folio) { + continue; + } + + folio_put(folio); eb->folios[i] = NULL; } - /* - * Now all pages of that extent buffer is unmapped, set UNMAPPED flag, - * so it can be cleaned up without utilizing page->mapping. - */ - set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); - btrfs_release_extent_buffer(eb); if (ret < 0) return ERR_PTR(ret); @@ -3259,18 +3439,27 @@ static int release_extent_buffer(struct extent_buffer *eb) WARN_ON(atomic_read(&eb->refs) == 0); if (atomic_dec_and_test(&eb->refs)) { - if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { - struct btrfs_fs_info *fs_info = eb->fs_info; + struct btrfs_fs_info *fs_info = eb->fs_info; - spin_unlock(&eb->refs_lock); + spin_unlock(&eb->refs_lock); - spin_lock(&fs_info->buffer_lock); - radix_tree_delete(&fs_info->buffer_radix, - eb->start >> fs_info->sectorsize_bits); - spin_unlock(&fs_info->buffer_lock); - } else { - spin_unlock(&eb->refs_lock); - } + /* + * We're erasing, theoretically there will be no allocations, so + * just use GFP_ATOMIC. + * + * We use cmpxchg instead of erase because we do not know if + * this eb is actually in the tree or not, we could be cleaning + * up an eb that we allocated but never inserted into the tree. + * Thus use cmpxchg to remove it from the tree if it is there, + * or leave the other entry if this isn't in the tree. + * + * The documentation says that putting a NULL value is the same + * as erase as long as XA_FLAGS_ALLOC is not set, which it isn't + * in this case. + */ + xa_cmpxchg_irq(&fs_info->buffer_tree, + eb->start >> fs_info->sectorsize_bits, eb, NULL, + GFP_ATOMIC); btrfs_leak_debug_del_eb(eb); /* Should be safe to release folios at this point. */ @@ -3333,38 +3522,21 @@ void free_extent_buffer_stale(struct extent_buffer *eb) release_extent_buffer(eb); } -static void btree_clear_folio_dirty(struct folio *folio) +static void btree_clear_folio_dirty_tag(struct folio *folio) { - ASSERT(folio_test_dirty(folio)); + ASSERT(!folio_test_dirty(folio)); ASSERT(folio_test_locked(folio)); - folio_clear_dirty_for_io(folio); xa_lock_irq(&folio->mapping->i_pages); if (!folio_test_dirty(folio)) - __xa_clear_mark(&folio->mapping->i_pages, - folio_index(folio), PAGECACHE_TAG_DIRTY); + __xa_clear_mark(&folio->mapping->i_pages, folio->index, + PAGECACHE_TAG_DIRTY); xa_unlock_irq(&folio->mapping->i_pages); } -static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) -{ - struct btrfs_fs_info *fs_info = eb->fs_info; - struct folio *folio = eb->folios[0]; - bool last; - - /* btree_clear_folio_dirty() needs page locked. 
*/ - folio_lock(folio); - last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len); - if (last) - btree_clear_folio_dirty(folio); - folio_unlock(folio); - WARN_ON(atomic_read(&eb->refs) == 0); -} - void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios; btrfs_assert_tree_write_locked(eb); @@ -3388,20 +3560,20 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) return; + buffer_tree_clear_mark(eb, PAGECACHE_TAG_DIRTY); percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, fs_info->dirty_metadata_batch); - if (eb->fs_info->nodesize < PAGE_SIZE) - return clear_subpage_extent_buffer_dirty(eb); - - num_folios = num_extent_folios(eb); - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; + bool last; if (!folio_test_dirty(folio)) continue; folio_lock(folio); - btree_clear_folio_dirty(folio); + last = btrfs_meta_folio_clear_and_test_dirty(folio, eb); + if (last) + btree_clear_folio_dirty_tag(folio); folio_unlock(folio); } WARN_ON(atomic_read(&eb->refs) == 0); @@ -3409,37 +3581,35 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, void set_extent_buffer_dirty(struct extent_buffer *eb) { - int num_folios; bool was_dirty; check_buffer_tree_ref(eb); was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - num_folios = num_extent_folios(eb); WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); if (!was_dirty) { - bool subpage = eb->fs_info->nodesize < PAGE_SIZE; + bool subpage = btrfs_meta_is_subpage(eb->fs_info); /* * For subpage case, we can have other extent buffers in the - * same page, and in clear_subpage_extent_buffer_dirty() we + * same page, and in clear_extent_buffer_dirty() we * have to clear page dirty without subpage lock held. * This can cause race where our page gets dirty cleared after * we just set it. * - * Thankfully, clear_subpage_extent_buffer_dirty() has locked + * Thankfully, clear_extent_buffer_dirty() has locked * its page for other reasons, we can use page lock to prevent * the above race. */ if (subpage) folio_lock(eb->folios[0]); - for (int i = 0; i < num_folios; i++) - btrfs_folio_set_dirty(eb->fs_info, eb->folios[i], - eb->start, eb->len); + for (int i = 0; i < num_extent_folios(eb); i++) + btrfs_meta_folio_set_dirty(eb->folios[i], eb); + buffer_tree_set_mark(eb, PAGECACHE_TAG_DIRTY); if (subpage) folio_unlock(eb->folios[0]); percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, @@ -3447,54 +3617,31 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) eb->fs_info->dirty_metadata_batch); } #ifdef CONFIG_BTRFS_DEBUG - for (int i = 0; i < num_folios; i++) + for (int i = 0; i < num_extent_folios(eb); i++) ASSERT(folio_test_dirty(eb->folios[i])); #endif } void clear_extent_buffer_uptodate(struct extent_buffer *eb) { - struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios = num_extent_folios(eb); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; if (!folio) continue; - /* - * This is special handling for metadata subpage, as regular - * btrfs_is_subpage() can not handle cloned/dummy metadata. 
- */ - if (fs_info->nodesize >= PAGE_SIZE) - folio_clear_uptodate(folio); - else - btrfs_subpage_clear_uptodate(fs_info, folio, - eb->start, eb->len); + btrfs_meta_folio_clear_uptodate(folio, eb); } } void set_extent_buffer_uptodate(struct extent_buffer *eb) { - struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios = num_extent_folios(eb); set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - for (int i = 0; i < num_folios; i++) { - struct folio *folio = eb->folios[i]; - - /* - * This is special handling for metadata subpage, as regular - * btrfs_is_subpage() can not handle cloned/dummy metadata. - */ - if (fs_info->nodesize >= PAGE_SIZE) - folio_mark_uptodate(folio); - else - btrfs_subpage_set_uptodate(fs_info, folio, - eb->start, eb->len); - } + for (int i = 0; i < num_extent_folios(eb); i++) + btrfs_meta_folio_set_uptodate(eb->folios[i], eb); } static void clear_extent_buffer_reading(struct extent_buffer *eb) @@ -3507,10 +3654,7 @@ static void clear_extent_buffer_reading(struct extent_buffer *eb) static void end_bbio_meta_read(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; - struct btrfs_fs_info *fs_info = eb->fs_info; bool uptodate = !bbio->bio.bi_status; - struct folio_iter fi; - u32 bio_offset = 0; /* * If the extent buffer is marked UPTODATE before the read operation @@ -3525,25 +3669,10 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio) btrfs_validate_extent_buffer(eb, &bbio->parent_check) < 0) uptodate = false; - if (uptodate) { + if (uptodate) set_extent_buffer_uptodate(eb); - } else { + else clear_extent_buffer_uptodate(eb); - set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); - } - - bio_for_each_folio_all(fi, &bbio->bio) { - struct folio *folio = fi.folio; - u64 start = eb->start + bio_offset; - u32 len = fi.length; - - if (uptodate) - btrfs_folio_set_uptodate(fs_info, folio, start, len); - else - btrfs_folio_clear_uptodate(fs_info, folio, start, len); - - bio_offset += len; - } clear_extent_buffer_reading(eb); free_extent_buffer(eb); @@ -3555,7 +3684,6 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, const struct btrfs_tree_parent_check *check) { struct btrfs_bio *bbio; - bool ret; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -3583,7 +3711,6 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, return 0; } - clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = 0; check_buffer_tree_ref(eb); atomic_inc(&eb->refs); @@ -3595,19 +3722,14 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; memcpy(&bbio->parent_check, check, sizeof(*check)); - if (eb->fs_info->nodesize < PAGE_SIZE) { - ret = bio_add_folio(&bbio->bio, eb->folios[0], eb->len, - eb->start - folio_pos(eb->folios[0])); - ASSERT(ret); - } else { - int num_folios = num_extent_folios(eb); - - for (int i = 0; i < num_folios; i++) { - struct folio *folio = eb->folios[i]; + for (int i = 0; i < num_extent_folios(eb); i++) { + struct folio *folio = eb->folios[i]; + u64 range_start = max_t(u64, eb->start, folio_pos(folio)); + u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio), + eb->start + eb->len) - range_start; - ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); - ASSERT(ret); - } + bio_add_folio_nofail(&bbio->bio, folio, range_len, + offset_in_folio(folio, range_start)); } btrfs_submit_bbio(bbio, mirror_num); return 0; @@ -3634,7 +3756,7 @@ static bool report_eb_range(const struct 
extent_buffer *eb, unsigned long start, btrfs_warn(eb->fs_info, "access to eb bytenr %llu len %u out of range start %lu len %lu", eb->start, eb->len, start, len); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); return true; } @@ -3796,7 +3918,7 @@ static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i) if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) return; - if (fs_info->nodesize < PAGE_SIZE) { + if (btrfs_meta_is_subpage(fs_info)) { folio = eb->folios[0]; ASSERT(i == 0); if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio, @@ -4170,71 +4292,17 @@ void memmove_extent_buffer(const struct extent_buffer *dst, } } -#define GANG_LOOKUP_SIZE 16 -static struct extent_buffer *get_next_extent_buffer( - const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr) -{ - struct extent_buffer *gang[GANG_LOOKUP_SIZE]; - struct extent_buffer *found = NULL; - u64 folio_start = folio_pos(folio); - u64 cur = folio_start; - - ASSERT(in_range(bytenr, folio_start, PAGE_SIZE)); - lockdep_assert_held(&fs_info->buffer_lock); - - while (cur < folio_start + PAGE_SIZE) { - int ret; - int i; - - ret = radix_tree_gang_lookup(&fs_info->buffer_radix, - (void **)gang, cur >> fs_info->sectorsize_bits, - min_t(unsigned int, GANG_LOOKUP_SIZE, - PAGE_SIZE / fs_info->nodesize)); - if (ret == 0) - goto out; - for (i = 0; i < ret; i++) { - /* Already beyond page end */ - if (gang[i]->start >= folio_start + PAGE_SIZE) - goto out; - /* Found one */ - if (gang[i]->start >= bytenr) { - found = gang[i]; - goto out; - } - } - cur = gang[ret - 1]->start + gang[ret - 1]->len; - } -out: - return found; -} - static int try_release_subpage_extent_buffer(struct folio *folio) { struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); - u64 cur = folio_pos(folio); - const u64 end = cur + PAGE_SIZE; + struct extent_buffer *eb; + unsigned long start = (folio_pos(folio) >> fs_info->sectorsize_bits); + unsigned long index = start; + unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1; int ret; - while (cur < end) { - struct extent_buffer *eb = NULL; - - /* - * Unlike try_release_extent_buffer() which uses folio private - * to grab buffer, for subpage case we rely on radix tree, thus - * we need to ensure radix tree consistency. - * - * We also want an atomic snapshot of the radix tree, thus go - * with spinlock rather than RCU. - */ - spin_lock(&fs_info->buffer_lock); - eb = get_next_extent_buffer(fs_info, folio, cur); - if (!eb) { - /* No more eb in the page range after or at cur */ - spin_unlock(&fs_info->buffer_lock); - break; - } - cur = eb->start + eb->len; - + xa_lock_irq(&fs_info->buffer_tree); + xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) { /* * The same as try_release_extent_buffer(), to ensure the eb * won't disappear out from under us. @@ -4242,10 +4310,9 @@ static int try_release_subpage_extent_buffer(struct folio *folio) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); - spin_unlock(&fs_info->buffer_lock); - break; + continue; } - spin_unlock(&fs_info->buffer_lock); + xa_unlock_irq(&fs_info->buffer_tree); /* * If tree ref isn't set then we know the ref on this eb is a @@ -4263,7 +4330,10 @@ static int try_release_subpage_extent_buffer(struct folio *folio) * release_extent_buffer() will release the refs_lock. 
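try_release_subpage_extent_buffer() above now walks the buffer_tree directly with xa_for_each_range(); as everywhere else in this patch, the xarray is indexed by bytenr >> sectorsize_bits rather than by page index. A tiny sketch of that index conversion, with plain integer parameters standing in for the folio helpers:

/*
 * Byte range of a folio -> slot range in the buffer_tree, mirroring
 * start/end in try_release_subpage_extent_buffer() (which uses PAGE_SIZE
 * as the length, since subpage metadata folios are single pages).
 */
static void folio_to_buffer_tree_range(unsigned long long folio_pos,
				       unsigned long long len,
				       unsigned int sectorsize_bits,
				       unsigned long *start, unsigned long *end)
{
	*start = folio_pos >> sectorsize_bits;
	*end = *start + (len >> sectorsize_bits) - 1;
}

With 4K sectors, a 4K page covers a single slot and a 64K range covers sixteen.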
*/ release_extent_buffer(eb); + xa_lock_irq(&fs_info->buffer_tree); } + xa_unlock_irq(&fs_info->buffer_tree); + /* * Finally to check if we have cleared folio private, as if we have * released all ebs in the page, the folio private should be cleared now. @@ -4282,7 +4352,7 @@ int try_release_extent_buffer(struct folio *folio) { struct extent_buffer *eb; - if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) + if (btrfs_meta_is_subpage(folio_to_fs_info(folio))) return try_release_subpage_extent_buffer(folio); /* diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6c5328bfabc2..e36e8d6a00bc 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -38,16 +38,10 @@ struct btrfs_tree_parent_check; enum { EXTENT_BUFFER_UPTODATE, EXTENT_BUFFER_DIRTY, - EXTENT_BUFFER_CORRUPT, - /* this got triggered by readahead */ - EXTENT_BUFFER_READAHEAD, EXTENT_BUFFER_TREE_REF, EXTENT_BUFFER_STALE, EXTENT_BUFFER_WRITEBACK, - /* read IO error */ - EXTENT_BUFFER_READ_ERR, EXTENT_BUFFER_UNMAPPED, - EXTENT_BUFFER_IN_TREE, /* write IO error */ EXTENT_BUFFER_WRITE_ERR, /* Indicate the extent buffer is written zeroed out (for zoned) */ @@ -79,7 +73,7 @@ enum { * single word in a bitmap may straddle two pages in the extent buffer. */ #define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE) -#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1) +#define BYTE_MASK ((1U << BITS_PER_BYTE) - 1) #define BITMAP_FIRST_BYTE_MASK(start) \ ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK) #define BITMAP_LAST_BYTE_MASK(nbits) \ @@ -246,14 +240,13 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); +void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); void btrfs_readahead(struct readahead_control *rac); int set_folio_extent_mapped(struct folio *folio); void clear_folio_extent_mapped(struct folio *folio); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level); -struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src); @@ -276,7 +269,8 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level); void btrfs_readahead_node_child(struct extent_buffer *node, int slot); -static inline int num_extent_pages(const struct extent_buffer *eb) +/* Note: this can be used in for loops without caching the value in a variable. */ +static inline int __pure num_extent_pages(const struct extent_buffer *eb) { /* * For sectorsize == PAGE_SIZE case, since nodesize is always aligned to @@ -294,9 +288,13 @@ static inline int num_extent_pages(const struct extent_buffer *eb) * As we can have either one large folio covering the whole eb * (either nodesize <= PAGE_SIZE, or high order folio), or multiple * single-paged folios. + * + * Note: this can be used in for loops without caching the value in a variable. 
*/ -static inline int num_extent_folios(const struct extent_buffer *eb) +static inline int __pure num_extent_folios(const struct extent_buffer *eb) { + if (!eb->folios[0]) + return 0; if (folio_order(eb->folios[0])) return 1; return num_extent_pages(eb); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 67ce85ff0ae2..02bfdb976e40 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -13,7 +13,7 @@ static struct kmem_cache *extent_map_cache; -int __init extent_map_init(void) +int __init btrfs_extent_map_init(void) { extent_map_cache = kmem_cache_create("btrfs_extent_map", sizeof(struct extent_map), 0, 0, NULL); @@ -22,7 +22,7 @@ int __init extent_map_init(void) return 0; } -void __cold extent_map_exit(void) +void __cold btrfs_extent_map_exit(void) { kmem_cache_destroy(extent_map_cache); } @@ -31,7 +31,7 @@ void __cold extent_map_exit(void) * Initialize the extent tree @tree. Should be called for each new inode or * other user of the extent_map interface. */ -void extent_map_tree_init(struct extent_map_tree *tree) +void btrfs_extent_map_tree_init(struct extent_map_tree *tree) { tree->root = RB_ROOT; INIT_LIST_HEAD(&tree->modified_extents); @@ -42,7 +42,7 @@ void extent_map_tree_init(struct extent_map_tree *tree) * Allocate a new extent_map structure. The new structure is returned with a * reference count of one and needs to be freed using free_extent_map() */ -struct extent_map *alloc_extent_map(void) +struct extent_map *btrfs_alloc_extent_map(void) { struct extent_map *em; em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); @@ -58,12 +58,12 @@ struct extent_map *alloc_extent_map(void) * Drop the reference out on @em by one and free the structure if the reference * count hits zero. */ -void free_extent_map(struct extent_map *em) +void btrfs_free_extent_map(struct extent_map *em) { if (!em) return; if (refcount_dec_and_test(&em->refs)) { - WARN_ON(extent_map_in_tree(em)); + WARN_ON(btrfs_extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); kmem_cache_free(extent_map_cache, em); } @@ -102,19 +102,19 @@ static int tree_insert(struct rb_root *root, struct extent_map *em) if (em->start < entry->start) p = &(*p)->rb_left; - else if (em->start >= extent_map_end(entry)) + else if (em->start >= btrfs_extent_map_end(entry)) p = &(*p)->rb_right; else return -EEXIST; } orig_parent = parent; - while (parent && em->start >= extent_map_end(entry)) { + while (parent && em->start >= btrfs_extent_map_end(entry)) { parent = rb_next(parent); entry = rb_entry(parent, struct extent_map, rb_node); } if (parent) - if (end > entry->start && em->start < extent_map_end(entry)) + if (end > entry->start && em->start < btrfs_extent_map_end(entry)) return -EEXIST; parent = orig_parent; @@ -124,7 +124,7 @@ static int tree_insert(struct rb_root *root, struct extent_map *em) entry = rb_entry(parent, struct extent_map, rb_node); } if (parent) - if (end > entry->start && em->start < extent_map_end(entry)) + if (end > entry->start && em->start < btrfs_extent_map_end(entry)) return -EEXIST; rb_link_node(&em->rb_node, orig_parent, p); @@ -136,8 +136,8 @@ static int tree_insert(struct rb_root *root, struct extent_map *em) * Search through the tree for an extent_map with a given offset. 
If it can't * be found, try to find some neighboring extents */ -static struct rb_node *__tree_search(struct rb_root *root, u64 offset, - struct rb_node **prev_or_next_ret) +static struct rb_node *tree_search(struct rb_root *root, u64 offset, + struct rb_node **prev_or_next_ret) { struct rb_node *n = root->rb_node; struct rb_node *prev = NULL; @@ -154,14 +154,14 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, if (offset < entry->start) n = n->rb_left; - else if (offset >= extent_map_end(entry)) + else if (offset >= btrfs_extent_map_end(entry)) n = n->rb_right; else return n; } orig_prev = prev; - while (prev && offset >= extent_map_end(prev_entry)) { + while (prev && offset >= btrfs_extent_map_end(prev_entry)) { prev = rb_next(prev); prev_entry = rb_entry(prev, struct extent_map, rb_node); } @@ -188,14 +188,14 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, static inline u64 extent_map_block_len(const struct extent_map *em) { - if (extent_map_is_compressed(em)) + if (btrfs_extent_map_is_compressed(em)) return em->disk_num_bytes; return em->len; } static inline u64 extent_map_block_end(const struct extent_map *em) { - const u64 block_start = extent_map_block_start(em); + const u64 block_start = btrfs_extent_map_block_start(em); const u64 block_end = block_start + extent_map_block_len(em); if (block_end < block_start) @@ -210,7 +210,7 @@ static bool can_merge_extent_map(const struct extent_map *em) return false; /* Don't merge compressed extents, we need to know their actual size. */ - if (extent_map_is_compressed(em)) + if (btrfs_extent_map_is_compressed(em)) return false; if (em->flags & EXTENT_FLAG_LOGGING) @@ -230,7 +230,7 @@ static bool can_merge_extent_map(const struct extent_map *em) /* Check to see if two extent_map structs are adjacent and safe to merge. */ static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next) { - if (extent_map_end(prev) != next->start) + if (btrfs_extent_map_end(prev) != next->start) return false; /* @@ -242,7 +242,7 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma return false; if (next->disk_bytenr < EXTENT_MAP_LAST_BYTE - 1) - return extent_map_block_start(next) == extent_map_block_end(prev); + return btrfs_extent_map_block_start(next) == extent_map_block_end(prev); /* HOLES and INLINE extents. */ return next->disk_bytenr == prev->disk_bytenr; @@ -270,8 +270,8 @@ static void merge_ondisk_extents(const struct extent_map *prev, const struct ext u64 new_offset; /* @prev and @next should not be compressed. */ - ASSERT(!extent_map_is_compressed(prev)); - ASSERT(!extent_map_is_compressed(next)); + ASSERT(!btrfs_extent_map_is_compressed(prev)); + ASSERT(!btrfs_extent_map_is_compressed(next)); /* * There are two different cases where @prev and @next can be merged. 
@@ -327,9 +327,9 @@ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map if (em->offset + em->len > em->ram_bytes) dump_extent_map(fs_info, "ram_bytes too small", em); if (em->offset + em->len > em->disk_num_bytes && - !extent_map_is_compressed(em)) + !btrfs_extent_map_is_compressed(em)) dump_extent_map(fs_info, "disk_num_bytes too small", em); - if (!extent_map_is_compressed(em) && + if (!btrfs_extent_map_is_compressed(em) && em->ram_bytes != em->disk_num_bytes) dump_extent_map(fs_info, "ram_bytes mismatch with disk_num_bytes for non-compressed em", @@ -361,8 +361,8 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) if (em->start != 0) { rb = rb_prev(&em->rb_node); - if (rb) - merge = rb_entry(rb, struct extent_map, rb_node); + merge = rb_entry_safe(rb, struct extent_map, rb_node); + if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) { em->start = merge->start; em->len += merge->len; @@ -374,13 +374,13 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) validate_extent_map(fs_info, em); remove_em(inode, merge); - free_extent_map(merge); + btrfs_free_extent_map(merge); } } rb = rb_next(&em->rb_node); - if (rb) - merge = rb_entry(rb, struct extent_map, rb_node); + merge = rb_entry_safe(rb, struct extent_map, rb_node); + if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) { em->len += merge->len; if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) @@ -389,7 +389,7 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) em->generation = max(em->generation, merge->generation); em->flags |= EXTENT_FLAG_MERGED; remove_em(inode, merge); - free_extent_map(merge); + btrfs_free_extent_map(merge); } } @@ -409,7 +409,7 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) * -ENOENT when the extent is not found in the tree * -EUCLEAN if the found extent does not match the expected start */ -int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) +int btrfs_unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map_tree *tree = &inode->extent_tree; @@ -417,7 +417,7 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) struct extent_map *em; write_lock(&tree->lock); - em = lookup_extent_mapping(tree, start, len); + em = btrfs_lookup_extent_mapping(tree, start, len); if (WARN_ON(!em)) { btrfs_warn(fs_info, @@ -444,17 +444,17 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) out: write_unlock(&tree->lock); - free_extent_map(em); + btrfs_free_extent_map(em); return ret; } -void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em) +void btrfs_clear_em_logging(struct btrfs_inode *inode, struct extent_map *em) { lockdep_assert_held_write(&inode->extent_tree.lock); em->flags &= ~EXTENT_FLAG_LOGGING; - if (extent_map_in_tree(em)) + if (btrfs_extent_map_in_tree(em)) try_merge_map(inode, em); } @@ -508,16 +508,15 @@ static int add_extent_mapping(struct btrfs_inode *inode, return 0; } -static struct extent_map * -__lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len, int strict) +static struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len, int strict) { struct extent_map *em; struct rb_node *rb_node; struct rb_node *prev_or_next = NULL; u64 end = range_end(start, len); - rb_node = __tree_search(&tree->root, start, 
&prev_or_next); + rb_node = tree_search(&tree->root, start, &prev_or_next); if (!rb_node) { if (prev_or_next) rb_node = prev_or_next; @@ -527,7 +526,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree, em = rb_entry(rb_node, struct extent_map, rb_node); - if (strict && !(end > em->start && start < extent_map_end(em))) + if (strict && !(end > em->start && start < btrfs_extent_map_end(em))) return NULL; refcount_inc(&em->refs); @@ -546,10 +545,10 @@ __lookup_extent_mapping(struct extent_map_tree *tree, * intersect, so check the object returned carefully to make sure that no * additional lookups are needed. */ -struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len) +struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) { - return __lookup_extent_mapping(tree, start, len, 1); + return lookup_extent_mapping(tree, start, len, 1); } /* @@ -564,10 +563,10 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, * * If one can't be found, any nearby extent may be returned */ -struct extent_map *search_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len) +struct extent_map *btrfs_search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) { - return __lookup_extent_mapping(tree, start, len, 0); + return lookup_extent_mapping(tree, start, len, 0); } /* @@ -579,7 +578,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, * Remove @em from the extent tree of @inode. No reference counts are dropped, * and no checks are done to see if the range is in use. */ -void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em) +void btrfs_remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em) { struct extent_map_tree *tree = &inode->extent_tree; @@ -605,7 +604,7 @@ static void replace_extent_mapping(struct btrfs_inode *inode, validate_extent_map(fs_info, new); WARN_ON(cur->flags & EXTENT_FLAG_PINNED); - ASSERT(extent_map_in_tree(cur)); + ASSERT(btrfs_extent_map_in_tree(cur)); if (!(cur->flags & EXTENT_FLAG_LOGGING)) list_del_init(&cur->list); rb_replace_node(&cur->rb_node, &new->rb_node, &tree->root); @@ -651,7 +650,7 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode, u64 end; u64 start_diff; - if (map_start < em->start || map_start >= extent_map_end(em)) + if (map_start < em->start || map_start >= btrfs_extent_map_end(em)) return -EINVAL; if (existing->start > map_start) { @@ -662,10 +661,10 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode, next = next_extent_map(prev); } - start = prev ? extent_map_end(prev) : em->start; + start = prev ? btrfs_extent_map_end(prev) : em->start; start = max_t(u64, start, em->start); - end = next ? next->start : extent_map_end(em); - end = min_t(u64, end, extent_map_end(em)); + end = next ? next->start : btrfs_extent_map_end(em); + end = min_t(u64, end, btrfs_extent_map_end(em)); start_diff = start - em->start; em->start = start; em->len = end - start; @@ -716,7 +715,7 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode, if (ret == -EEXIST) { struct extent_map *existing; - existing = search_extent_mapping(&inode->extent_tree, start, len); + existing = btrfs_search_extent_mapping(&inode->extent_tree, start, len); trace_btrfs_handle_em_exist(fs_info, existing, em, start, len); @@ -725,8 +724,8 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode, * extent causing the -EEXIST. 
*/ if (start >= existing->start && - start < extent_map_end(existing)) { - free_extent_map(em); + start < btrfs_extent_map_end(existing)) { + btrfs_free_extent_map(em); *em_in = existing; ret = 0; } else { @@ -739,14 +738,14 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode, */ ret = merge_extent_mapping(inode, existing, em, start); if (WARN_ON(ret)) { - free_extent_map(em); + btrfs_free_extent_map(em); *em_in = NULL; btrfs_warn(fs_info, "extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu", - existing->start, extent_map_end(existing), + existing->start, btrfs_extent_map_end(existing), orig_start, orig_start + orig_len, start); } - free_extent_map(existing); + btrfs_free_extent_map(existing); } } @@ -772,8 +771,8 @@ static void drop_all_extent_maps_fast(struct btrfs_inode *inode) em = rb_entry(node, struct extent_map, rb_node); em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING); - remove_extent_mapping(inode, em); - free_extent_map(em); + btrfs_remove_extent_mapping(inode, em); + btrfs_free_extent_map(em); if (cond_resched_rwlock_write(&tree->lock)) node = rb_first(&tree->root); @@ -826,15 +825,15 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, * range ends after our range (and they might be the same extent map), * because we need to split those two extent maps at the boundaries. */ - split = alloc_extent_map(); - split2 = alloc_extent_map(); + split = btrfs_alloc_extent_map(); + split2 = btrfs_alloc_extent_map(); write_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); + em = btrfs_lookup_extent_mapping(em_tree, start, len); while (em) { /* extent_map_end() returns exclusive value (last byte + 1). */ - const u64 em_end = extent_map_end(em); + const u64 em_end = btrfs_extent_map_end(em); struct extent_map *next_em = NULL; u64 gen; unsigned long flags; @@ -898,7 +897,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, split->generation = gen; split->flags = flags; replace_extent_mapping(inode, em, split, modified); - free_extent_map(split); + btrfs_free_extent_map(split); split = split2; split2 = NULL; } @@ -925,7 +924,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, split->ram_bytes = split->len; } - if (extent_map_in_tree(em)) { + if (btrfs_extent_map_in_tree(em)) { replace_extent_mapping(inode, em, split, modified); } else { int ret; @@ -936,11 +935,11 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, if (WARN_ON(ret != 0) && modified) btrfs_set_inode_full_sync(inode); } - free_extent_map(split); + btrfs_free_extent_map(split); split = NULL; } remove_em: - if (extent_map_in_tree(em)) { + if (btrfs_extent_map_in_tree(em)) { /* * If the extent map is still in the tree it means that * either of the following is true: @@ -965,25 +964,25 @@ remove_em: ASSERT(!split); btrfs_set_inode_full_sync(inode); } - remove_extent_mapping(inode, em); + btrfs_remove_extent_mapping(inode, em); } /* * Once for the tree reference (we replaced or removed the * extent map from the tree). */ - free_extent_map(em); + btrfs_free_extent_map(em); next: /* Once for us (for our lookup reference). 
*/ - free_extent_map(em); + btrfs_free_extent_map(em); em = next_em; } write_unlock(&em_tree->lock); - free_extent_map(split); - free_extent_map(split2); + btrfs_free_extent_map(split); + btrfs_free_extent_map(split2); } /* @@ -1007,7 +1006,7 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode, struct extent_map_tree *tree = &inode->extent_tree; int ret; - ASSERT(!extent_map_in_tree(new_em)); + ASSERT(!btrfs_extent_map_in_tree(new_em)); /* * The caller has locked an appropriate file range in the inode's io @@ -1033,8 +1032,8 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode, * * This function is used when an ordered_extent needs to be split. */ -int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, - u64 new_logical) +int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, + u64 new_logical) { struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; @@ -1046,25 +1045,25 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, ASSERT(pre != 0); ASSERT(pre < len); - split_pre = alloc_extent_map(); + split_pre = btrfs_alloc_extent_map(); if (!split_pre) return -ENOMEM; - split_mid = alloc_extent_map(); + split_mid = btrfs_alloc_extent_map(); if (!split_mid) { ret = -ENOMEM; goto out_free_pre; } - lock_extent(&inode->io_tree, start, start + len - 1, NULL); + btrfs_lock_extent(&inode->io_tree, start, start + len - 1, NULL); write_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); + em = btrfs_lookup_extent_mapping(em_tree, start, len); if (!em) { ret = -EIO; goto out_unlock; } ASSERT(em->len == len); - ASSERT(!extent_map_is_compressed(em)); + ASSERT(!btrfs_extent_map_is_compressed(em)); ASSERT(em->disk_bytenr < EXTENT_MAP_LAST_BYTE); ASSERT(em->flags & EXTENT_FLAG_PINNED); ASSERT(!(em->flags & EXTENT_FLAG_LOGGING)); @@ -1093,7 +1092,7 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, /* Insert the middle extent_map. */ split_mid->start = em->start + pre; split_mid->len = em->len - pre; - split_mid->disk_bytenr = extent_map_block_start(em) + pre; + split_mid->disk_bytenr = btrfs_extent_map_block_start(em) + pre; split_mid->disk_num_bytes = split_mid->len; split_mid->offset = 0; split_mid->ram_bytes = split_mid->len; @@ -1102,16 +1101,16 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, add_extent_mapping(inode, split_mid, 1); /* Once for us */ - free_extent_map(em); + btrfs_free_extent_map(em); /* Once for the tree */ - free_extent_map(em); + btrfs_free_extent_map(em); out_unlock: write_unlock(&em_tree->lock); - unlock_extent(&inode->io_tree, start, start + len - 1, NULL); - free_extent_map(split_mid); + btrfs_unlock_extent(&inode->io_tree, start, start + len - 1, NULL); + btrfs_free_extent_map(split_mid); out_free_pre: - free_extent_map(split_pre); + btrfs_free_extent_map(split_pre); return ret; } @@ -1128,6 +1127,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c long nr_dropped = 0; struct rb_node *node; + lockdep_assert_held_write(&tree->lock); + /* * Take the mmap lock so that we serialize with the inode logging phase * of fsync because we may need to set the full sync flag on the inode, @@ -1139,28 +1140,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c * to find new extents, which may not be there yet because ordered * extents haven't completed yet. * - * We also do a try lock because otherwise we could deadlock. 
This is - * because the shrinker for this filesystem may be invoked while we are - * in a path that is holding the mmap lock in write mode. For example in - * a reflink operation while COWing an extent buffer, when allocating - * pages for a new extent buffer and under memory pressure, the shrinker - * may be invoked, and therefore we would deadlock by attempting to read - * lock the mmap lock while we are holding already a write lock on it. + * We also do a try lock because we don't want to block for too long and + * we are holding the extent map tree's lock in write mode. */ if (!down_read_trylock(&inode->i_mmap_lock)) return 0; - /* - * We want to be fast so if the lock is busy we don't want to spend time - * waiting for it - either some task is about to do IO for the inode or - * we may have another task shrinking extent maps, here in this code, so - * skip this inode. - */ - if (!write_trylock(&tree->lock)) { - up_read(&inode->i_mmap_lock); - return 0; - } - node = rb_first(&tree->root); while (node) { struct rb_node *next = rb_next(node); @@ -1182,10 +1167,10 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c if (!list_empty(&em->list) && em->generation >= cur_fs_gen) btrfs_set_inode_full_sync(inode); - remove_extent_mapping(inode, em); + btrfs_remove_extent_mapping(inode, em); trace_btrfs_extent_map_shrinker_remove_em(inode, em); /* Drop the reference for the tree. */ - free_extent_map(em); + btrfs_free_extent_map(em); nr_dropped++; next: if (ctx->scanned >= ctx->nr_to_scan) @@ -1201,12 +1186,61 @@ next: break; node = next; } - write_unlock(&tree->lock); up_read(&inode->i_mmap_lock); return nr_dropped; } +static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root, + u64 min_ino) +{ + struct btrfs_inode *inode; + unsigned long from = min_ino; + + xa_lock(&root->inodes); + while (true) { + struct extent_map_tree *tree; + + inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT); + if (!inode) + break; + + tree = &inode->extent_tree; + + /* + * We want to be fast so if the lock is busy we don't want to + * spend time waiting for it (some task is about to do IO for + * the inode). + */ + if (!write_trylock(&tree->lock)) + goto next; + + /* + * Skip inode if it doesn't have loaded extent maps, so we avoid + * getting a reference and doing an iput later. This includes + * cases like files that were opened for things like stat(2), or + * files with all extent maps previously released through the + * release folio callback (btrfs_release_folio()) or released in + * a previous run, or directories which never have extent maps. 
+ */ + if (RB_EMPTY_ROOT(&tree->root)) { + write_unlock(&tree->lock); + goto next; + } + + if (igrab(&inode->vfs_inode)) + break; + + write_unlock(&tree->lock); +next: + from = btrfs_ino(inode) + 1; + cond_resched_lock(&root->inodes.xa_lock); + } + xa_unlock(&root->inodes); + + return inode; +} + static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -1214,21 +1248,21 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx long nr_dropped = 0; u64 min_ino = fs_info->em_shrinker_last_ino + 1; - inode = btrfs_find_first_inode(root, min_ino); + inode = find_first_inode_to_shrink(root, min_ino); while (inode) { nr_dropped += btrfs_scan_inode(inode, ctx); + write_unlock(&inode->extent_tree.lock); min_ino = btrfs_ino(inode) + 1; fs_info->em_shrinker_last_ino = btrfs_ino(inode); - btrfs_add_delayed_iput(inode); + iput(&inode->vfs_inode); - if (ctx->scanned >= ctx->nr_to_scan || - btrfs_fs_closing(inode->root->fs_info)) + if (ctx->scanned >= ctx->nr_to_scan || btrfs_fs_closing(fs_info)) break; cond_resched(); - inode = btrfs_find_first_inode(root, min_ino); + inode = find_first_inode_to_shrink(root, min_ino); } if (inode) { diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index cd123b266b64..d4b81ee4d97b 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -108,8 +108,8 @@ struct extent_map_tree { struct btrfs_inode; -static inline void extent_map_set_compression(struct extent_map *em, - enum btrfs_compression_type type) +static inline void btrfs_extent_map_set_compression(struct extent_map *em, + enum btrfs_compression_type type) { if (type == BTRFS_COMPRESS_ZLIB) em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; @@ -119,7 +119,8 @@ static inline void extent_map_set_compression(struct extent_map *em, em->flags |= EXTENT_FLAG_COMPRESS_ZSTD; } -static inline enum btrfs_compression_type extent_map_compression(const struct extent_map *em) +static inline enum btrfs_compression_type btrfs_extent_map_compression( + const struct extent_map *em) { if (em->flags & EXTENT_FLAG_COMPRESS_ZLIB) return BTRFS_COMPRESS_ZLIB; @@ -137,50 +138,50 @@ static inline enum btrfs_compression_type extent_map_compression(const struct ex * More efficient way to determine if extent is compressed, instead of using * 'extent_map_compression() != BTRFS_COMPRESS_NONE'. 
*/ -static inline bool extent_map_is_compressed(const struct extent_map *em) +static inline bool btrfs_extent_map_is_compressed(const struct extent_map *em) { return (em->flags & (EXTENT_FLAG_COMPRESS_ZLIB | EXTENT_FLAG_COMPRESS_LZO | EXTENT_FLAG_COMPRESS_ZSTD)) != 0; } -static inline int extent_map_in_tree(const struct extent_map *em) +static inline int btrfs_extent_map_in_tree(const struct extent_map *em) { return !RB_EMPTY_NODE(&em->rb_node); } -static inline u64 extent_map_block_start(const struct extent_map *em) +static inline u64 btrfs_extent_map_block_start(const struct extent_map *em) { if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { - if (extent_map_is_compressed(em)) + if (btrfs_extent_map_is_compressed(em)) return em->disk_bytenr; return em->disk_bytenr + em->offset; } return em->disk_bytenr; } -static inline u64 extent_map_end(const struct extent_map *em) +static inline u64 btrfs_extent_map_end(const struct extent_map *em) { if (em->start + em->len < em->start) return (u64)-1; return em->start + em->len; } -void extent_map_tree_init(struct extent_map_tree *tree); -struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len); -void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em); -int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, - u64 new_logical); - -struct extent_map *alloc_extent_map(void); -void free_extent_map(struct extent_map *em); -int __init extent_map_init(void); -void __cold extent_map_exit(void); -int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen); -void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em); -struct extent_map *search_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len); +void btrfs_extent_map_tree_init(struct extent_map_tree *tree); +struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); +void btrfs_remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em); +int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, + u64 new_logical); + +struct extent_map *btrfs_alloc_extent_map(void); +void btrfs_free_extent_map(struct extent_map *em); +int __init btrfs_extent_map_init(void); +void __cold btrfs_extent_map_exit(void); +int btrfs_unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen); +void btrfs_clear_em_logging(struct btrfs_inode *inode, struct extent_map *em); +struct extent_map *btrfs_search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); int btrfs_add_extent_mapping(struct btrfs_inode *inode, struct extent_map **em_in, u64 start, u64 len); void btrfs_drop_extent_map_range(struct btrfs_inode *inode, diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c index b80c07ad8c5e..43bf0979fd53 100644 --- a/fs/btrfs/fiemap.c +++ b/fs/btrfs/fiemap.c @@ -634,7 +634,7 @@ static int extent_fiemap(struct btrfs_inode *inode, const u64 ino = btrfs_ino(inode); struct extent_state *cached_state = NULL; struct extent_state *delalloc_cached_state = NULL; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct fiemap_cache cache = { 0 }; struct btrfs_backref_share_check_ctx *backref_ctx; u64 last_extent_end = 0; @@ -661,7 +661,7 @@ restart: range_end = round_up(start + len, sectorsize); prev_extent_end = range_start; - lock_extent(&inode->io_tree, range_start, range_end, &cached_state); + btrfs_lock_extent(&inode->io_tree, range_start, range_end, &cached_state); ret = fiemap_find_last_extent_offset(inode, 
path, &last_extent_end); if (ret < 0) @@ -841,7 +841,7 @@ check_eof_delalloc: } out_unlock: - unlock_extent(&inode->io_tree, range_start, range_end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, range_start, range_end, &cached_state); if (ret == BTRFS_FIEMAP_FLUSH_CACHE) { btrfs_release_path(path); @@ -871,10 +871,9 @@ out_unlock: ret = emit_last_fiemap_cache(fieinfo, &cache); out: - free_extent_state(delalloc_cached_state); + btrfs_free_extent_state(delalloc_cached_state); kfree(cache.entries); btrfs_free_backref_share_ctx(backref_ctx); - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index d04a3b47b1fb..54d523d4f421 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -46,7 +46,7 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size) { u64 start, end, i_size; - int ret; + bool found; spin_lock(&inode->lock); i_size = new_i_size ?: i_size_read(&inode->vfs_inode); @@ -55,9 +55,9 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz goto out_unlock; } - ret = find_contiguous_extent_bit(inode->file_extent_tree, 0, &start, - &end, EXTENT_DIRTY); - if (!ret && start == 0) + found = btrfs_find_contiguous_extent_bit(inode->file_extent_tree, 0, &start, + &end, EXTENT_DIRTY); + if (found && start == 0) i_size = min(i_size, end + 1); else i_size = 0; @@ -91,8 +91,8 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize)); - return set_extent_bit(inode->file_extent_tree, start, start + len - 1, - EXTENT_DIRTY, NULL); + return btrfs_set_extent_bit(inode->file_extent_tree, start, start + len - 1, + EXTENT_DIRTY, NULL); } /* @@ -121,8 +121,8 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) || len == (u64)-1); - return clear_extent_bit(inode->file_extent_tree, start, - start + len - 1, EXTENT_DIRTY, NULL); + return btrfs_clear_extent_bit(inode->file_extent_tree, start, + start + len - 1, EXTENT_DIRTY, NULL); } static size_t bytes_to_csum_size(const struct btrfs_fs_info *fs_info, u32 bytes) @@ -163,20 +163,21 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, int ret = 0; struct btrfs_file_extent_item *item; struct btrfs_key file_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; path = btrfs_alloc_path(); if (!path) return -ENOMEM; + file_key.objectid = objectid; - file_key.offset = pos; file_key.type = BTRFS_EXTENT_DATA_KEY; + file_key.offset = pos; ret = btrfs_insert_empty_item(trans, root, path, &file_key, sizeof(*item)); if (ret < 0) - goto out; + return ret; leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -190,8 +191,7 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_compression(leaf, item, 0); btrfs_set_file_extent_encryption(leaf, item, 0); btrfs_set_file_extent_other_encoding(leaf, item, 0); -out: - btrfs_free_path(path); + return ret; } @@ -212,8 +212,8 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, int csums_in_item; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - file_key.offset = bytenr; file_key.type = BTRFS_EXTENT_CSUM_KEY; + file_key.offset = bytenr; ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); if (ret < 0) goto fail; @@ -259,8 +259,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int cow = 
mod != 0; file_key.objectid = objectid; - file_key.offset = offset; file_key.type = BTRFS_EXTENT_DATA_KEY; + file_key.offset = offset; return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); } @@ -336,23 +336,23 @@ out: * * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. */ -blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) +int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio = &bbio->bio; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); const u32 sectorsize = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; u32 orig_len = bio->bi_iter.bi_size; u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; - blk_status_t ret = BLK_STS_OK; + int ret = 0; u32 bio_offset = 0; if ((inode->flags & BTRFS_INODE_NODATASUM) || test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state)) - return BLK_STS_OK; + return 0; /* * This function is only called for read bio. @@ -369,14 +369,12 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) ASSERT(bio_op(bio) == REQ_OP_READ); path = btrfs_alloc_path(); if (!path) - return BLK_STS_RESOURCE; + return -ENOMEM; if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); - if (!bbio->csum) { - btrfs_free_path(path); - return BLK_STS_RESOURCE; - } + if (!bbio->csum) + return -ENOMEM; } else { bbio->csum = bbio->csum_inline; } @@ -408,7 +406,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) count = search_csum_tree(fs_info, path, cur_disk_bytenr, orig_len - bio_offset, csum_dst); if (count < 0) { - ret = errno_to_blk_status(count); + ret = count; if (bbio->csum != bbio->csum_inline) kfree(bbio->csum); bbio->csum = NULL; @@ -432,9 +430,9 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) if (btrfs_root_id(inode->root) == BTRFS_DATA_RELOC_TREE_OBJECTID) { u64 file_offset = bbio->file_offset + bio_offset; - set_extent_bit(&inode->io_tree, file_offset, - file_offset + sectorsize - 1, - EXTENT_NODATASUM, NULL); + btrfs_set_extent_bit(&inode->io_tree, file_offset, + file_offset + sectorsize - 1, + EXTENT_NODATASUM, NULL); } else { btrfs_warn_rl(fs_info, "csum hole found for disk bytenr range [%llu, %llu)", @@ -444,7 +442,6 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) bio_offset += count * sectorsize; } - btrfs_free_path(path); return ret; } @@ -484,8 +481,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, path->nowait = nowait; key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key.offset = start; key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = start; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -738,7 +735,7 @@ fail: /* * Calculate checksums of the data contained inside a bio. */ -blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) +int btrfs_csum_one_bio(struct btrfs_bio *bbio) { struct btrfs_ordered_extent *ordered = bbio->ordered; struct btrfs_inode *inode = bbio->inode; @@ -760,7 +757,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) memalloc_nofs_restore(nofs_flag); if (!sums) - return BLK_STS_RESOURCE; + return -ENOMEM; sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); @@ -797,11 +794,11 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) * record the updated logical address on Zone Append completion. 
* Allocate just the structure with an empty sums array here for that case. */ -blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio) +int btrfs_alloc_dummy_sum(struct btrfs_bio *bbio) { bbio->sums = kmalloc(sizeof(*bbio->sums), GFP_NOFS); if (!bbio->sums) - return BLK_STS_RESOURCE; + return -ENOMEM; bbio->sums->len = bbio->bio.bi_iter.bi_size; bbio->sums->logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; btrfs_add_ordered_sum(bbio->ordered, bbio->sums); @@ -874,7 +871,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; u64 end_byte = bytenr + len; u64 csum_end; @@ -892,8 +889,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, while (1) { key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key.offset = end_byte - 1; key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = end_byte - 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { @@ -1010,7 +1007,6 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, } btrfs_release_path(path); } - btrfs_free_path(path); return ret; } @@ -1052,7 +1048,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key file_key; struct btrfs_key found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_csum_item *item; struct btrfs_csum_item *item_end; struct extent_buffer *leaf = NULL; @@ -1074,8 +1070,8 @@ again: found_next = 0; bytenr = sums->logical + total_bytes; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - file_key.offset = bytenr; file_key.type = BTRFS_EXTENT_CSUM_KEY; + file_key.offset = bytenr; item = btrfs_lookup_csum(trans, root, path, bytenr, 1); if (!IS_ERR(item)) { @@ -1263,7 +1259,6 @@ found: goto again; } out: - btrfs_free_path(path); return ret; } @@ -1301,7 +1296,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, em->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); em->offset = btrfs_file_extent_offset(leaf, fi); if (compress_type != BTRFS_COMPRESS_NONE) { - extent_map_set_compression(em, compress_type); + btrfs_extent_map_set_compression(em, compress_type); } else { /* * Older kernels can create regular non-hole data @@ -1321,7 +1316,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, em->start = 0; em->len = fs_info->sectorsize; em->offset = 0; - extent_map_set_compression(em, compress_type); + btrfs_extent_map_set_compression(em, compress_type); } else { btrfs_err(fs_info, "unknown file extent item type %d, inode %llu, offset %llu, " diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 0e13661a71f3..63216c43676d 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -3,8 +3,10 @@ #ifndef BTRFS_FILE_ITEM_H #define BTRFS_FILE_ITEM_H +#include <linux/blk_types.h> #include <linux/list.h> #include <uapi/linux/btrfs_tree.h> +#include "ctree.h" #include "accessors.h" struct extent_map; @@ -51,7 +53,7 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); -blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio); +int btrfs_lookup_bio_sums(struct btrfs_bio *bbio); int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, u64 num_bytes); @@ -62,8 +64,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle 
*trans, int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); -blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio); -blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio); +int btrfs_csum_one_bio(struct btrfs_bio *bbio); +int btrfs_alloc_dummy_sum(struct btrfs_bio *bbio); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, bool nowait); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ed3c0d6546c5..8ce6f45f45e0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -98,9 +98,9 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos * The pages may have already been dirty, clear out old accounting so * we can set things up properly */ - clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - cached); + btrfs_clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + cached); ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, extra_bits, cached); @@ -508,20 +508,19 @@ out: return ret; } -static int extent_mergeable(struct extent_buffer *leaf, int slot, - u64 objectid, u64 bytenr, u64 orig_offset, - u64 *start, u64 *end) +static bool extent_mergeable(struct extent_buffer *leaf, int slot, u64 objectid, + u64 bytenr, u64 orig_offset, u64 *start, u64 *end) { struct btrfs_file_extent_item *fi; struct btrfs_key key; u64 extent_end; if (slot < 0 || slot >= btrfs_header_nritems(leaf)) - return 0; + return false; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY) - return 0; + return false; fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || @@ -530,15 +529,15 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot, btrfs_file_extent_compression(leaf, fi) || btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) - return 0; + return false; extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); if ((*start && *start != key.offset) || (*end && *end != extent_end)) - return 0; + return false; *start = key.offset; *end = extent_end; - return 1; + return true; } /* @@ -553,7 +552,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; struct extent_buffer *leaf; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_file_extent_item *fi; struct btrfs_ref ref = { 0 }; struct btrfs_key key; @@ -791,7 +790,6 @@ again: } } out: - btrfs_free_path(path); return ret; } @@ -800,18 +798,18 @@ out: * On success return a locked folio and 0 */ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos, - u64 len, bool force_uptodate) + u64 len) { u64 clamp_start = max_t(u64, pos, folio_pos(folio)); u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio)); + const u32 blocksize = inode_to_fs_info(inode)->sectorsize; int ret = 0; if (folio_test_uptodate(folio)) return 0; - if (!force_uptodate && - IS_ALIGNED(clamp_start, PAGE_SIZE) && - IS_ALIGNED(clamp_end, PAGE_SIZE)) + if (IS_ALIGNED(clamp_start, blocksize) && + IS_ALIGNED(clamp_end, blocksize)) return 0; ret = btrfs_read_folio(NULL, folio); @@ -857,33 +855,27 @@ static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) */ static noinline int 
prepare_one_folio(struct inode *inode, struct folio **folio_ret, loff_t pos, size_t write_bytes, - bool force_uptodate, bool nowait) + bool nowait) { unsigned long index = pos >> PAGE_SHIFT; gfp_t mask = get_prepare_gfp_flags(inode, nowait); - fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN); + fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) | + fgf_set_order(write_bytes); struct folio *folio; int ret = 0; again: folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask); - if (IS_ERR(folio)) { - if (nowait) - ret = -EAGAIN; - else - ret = PTR_ERR(folio); - return ret; - } - folio_wait_writeback(folio); - /* Only support page sized folio yet. */ - ASSERT(folio_order(folio) == 0); + if (IS_ERR(folio)) + return PTR_ERR(folio); + ret = set_folio_extent_mapped(folio); if (ret < 0) { folio_unlock(folio); folio_put(folio); return ret; } - ret = prepare_uptodate_folio(inode, folio, pos, write_bytes, force_uptodate); + ret = prepare_uptodate_folio(inode, folio, pos, write_bytes); if (ret) { /* The folio is already unlocked. */ folio_put(folio); @@ -924,14 +916,15 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, struct btrfs_ordered_extent *ordered; if (nowait) { - if (!try_lock_extent(&inode->io_tree, start_pos, last_pos, - cached_state)) { + if (!btrfs_try_lock_extent(&inode->io_tree, start_pos, + last_pos, cached_state)) { folio_unlock(folio); folio_put(folio); return -EAGAIN; } } else { - lock_extent(&inode->io_tree, start_pos, last_pos, cached_state); + btrfs_lock_extent(&inode->io_tree, start_pos, last_pos, + cached_state); } ordered = btrfs_lookup_ordered_range(inode, start_pos, @@ -939,8 +932,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, if (ordered && ordered->file_offset + ordered->num_bytes > start_pos && ordered->file_offset <= last_pos) { - unlock_extent(&inode->io_tree, start_pos, last_pos, - cached_state); + btrfs_unlock_extent(&inode->io_tree, start_pos, last_pos, + cached_state); folio_unlock(folio); folio_put(folio); btrfs_start_ordered_extent(ordered); @@ -1014,14 +1007,13 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, &cached_state); } - ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, - NULL, nowait); + ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, nowait); if (ret <= 0) btrfs_drew_write_unlock(&root->snapshot_lock); else *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); - unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); return ret; } @@ -1078,234 +1070,306 @@ int btrfs_write_check(struct kiocb *iocb, size_t count) return 0; } -ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) +static void release_space(struct btrfs_inode *inode, struct extent_changeset *data_reserved, + u64 start, u64 len, bool only_release_metadata) { - struct file *file = iocb->ki_filp; - loff_t pos; - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct extent_changeset *data_reserved = NULL; - u64 release_bytes = 0; - u64 lockstart; - u64 lockend; - size_t num_written = 0; - ssize_t ret; - loff_t old_isize = i_size_read(inode); - unsigned int ilock_flags = 0; - const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); - unsigned int bdp_flags = (nowait ? 
BDP_ASYNC : 0); - bool only_release_metadata = false; + if (len == 0) + return; - if (nowait) - ilock_flags |= BTRFS_ILOCK_TRY; + if (only_release_metadata) { + btrfs_check_nocow_unlock(inode); + btrfs_delalloc_release_metadata(inode, len, true); + } else { + const struct btrfs_fs_info *fs_info = inode->root->fs_info; - ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); - if (ret < 0) - return ret; + btrfs_delalloc_release_space(inode, data_reserved, + round_down(start, fs_info->sectorsize), + len, true); + } +} - ret = generic_write_checks(iocb, i); - if (ret <= 0) - goto out; +/* + * Reserve data and metadata space for this buffered write range. + * + * Return >0 for the number of bytes reserved, which is always block aligned. + * Return <0 for error. + */ +static ssize_t reserve_space(struct btrfs_inode *inode, + struct extent_changeset **data_reserved, + u64 start, size_t *len, bool nowait, + bool *only_release_metadata) +{ + const struct btrfs_fs_info *fs_info = inode->root->fs_info; + const unsigned int block_offset = (start & (fs_info->sectorsize - 1)); + size_t reserve_bytes; + int ret; - ret = btrfs_write_check(iocb, ret); - if (ret < 0) - goto out; + ret = btrfs_check_data_free_space(inode, data_reserved, start, *len, nowait); + if (ret < 0) { + int can_nocow; - pos = iocb->ki_pos; - while (iov_iter_count(i) > 0) { - struct extent_state *cached_state = NULL; - size_t offset = offset_in_page(pos); - size_t sector_offset; - size_t write_bytes = min(iov_iter_count(i), PAGE_SIZE - offset); - size_t reserve_bytes; - size_t copied; - size_t dirty_sectors; - size_t num_sectors; - struct folio *folio = NULL; - int extents_locked; - bool force_page_uptodate = false; + if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) + return -EAGAIN; /* - * Fault pages before locking them in prepare_one_folio() - * to avoid recursive lock + * If we don't have to COW at the offset, reserve metadata only. + * write_bytes may get smaller than requested here. */ - if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) { - ret = -EFAULT; - break; - } + can_nocow = btrfs_check_nocow_lock(inode, start, len, nowait); + if (can_nocow < 0) + ret = can_nocow; + if (can_nocow > 0) + ret = 0; + if (ret) + return ret; + *only_release_metadata = true; + } - only_release_metadata = false; - sector_offset = pos & (fs_info->sectorsize - 1); + reserve_bytes = round_up(*len + block_offset, fs_info->sectorsize); + WARN_ON(reserve_bytes == 0); + ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes, + reserve_bytes, nowait); + if (ret) { + if (!*only_release_metadata) + btrfs_free_reserved_data_space(inode, *data_reserved, + start, *len); + else + btrfs_check_nocow_unlock(inode); - extent_changeset_release(data_reserved); - ret = btrfs_check_data_free_space(BTRFS_I(inode), - &data_reserved, pos, - write_bytes, nowait); - if (ret < 0) { - int can_nocow; + if (nowait && ret == -ENOSPC) + ret = -EAGAIN; + return ret; + } + return reserve_bytes; +} - if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) { - ret = -EAGAIN; - break; - } +/* Shrink the reserved data and metadata space from @reserved_len to @new_len. */ +static void shrink_reserved_space(struct btrfs_inode *inode, + struct extent_changeset *data_reserved, + u64 reserved_start, u64 reserved_len, + u64 new_len, bool only_release_metadata) +{ + const u64 diff = reserved_len - new_len; - /* - * If we don't have to COW at the offset, reserve - * metadata only. write_bytes may get smaller than - * requested here. 
- */ - can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos, - &write_bytes, nowait); - if (can_nocow < 0) - ret = can_nocow; - if (can_nocow > 0) - ret = 0; - if (ret) - break; - only_release_metadata = true; - } + ASSERT(new_len <= reserved_len); + btrfs_delalloc_shrink_extents(inode, reserved_len, new_len); + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, diff, true); + else + btrfs_delalloc_release_space(inode, data_reserved, + reserved_start + new_len, diff, true); +} - reserve_bytes = round_up(write_bytes + sector_offset, - fs_info->sectorsize); - WARN_ON(reserve_bytes == 0); - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), - reserve_bytes, - reserve_bytes, nowait); - if (ret) { - if (!only_release_metadata) - btrfs_free_reserved_data_space(BTRFS_I(inode), - data_reserved, pos, - write_bytes); - else - btrfs_check_nocow_unlock(BTRFS_I(inode)); +/* Calculate the maximum amount of bytes we can write into one folio. */ +static size_t calc_write_bytes(const struct btrfs_inode *inode, + const struct iov_iter *iter, u64 start) +{ + const size_t max_folio_size = mapping_max_folio_size(inode->vfs_inode.i_mapping); - if (nowait && ret == -ENOSPC) - ret = -EAGAIN; - break; - } + return min(max_folio_size - (start & (max_folio_size - 1)), + iov_iter_count(iter)); +} + +/* + * Do the heavy-lifting work to copy one range into one folio of the page cache. + * + * Return > 0 in case we copied all bytes or just some of them. + * Return 0 if no bytes were copied, in which case the caller should retry. + * Return <0 on error. + */ +static int copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter, + struct extent_changeset **data_reserved, u64 start, + bool nowait) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_state *cached_state = NULL; + size_t write_bytes = calc_write_bytes(inode, iter, start); + size_t copied; + const u64 reserved_start = round_down(start, fs_info->sectorsize); + u64 reserved_len; + struct folio *folio = NULL; + int extents_locked; + u64 lockstart; + u64 lockend; + bool only_release_metadata = false; + const unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); + int ret; + + /* + * Fault all pages before locking them in prepare_one_folio() to avoid + * recursive lock. + */ + if (unlikely(fault_in_iov_iter_readable(iter, write_bytes))) + return -EFAULT; + extent_changeset_release(*data_reserved); + ret = reserve_space(inode, data_reserved, start, &write_bytes, nowait, + &only_release_metadata); + if (ret < 0) + return ret; + reserved_len = ret; + /* Write range must be inside the reserved range. 
*/ + ASSERT(reserved_start <= start); + ASSERT(start + write_bytes <= reserved_start + reserved_len); - release_bytes = reserve_bytes; again: - ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags); - if (ret) { - btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); - break; - } + ret = balance_dirty_pages_ratelimited_flags(inode->vfs_inode.i_mapping, + bdp_flags); + if (ret) { + btrfs_delalloc_release_extents(inode, reserved_len); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + return ret; + } - ret = prepare_one_folio(inode, &folio, pos, write_bytes, - force_page_uptodate, false); - if (ret) { - btrfs_delalloc_release_extents(BTRFS_I(inode), - reserve_bytes); - break; - } + ret = prepare_one_folio(&inode->vfs_inode, &folio, start, write_bytes, false); + if (ret) { + btrfs_delalloc_release_extents(inode, reserved_len); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + return ret; + } - extents_locked = lock_and_cleanup_extent_if_need(BTRFS_I(inode), - folio, pos, write_bytes, &lockstart, - &lockend, nowait, &cached_state); - if (extents_locked < 0) { - if (!nowait && extents_locked == -EAGAIN) - goto again; + /* + * The reserved range goes beyond the current folio, shrink the reserved + * space to the folio boundary. + */ + if (reserved_start + reserved_len > folio_pos(folio) + folio_size(folio)) { + const u64 last_block = folio_pos(folio) + folio_size(folio); + + shrink_reserved_space(inode, *data_reserved, reserved_start, + reserved_len, last_block - reserved_start, + only_release_metadata); + write_bytes = last_block - start; + reserved_len = last_block - reserved_start; + } + + extents_locked = lock_and_cleanup_extent_if_need(inode, folio, start, + write_bytes, &lockstart, + &lockend, nowait, + &cached_state); + if (extents_locked < 0) { + if (!nowait && extents_locked == -EAGAIN) + goto again; - btrfs_delalloc_release_extents(BTRFS_I(inode), - reserve_bytes); - ret = extents_locked; - break; - } + btrfs_delalloc_release_extents(inode, reserved_len); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + ret = extents_locked; + return ret; + } + + copied = copy_folio_from_iter_atomic(folio, offset_in_folio(folio, start), + write_bytes, iter); + flush_dcache_folio(folio); - copied = copy_folio_from_iter_atomic(folio, - offset_in_folio(folio, pos), write_bytes, i); - flush_dcache_folio(folio); + if (unlikely(copied < write_bytes)) { + u64 last_block; /* - * If we get a partial write, we can end up with partially - * uptodate page. Although if sector size < page size we can - * handle it, but if it's not sector aligned it can cause - * a lot of complexity, so make sure they don't happen by - * forcing retry this copy. + * The original write range doesn't need an uptodate folio as + * the range is block aligned. But now a short copy happened. + * We cannot handle it without an uptodate folio. + * + * So just revert the range and we will retry. */ - if (unlikely(copied < write_bytes)) { - if (!folio_test_uptodate(folio)) { - iov_iter_revert(i, copied); - copied = 0; - } + if (!folio_test_uptodate(folio)) { + iov_iter_revert(iter, copied); + copied = 0; } - num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes); - dirty_sectors = round_up(copied + sector_offset, - fs_info->sectorsize); - dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors); - + /* No copied bytes, unlock, release reserved space and exit. 
*/ if (copied == 0) { - force_page_uptodate = true; - dirty_sectors = 0; - } else { - force_page_uptodate = false; + if (extents_locked) + btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, + &cached_state); + else + btrfs_free_extent_state(cached_state); + btrfs_delalloc_release_extents(inode, reserved_len); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + btrfs_drop_folio(fs_info, folio, start, copied); + return 0; } - if (num_sectors > dirty_sectors) { - /* release everything except the sectors we dirtied */ - release_bytes -= dirty_sectors << fs_info->sectorsize_bits; - if (only_release_metadata) { - btrfs_delalloc_release_metadata(BTRFS_I(inode), - release_bytes, true); - } else { - u64 release_start = round_up(pos + copied, - fs_info->sectorsize); - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, release_start, - release_bytes, true); - } - } + /* Release the reserved space beyond the last block. */ + last_block = round_up(start + copied, fs_info->sectorsize); - release_bytes = round_up(copied + sector_offset, - fs_info->sectorsize); + shrink_reserved_space(inode, *data_reserved, reserved_start, + reserved_len, last_block - reserved_start, + only_release_metadata); + reserved_len = last_block - reserved_start; + } - ret = btrfs_dirty_folio(BTRFS_I(inode), folio, pos, copied, - &cached_state, only_release_metadata); + ret = btrfs_dirty_folio(inode, folio, start, copied, &cached_state, + only_release_metadata); + /* + * If we have not locked the extent range, because the range's start + * offset is >= i_size, we might still have a non-NULL cached extent + * state, acquired while marking the extent range as delalloc through + * btrfs_dirty_page(). Therefore free any possible cached extent state + * to avoid a memory leak. + */ + if (extents_locked) + btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + else + btrfs_free_extent_state(cached_state); - /* - * If we have not locked the extent range, because the range's - * start offset is >= i_size, we might still have a non-NULL - * cached extent state, acquired while marking the extent range - * as delalloc through btrfs_dirty_page(). Therefore free any - * possible cached extent state to avoid a memory leak. 
- */ - if (extents_locked) - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state); - else - free_extent_state(cached_state); + btrfs_delalloc_release_extents(inode, reserved_len); + if (ret) { + btrfs_drop_folio(fs_info, folio, start, copied); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + return ret; + } + if (only_release_metadata) + btrfs_check_nocow_unlock(inode); - btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); - if (ret) { - btrfs_drop_folio(fs_info, folio, pos, copied); - break; - } + btrfs_drop_folio(fs_info, folio, start, copied); + return copied; +} - release_bytes = 0; - if (only_release_metadata) - btrfs_check_nocow_unlock(BTRFS_I(inode)); +ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + loff_t pos; + struct inode *inode = file_inode(file); + struct extent_changeset *data_reserved = NULL; + size_t num_written = 0; + ssize_t ret; + loff_t old_isize; + unsigned int ilock_flags = 0; + const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); - btrfs_drop_folio(fs_info, folio, pos, copied); + if (nowait) + ilock_flags |= BTRFS_ILOCK_TRY; - cond_resched(); + ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); + if (ret < 0) + return ret; - pos += copied; - num_written += copied; - } + /* + * We can only trust the isize with inode lock held, or it can race with + * other buffered writes and cause incorrect call of + * pagecache_isize_extended() to overwrite existing data. + */ + old_isize = i_size_read(inode); - if (release_bytes) { - if (only_release_metadata) { - btrfs_check_nocow_unlock(BTRFS_I(inode)); - btrfs_delalloc_release_metadata(BTRFS_I(inode), - release_bytes, true); - } else { - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, - round_down(pos, fs_info->sectorsize), - release_bytes, true); - } + ret = generic_write_checks(iocb, iter); + if (ret <= 0) + goto out; + + ret = btrfs_write_check(iocb, ret); + if (ret < 0) + goto out; + + pos = iocb->ki_pos; + while (iov_iter_count(iter) > 0) { + ret = copy_one_range(BTRFS_I(inode), iter, &data_reserved, pos, nowait); + if (ret < 0) + break; + pos += ret; + num_written += ret; + cond_resched(); } extent_changeset_free(data_reserved); @@ -1400,7 +1464,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp) if (private) { kfree(private->filldir_buf); - free_extent_state(private->llseek_cached_state); + btrfs_free_extent_state(private->llseek_cached_state); kfree(private); filp->private_data = NULL; } @@ -1776,17 +1840,14 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) struct extent_changeset *data_reserved = NULL; unsigned long zero_start; loff_t size; - vm_fault_t ret; - int ret2; - int reserved = 0; + size_t fsize = folio_size(folio); + int ret; u64 reserved_space; u64 page_start; u64 page_end; u64 end; - ASSERT(folio_order(folio) == 0); - - reserved_space = PAGE_SIZE; + reserved_space = fsize; sb_start_pagefault(inode->i_sb); page_start = folio_pos(folio); @@ -1801,21 +1862,14 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. 
*/ - ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - page_start, reserved_space); - if (!ret2) { - ret2 = file_update_time(vmf->vma->vm_file); - reserved = 1; - } - if (ret2) { - ret = vmf_error(ret2); - if (reserved) - goto out; + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, + page_start, reserved_space); + if (ret < 0) goto out_noreserve; - } - /* Make the VM retry the fault. */ - ret = VM_FAULT_NOPAGE; + ret = file_update_time(vmf->vma->vm_file); + if (ret < 0) + goto out; again: down_read(&BTRFS_I(inode)->i_mmap_lock); folio_lock(folio); @@ -1828,11 +1882,10 @@ again: } folio_wait_writeback(folio); - lock_extent(io_tree, page_start, page_end, &cached_state); - ret2 = set_folio_extent_mapped(folio); - if (ret2 < 0) { - ret = vmf_error(ret2); - unlock_extent(io_tree, page_start, page_end, &cached_state); + btrfs_lock_extent(io_tree, page_start, page_end, &cached_state); + ret = set_folio_extent_mapped(folio); + if (ret < 0) { + btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); goto out_unlock; } @@ -1840,9 +1893,9 @@ again: * We can't set the delalloc bits if there are pending ordered * extents. Drop our locks and wait for them to finish. */ - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, fsize); if (ordered) { - unlock_extent(io_tree, page_start, page_end, &cached_state); + btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); folio_unlock(folio); up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_start_ordered_extent(ordered); @@ -1850,13 +1903,13 @@ again: goto again; } - if (folio->index == ((size - 1) >> PAGE_SHIFT)) { + if (folio_contains(folio, (size - 1) >> PAGE_SHIFT)) { reserved_space = round_up(size - page_start, fs_info->sectorsize); - if (reserved_space < PAGE_SIZE) { + if (reserved_space < fsize) { end = page_start + reserved_space - 1; btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, page_start, - PAGE_SIZE - reserved_space, true); + data_reserved, end + 1, + fsize - reserved_space, true); } } @@ -1867,15 +1920,14 @@ again: * clear any delalloc bits within this page range since we have to * reserve data&meta space before lock_page() (see above comments). 
*/ - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, &cached_state); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, &cached_state); - ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, &cached_state); - if (ret2) { - unlock_extent(io_tree, page_start, page_end, &cached_state); - ret = VM_FAULT_SIGBUS; + if (ret < 0) { + btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); goto out_unlock; } @@ -1883,21 +1935,21 @@ again: if (page_start + folio_size(folio) > size) zero_start = offset_in_folio(folio, size); else - zero_start = PAGE_SIZE; + zero_start = fsize; - if (zero_start != PAGE_SIZE) + if (zero_start != fsize) folio_zero_range(folio, zero_start, folio_size(folio) - zero_start); - btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); + btrfs_folio_clear_checked(fs_info, folio, page_start, fsize); btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); - unlock_extent(io_tree, page_start, page_end, &cached_state); + btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); up_read(&BTRFS_I(inode)->i_mmap_lock); - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; @@ -1906,13 +1958,18 @@ out_unlock: folio_unlock(folio); up_read(&BTRFS_I(inode)->i_mmap_lock); out: - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, - reserved_space, (ret != 0)); + reserved_space, true); + extent_changeset_free(data_reserved); out_noreserve: sb_end_pagefault(inode->i_sb); - extent_changeset_free(data_reserved); - return ret; + + if (ret < 0) + return vmf_error(ret); + + /* Make the VM retry the fault. 
*/ + return VM_FAULT_NOPAGE; } static const struct vm_operations_struct btrfs_file_vm_ops = { @@ -1934,33 +1991,33 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) return 0; } -static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf, - int slot, u64 start, u64 end) +static bool hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf, + int slot, u64 start, u64 end) { struct btrfs_file_extent_item *fi; struct btrfs_key key; if (slot < 0 || slot >= btrfs_header_nritems(leaf)) - return 0; + return false; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) - return 0; + return false; fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) - return 0; + return false; if (btrfs_file_extent_disk_bytenr(leaf, fi)) - return 0; + return false; if (key.offset == end) - return 1; + return true; if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start) - return 1; - return 0; + return true; + return false; } static int fill_holes(struct btrfs_trans_handle *trans, @@ -2034,7 +2091,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); - hole_em = alloc_extent_map(); + hole_em = btrfs_alloc_extent_map(); if (!hole_em) { btrfs_drop_extent_map_range(inode, offset, end - 1, false); btrfs_set_inode_full_sync(inode); @@ -2048,7 +2105,7 @@ out: hole_em->generation = trans->transid; ret = btrfs_replace_extent_map_range(inode, hole_em, true); - free_extent_map(hole_em); + btrfs_free_extent_map(hole_em); if (ret) btrfs_set_inode_full_sync(inode); } @@ -2081,15 +2138,33 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) 0 : *start + *len - em->start - em->len; *start = em->start + em->len; } - free_extent_map(em); + btrfs_free_extent_map(em); return ret; } -static void btrfs_punch_hole_lock_range(struct inode *inode, - const u64 lockstart, - const u64 lockend, - struct extent_state **cached_state) +/* + * Check if there is no folio in the range. + * + * We cannot utilize filemap_range_has_page() in a filemap with large folios + * as we can hit the following false positive: + * + * start end + * | | + * |//|//|//|//| | | | | | | | |//|//| + * \ / \ / + * Folio A Folio B + * + * That large folio A and B cover the start and end indexes. + * In that case filemap_range_has_page() will always return true, but the above + * case is fine for btrfs_punch_hole_lock_range() usage. + * + * So here we only ensure that no other folios is in the range, excluding the + * head/tail large folio. + */ +static bool check_range_has_page(struct inode *inode, u64 start, u64 end) { + struct folio_batch fbatch; + bool ret = false; /* * For subpage case, if the range is not at page boundary, we could * have pages at the leading/tailing part of the range. @@ -2097,15 +2172,48 @@ static void btrfs_punch_hole_lock_range(struct inode *inode, * will always return true. * So here we need to do extra page alignment for * filemap_range_has_page(). + * + * And do not decrease page_lockend right now, as it can be 0. 
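+	 * (Editor's illustrative note, not part of the original patch: page_lockend
+	 * is therefore kept as an exclusive end offset, and the
+	 * "page_lockend <= page_lockstart" check below catches ranges that fit
+	 * within a single page without having to subtract 1 first.)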
*/ - const u64 page_lockstart = round_up(lockstart, PAGE_SIZE); - const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1; + const u64 page_lockstart = round_up(start, PAGE_SIZE); + const u64 page_lockend = round_down(end + 1, PAGE_SIZE); + const pgoff_t start_index = page_lockstart >> PAGE_SHIFT; + const pgoff_t end_index = (page_lockend - 1) >> PAGE_SHIFT; + pgoff_t tmp = start_index; + int found_folios; + + /* The same page or adjacent pages. */ + if (page_lockend <= page_lockstart) + return false; + + folio_batch_init(&fbatch); + found_folios = filemap_get_folios(inode->i_mapping, &tmp, end_index, &fbatch); + for (int i = 0; i < found_folios; i++) { + struct folio *folio = fbatch.folios[i]; + + /* A large folio begins before the start. Not a target. */ + if (folio->index < start_index) + continue; + /* A large folio extends beyond the end. Not a target. */ + if (folio->index + folio_nr_pages(folio) > end_index) + continue; + /* A folio doesn't cover the head/tail index. Found a target. */ + ret = true; + break; + } + folio_batch_release(&fbatch); + return ret; +} +static void btrfs_punch_hole_lock_range(struct inode *inode, + const u64 lockstart, const u64 lockend, + struct extent_state **cached_state) +{ while (1) { truncate_pagecache_range(inode, lockstart, lockend); - lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - cached_state); + btrfs_lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state); /* * We can't have ordered extents in the range, nor dirty/writeback * pages, because we have locked the inode's VFS lock in exclusive @@ -2116,12 +2224,11 @@ static void btrfs_punch_hole_lock_range(struct inode *inode, * locking the range check if we have pages in the range, and if * we do, unlock the range and retry. */ - if (!filemap_range_has_page(inode->i_mapping, page_lockstart, - page_lockend)) + if (!check_range_has_page(inode, lockstart, lockend)) break; - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state); } btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend); @@ -2494,7 +2601,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) u64 lockend; u64 tail_start; u64 tail_len; - u64 orig_start = offset; + const u64 orig_start = offset; + const u64 orig_end = offset + len - 1; int ret = 0; bool same_block; u64 ino_size; @@ -2526,18 +2634,14 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset)) == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)); /* - * We needn't truncate any block which is beyond the end of the file - * because we are sure there is no data there. - */ - /* * Only do this if we are in the same block and we aren't doing the * entire block. 
*/ if (same_block && len < fs_info->sectorsize) { if (offset < ino_size) { truncated_block = true; - ret = btrfs_truncate_block(BTRFS_I(inode), offset, len, - 0); + ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, + orig_start, orig_end); } else { ret = 0; } @@ -2547,7 +2651,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) /* zero back part of the first block */ if (offset < ino_size) { truncated_block = true; - ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), offset, orig_start, orig_end); if (ret) { btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; @@ -2584,8 +2688,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) if (tail_start + tail_len < ino_size) { truncated_block = true; ret = btrfs_truncate_block(BTRFS_I(inode), - tail_start + tail_len, - 0, 1); + tail_start + tail_len - 1, + orig_start, orig_end); if (ret) goto out_only_mutex; } @@ -2619,8 +2723,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out: - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); out_only_mutex: if (!updated_inode && truncated_block && !ret) { /* @@ -2738,7 +2842,7 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, else ret = RANGE_BOUNDARY_WRITTEN_EXTENT; - free_extent_map(em); + btrfs_free_extent_map(em); return ret; } @@ -2753,6 +2857,8 @@ static int btrfs_zero_range(struct inode *inode, int ret; u64 alloc_hint = 0; const u64 sectorsize = fs_info->sectorsize; + const u64 orig_start = offset; + const u64 orig_end = offset + len - 1; u64 alloc_start = round_down(offset, sectorsize); u64 alloc_end = round_up(offset + len, sectorsize); u64 bytes_to_reserve = 0; @@ -2782,7 +2888,7 @@ static int btrfs_zero_range(struct inode *inode, * do nothing except updating the inode's i_size if * needed. 
*/ - free_extent_map(em); + btrfs_free_extent_map(em); ret = btrfs_fallocate_update_isize(inode, offset + len, mode); goto out; @@ -2795,9 +2901,9 @@ static int btrfs_zero_range(struct inode *inode, ASSERT(IS_ALIGNED(alloc_start, sectorsize)); len = offset + len - alloc_start; offset = alloc_start; - alloc_hint = extent_map_block_start(em) + em->len; + alloc_hint = btrfs_extent_map_block_start(em) + em->len; } - free_extent_map(em); + btrfs_free_extent_map(em); if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { @@ -2808,22 +2914,22 @@ static int btrfs_zero_range(struct inode *inode, } if (em->flags & EXTENT_FLAG_PREALLOC) { - free_extent_map(em); + btrfs_free_extent_map(em); ret = btrfs_fallocate_update_isize(inode, offset + len, mode); goto out; } if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) { - free_extent_map(em); - ret = btrfs_truncate_block(BTRFS_I(inode), offset, len, - 0); + btrfs_free_extent_map(em); + ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, + orig_start, orig_end); if (!ret) ret = btrfs_fallocate_update_isize(inode, offset + len, mode); return ret; } - free_extent_map(em); + btrfs_free_extent_map(em); alloc_start = round_down(offset, sectorsize); alloc_end = alloc_start + sectorsize; goto reserve_space; @@ -2847,7 +2953,8 @@ static int btrfs_zero_range(struct inode *inode, alloc_start = round_down(offset, sectorsize); ret = 0; } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { - ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), offset, + orig_start, orig_end); if (ret) goto out; } else { @@ -2864,8 +2971,8 @@ static int btrfs_zero_range(struct inode *inode, alloc_end = round_up(offset + len, sectorsize); ret = 0; } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { - ret = btrfs_truncate_block(BTRFS_I(inode), offset + len, - 0, 1); + ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, + orig_start, orig_end); if (ret) goto out; } else { @@ -2890,16 +2997,16 @@ reserve_space: ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, alloc_start, bytes_to_reserve); if (ret) { - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state); goto out; } ret = btrfs_prealloc_file_range(inode, mode, alloc_start, alloc_end - alloc_start, fs_info->sectorsize, offset + len, &alloc_hint); - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); /* btrfs_prealloc_file_range releases reserved space on error */ if (ret) { space_reserved = false; @@ -2985,7 +3092,8 @@ static long btrfs_fallocate(struct file *file, int mode, * need to zero out the end of the block if i_size lands in the * middle of a block. 
*/ - ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, + inode->i_size, (u64)-1); if (ret) goto out; } @@ -3010,8 +3118,8 @@ static long btrfs_fallocate(struct file *file, int mode, } locked_end = alloc_end - 1; - lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state); + btrfs_lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state); btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end); @@ -3023,8 +3131,8 @@ static long btrfs_fallocate(struct file *file, int mode, ret = PTR_ERR(em); break; } - last_byte = min(extent_map_end(em), alloc_end); - actual_end = min_t(u64, extent_map_end(em), offset + len); + last_byte = min(btrfs_extent_map_end(em), alloc_end); + actual_end = min_t(u64, btrfs_extent_map_end(em), offset + len); last_byte = ALIGN(last_byte, blocksize); if (em->disk_bytenr == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && @@ -3033,19 +3141,19 @@ static long btrfs_fallocate(struct file *file, int mode, ret = add_falloc_range(&reserve_list, cur_offset, range_len); if (ret < 0) { - free_extent_map(em); + btrfs_free_extent_map(em); break; } ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, cur_offset, range_len); if (ret < 0) { - free_extent_map(em); + btrfs_free_extent_map(em); break; } qgroup_reserved += range_len; data_space_needed += range_len; } - free_extent_map(em); + btrfs_free_extent_map(em); cur_offset = last_byte; } @@ -3099,8 +3207,8 @@ static long btrfs_fallocate(struct file *file, int mode, */ ret = btrfs_fallocate_update_isize(inode, actual_end, mode); out_unlock: - unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state); out: btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); extent_changeset_free(data_reserved); @@ -3134,10 +3242,10 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end if (inode->delalloc_bytes > 0) { spin_unlock(&inode->lock); *delalloc_start_ret = start; - delalloc_len = count_range_bits(&inode->io_tree, - delalloc_start_ret, end, - len, EXTENT_DELALLOC, 1, - cached_state); + delalloc_len = btrfs_count_range_bits(&inode->io_tree, + delalloc_start_ret, end, + len, EXTENT_DELALLOC, 1, + cached_state); } else { spin_unlock(&inode->lock); } @@ -3446,7 +3554,7 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) last_extent_end = lockstart; - lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { @@ -3592,7 +3700,7 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) } out: - unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); btrfs_free_path(path); if (ret < 0) diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index de89e644be29..d7df81388cbe 100644 --- a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -9,6 +9,8 @@ struct file; struct extent_state; struct kiocb; struct iov_iter; +struct inode; +struct folio; struct page; struct btrfs_ioctl_encoded_io_args; struct btrfs_drop_extents_args; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d42b6f882f57..4b34ea1f01c2 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c 
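The file.c hunks above, and the free-space-cache.c hunks that follow, convert call sites from lock_extent()/unlock_extent() and free_extent_state() to the btrfs_-prefixed helpers. A minimal sketch of the resulting calling pattern, assuming only the signatures visible in these hunks (the wrapper function below is hypothetical, for illustration only):

static void example_locked_range_op(struct btrfs_inode *inode, u64 start, u64 end)
{
	struct extent_state *cached_state = NULL;

	/* Lock the byte range [start, end] in the inode's io tree. */
	btrfs_lock_extent(&inode->io_tree, start, end, &cached_state);

	/* ... work on the locked range ... */

	/* Unlock the range and drop the cached extent state. */
	btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
}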
@@ -88,13 +88,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_disk_key disk_key; struct btrfs_free_space_header *header; struct extent_buffer *leaf; - struct inode *inode = NULL; + struct btrfs_inode *inode; unsigned nofs_flag; int ret; key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -120,13 +120,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, btrfs_release_path(path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(inode)) - return inode; + return ERR_CAST(inode); - mapping_set_gfp_mask(inode->i_mapping, - mapping_gfp_constraint(inode->i_mapping, + mapping_set_gfp_mask(inode->vfs_inode.i_mapping, + mapping_gfp_constraint(inode->vfs_inode.i_mapping, ~(__GFP_FS | __GFP_HIGHMEM))); - return inode; + return &inode->vfs_inode; } struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, @@ -201,8 +201,8 @@ static int __create_free_space_inode(struct btrfs_root *root, btrfs_release_path(path); key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(struct btrfs_free_space_header)); if (ret < 0) { @@ -244,7 +244,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_block_group *block_group) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret = 0; @@ -257,12 +257,12 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) { if (PTR_ERR(inode) != -ENOENT) ret = PTR_ERR(inode); - goto out; + return ret; } ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) { btrfs_add_delayed_iput(BTRFS_I(inode)); - goto out; + return ret; } clear_nlink(inode); /* One for the block groups ref */ @@ -285,12 +285,9 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, if (ret) { if (ret > 0) ret = 0; - goto out; + return ret; } - ret = btrfs_del_item(trans, trans->fs_info->tree_root, path); -out: - btrfs_free_path(path); - return ret; + return btrfs_del_item(trans, trans->fs_info->tree_root, path); } int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, @@ -311,8 +308,9 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, bool locked = false; if (block_group) { - struct btrfs_path *path = btrfs_alloc_path(); + BTRFS_PATH_AUTO_FREE(path); + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto fail; @@ -333,13 +331,12 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_CLEAR; spin_unlock(&block_group->lock); - btrfs_free_path(path); } btrfs_i_size_write(inode, 0); truncate_pagecache(vfs_inode, 0); - lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); + btrfs_lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); /* @@ -351,7 +348,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); btrfs_inode_safe_disk_i_size_write(inode, control.last_size); - unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); + btrfs_unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); if (ret) goto fail; @@ -447,7 +444,7 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) static int 
io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) { - struct page *page; + struct folio *folio; struct inode *inode = io_ctl->inode; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int i; @@ -455,31 +452,33 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) for (i = 0; i < io_ctl->num_pages; i++) { int ret; - page = find_or_create_page(inode->i_mapping, i, mask); - if (!page) { + folio = __filemap_get_folio(inode->i_mapping, i, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mask); + if (IS_ERR(folio)) { io_ctl_drop_pages(io_ctl); - return -ENOMEM; + return PTR_ERR(folio); } - ret = set_folio_extent_mapped(page_folio(page)); + ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); io_ctl_drop_pages(io_ctl); return ret; } - io_ctl->pages[i] = page; - if (uptodate && !PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (page->mapping != inode->i_mapping) { + io_ctl->pages[i] = &folio->page; + if (uptodate && !folio_test_uptodate(folio)) { + btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != inode->i_mapping) { btrfs_err(BTRFS_I(inode)->root->fs_info, "free space cache page truncated"); io_ctl_drop_pages(io_ctl); return -EIO; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { btrfs_err(BTRFS_I(inode)->root->fs_info, "error reading free space cache"); io_ctl_drop_pages(io_ctl); @@ -753,8 +752,8 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, return 0; key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -1081,9 +1080,8 @@ int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) { - cluster = list_entry(block_group->cluster_list.next, - struct btrfs_free_cluster, - block_group_list); + cluster = list_first_entry(&block_group->cluster_list, + struct btrfs_free_cluster, block_group_list); } if (!node && cluster) { @@ -1156,13 +1154,13 @@ update_cache_item(struct btrfs_trans_handle *trans, int ret; key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DELALLOC, NULL); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DELALLOC, NULL); goto fail; } leaf = path->nodes[0]; @@ -1173,9 +1171,9 @@ update_cache_item(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || found_key.offset != offset) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, - inode->i_size - 1, EXTENT_DELALLOC, - NULL); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, + inode->i_size - 1, EXTENT_DELALLOC, + NULL); btrfs_release_path(path); goto fail; } @@ -1220,9 +1218,9 @@ static noinline_for_stack int write_pinned_extent_entries( start = block_group->start; while (start < block_group->start + block_group->length) { - if (!find_first_extent_bit(unpin, start, - &extent_start, &extent_end, - EXTENT_DIRTY, NULL)) + if (!btrfs_find_first_extent_bit(unpin, start, + &extent_start, &extent_end, + EXTENT_DIRTY, NULL)) return 0; /* This pinned extent is 
out of our range */ @@ -1268,8 +1266,8 @@ static int flush_dirty_cache(struct inode *inode) ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret) - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DELALLOC, NULL); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DELALLOC, NULL); return ret; } @@ -1289,8 +1287,8 @@ cleanup_write_cache_enospc(struct inode *inode, struct extent_state **cached_state) { io_ctl_drop_pages(io_ctl); - unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + cached_state); } static int __btrfs_wait_cache_io(struct btrfs_root *root, @@ -1415,8 +1413,8 @@ static int __btrfs_write_out_cache(struct inode *inode, if (ret) goto out_unlock; - lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - &cached_state); + btrfs_lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + &cached_state); io_ctl_set_generation(io_ctl, trans->transid); @@ -1476,8 +1474,8 @@ static int __btrfs_write_out_cache(struct inode *inode, io_ctl_drop_pages(io_ctl); io_ctl_free(io_ctl); - unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + &cached_state); /* * at this point the pages are under IO and we're happy, @@ -2343,9 +2341,8 @@ again: struct rb_node *node; struct btrfs_free_space *entry; - cluster = list_entry(block_group->cluster_list.next, - struct btrfs_free_cluster, - block_group_list); + cluster = list_first_entry(&block_group->cluster_list, + struct btrfs_free_cluster, block_group_list); spin_lock(&cluster->lock); node = rb_first(&cluster->root); if (!node) { diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index cae540ec15ed..0c573d46639a 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -117,7 +117,7 @@ struct btrfs_free_space_info *search_free_space_info( if (ret != 0) { btrfs_warn(fs_info, "missing free space info for %llu", block_group->start); - ASSERT(0); + DEBUG_WARN(); return ERR_PTR(-ENOENT); } @@ -141,12 +141,12 @@ static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans, return ret; if (ret == 0) { - ASSERT(0); + DEBUG_WARN(); return -EIO; } if (p->slots[0] == 0) { - ASSERT(0); + DEBUG_WARN("no previous slot found"); return -EIO; } p->slots[0]--; @@ -223,6 +223,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, bitmap = alloc_bitmap(bitmap_size); if (!bitmap) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } @@ -235,8 +236,10 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } leaf = path->nodes[0]; nr = 0; @@ -271,14 +274,17 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } btrfs_release_path(path); } info = search_free_space_info(trans, block_group, path, 1); if (IS_ERR(info)) { ret = PTR_ERR(info); + btrfs_abort_transaction(trans, ret); goto out; } leaf = path->nodes[0]; @@ -293,8 +299,8 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, 
extent_count, expected_extent_count); - ASSERT(0); ret = -EIO; + btrfs_abort_transaction(trans, ret); goto out; } @@ -315,8 +321,10 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, &key, data_size); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } leaf = path->nodes[0]; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); @@ -331,8 +339,6 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, ret = 0; out: kvfree(bitmap); - if (ret) - btrfs_abort_transaction(trans, ret); return ret; } @@ -358,6 +364,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, bitmap = alloc_bitmap(bitmap_size); if (!bitmap) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } @@ -370,8 +377,10 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } leaf = path->nodes[0]; nr = 0; @@ -412,14 +421,17 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } btrfs_release_path(path); } info = search_free_space_info(trans, block_group, path, 1); if (IS_ERR(info)) { ret = PTR_ERR(info); + btrfs_abort_transaction(trans, ret); goto out; } leaf = path->nodes[0]; @@ -441,8 +453,10 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, key.offset = (end_bit - start_bit) * block_group->fs_info->sectorsize; ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } btrfs_release_path(path); extent_count++; @@ -455,16 +469,14 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, expected_extent_count); - ASSERT(0); ret = -EIO; + btrfs_abort_transaction(trans, ret); goto out; } ret = 0; out: kvfree(bitmap); - if (ret) - btrfs_abort_transaction(trans, ret); return ret; } @@ -838,13 +850,15 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } block_group = btrfs_lookup_block_group(trans->fs_info, start); if (!block_group) { - ASSERT(0); + DEBUG_WARN("no block group found for start=%llu", start); ret = -ENOENT; + btrfs_abort_transaction(trans, ret); goto out; } @@ -852,12 +866,12 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans, ret = __remove_from_free_space_tree(trans, block_group, path, start, size); mutex_unlock(&block_group->free_space_lock); + if (ret) + btrfs_abort_transaction(trans, ret); btrfs_put_block_group(block_group); out: btrfs_free_path(path); - if (ret) - btrfs_abort_transaction(trans, ret); return ret; } @@ -1031,25 +1045,27 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } block_group = btrfs_lookup_block_group(trans->fs_info, start); if (!block_group) { - ASSERT(0); + DEBUG_WARN("no block group found for start=%llu", start); ret = -ENOENT; + btrfs_abort_transaction(trans, ret); goto out; } mutex_lock(&block_group->free_space_lock); ret = __add_to_free_space_tree(trans, block_group, path, 
start, size); mutex_unlock(&block_group->free_space_lock); + if (ret) + btrfs_abort_transaction(trans, ret); btrfs_put_block_group(block_group); out: btrfs_free_path(path); - if (ret) - btrfs_abort_transaction(trans, ret); return ret; } @@ -1062,7 +1078,8 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { struct btrfs_root *extent_root; - struct btrfs_path *path, *path2; + BTRFS_PATH_AUTO_FREE(path); + BTRFS_PATH_AUTO_FREE(path2); struct btrfs_key key; u64 start, end; int ret; @@ -1070,17 +1087,16 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = READA_FORWARD; path2 = btrfs_alloc_path(); - if (!path2) { - btrfs_free_path(path); + if (!path2) return -ENOMEM; - } + + path->reada = READA_FORWARD; ret = add_new_free_space_info(trans, block_group, path2); if (ret) - goto out; + return ret; mutex_lock(&block_group->free_space_lock); @@ -1146,9 +1162,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, ret = 0; out_locked: mutex_unlock(&block_group->free_space_lock); -out: - btrfs_free_path(path2); - btrfs_free_path(path); + return ret; } @@ -1217,7 +1231,7 @@ out_clear: static int clear_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int nr; int ret; @@ -1233,7 +1247,7 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, while (1) { ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) - goto out; + return ret; nr = btrfs_header_nritems(path->nodes[0]); if (!nr) @@ -1242,15 +1256,12 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, path->slots[0] = 0; ret = btrfs_del_items(trans, root, path, 0, nr); if (ret) - goto out; + return ret; btrfs_release_path(path); } - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) @@ -1560,7 +1571,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, expected_extent_count); - ASSERT(0); + DEBUG_WARN(); ret = -EIO; goto out; } @@ -1624,7 +1635,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, expected_extent_count); - ASSERT(0); + DEBUG_WARN(); ret = -EIO; goto out; } @@ -1638,9 +1649,8 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group *block_group; struct btrfs_free_space_info *info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); u32 extent_count, flags; - int ret; block_group = caching_ctl->block_group; @@ -1657,10 +1667,9 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) path->reada = READA_FORWARD; info = search_free_space_info(NULL, block_group, path, 0); - if (IS_ERR(info)) { - ret = PTR_ERR(info); - goto out; - } + if (IS_ERR(info)) + return PTR_ERR(info); + extent_count = btrfs_free_space_extent_count(path->nodes[0], info); flags = btrfs_free_space_flags(path->nodes[0], info); @@ -1670,11 +1679,7 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) * there. 
*/ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) - ret = load_free_space_bitmaps(caching_ctl, path, extent_count); + return load_free_space_bitmaps(caching_ctl, path, extent_count); else - ret = load_free_space_extents(caching_ctl, path, extent_count); - -out: - btrfs_free_path(path); - return ret; + return load_free_space_extents(caching_ctl, path, extent_count); } diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index 09cfb43580cb..b2bb86f8d7cf 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "messages.h" -#include "ctree.h" #include "fs.h" #include "accessors.h" #include "volumes.h" diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index b572d6b9730b..4394de12a767 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -47,6 +47,18 @@ struct btrfs_subpage_info; struct btrfs_stripe_hash_table; struct btrfs_space_info; +/* + * Minimum data and metadata block size. + * + * Normally it's 4K, but for testing subpage block size on 4K page systems, we + * allow DEBUG builds to accept 2K page size. + */ +#ifdef CONFIG_BTRFS_DEBUG +#define BTRFS_MIN_BLOCKSIZE (SZ_2K) +#else +#define BTRFS_MIN_BLOCKSIZE (SZ_4K) +#endif + #define BTRFS_MAX_EXTENT_SIZE SZ_128M #define BTRFS_OLDEST_GENERATION 0ULL @@ -105,6 +117,9 @@ enum { /* Indicates there was an error cleaning up a log tree. */ BTRFS_FS_STATE_LOG_CLEANUP_ERROR, + /* No more delayed iput can be queued. */ + BTRFS_FS_STATE_NO_DELAYED_IPUT, + BTRFS_FS_STATE_COUNT }; @@ -285,6 +300,7 @@ enum { #define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) +#define BTRFS_WARNING_COMMIT_INTERVAL (300) #define BTRFS_DEFAULT_MAX_INLINE (2048) struct btrfs_dev_replace { @@ -456,6 +472,8 @@ struct btrfs_fs_info { struct btrfs_block_rsv delayed_block_rsv; /* Block reservation for delayed refs */ struct btrfs_block_rsv delayed_refs_rsv; + /* Block reservation for treelog tree */ + struct btrfs_block_rsv treelog_rsv; struct btrfs_block_rsv empty_block_rsv; @@ -485,8 +503,8 @@ struct btrfs_fs_info { u64 last_trans_log_full_commit; unsigned long long mount_opt; - unsigned long compress_type:4; - unsigned int compress_level; + int compress_type; + int compress_level; u32 commit_interval; /* * It is a suggestive number, the read side is safe even it gets a @@ -709,7 +727,6 @@ struct btrfs_fs_info { * running. 
*/ refcount_t scrub_workers_refcnt; - u32 sectors_per_page; struct workqueue_struct *scrub_workers; struct btrfs_discard_ctl discard_ctl; @@ -762,10 +779,8 @@ struct btrfs_fs_info { struct btrfs_delayed_root *delayed_root; - /* Extent buffer radix tree */ - spinlock_t buffer_lock; /* Entries are eb->start / sectorsize */ - struct radix_tree_root buffer_radix; + struct xarray buffer_tree; /* Next backup root to be overwritten */ int backup_root_index; @@ -981,6 +996,12 @@ static inline u32 count_max_extents(const struct btrfs_fs_info *fs_info, u64 siz return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); } +static inline unsigned int btrfs_blocks_per_folio(const struct btrfs_fs_info *fs_info, + const struct folio *folio) +{ + return folio_size(folio) >> fs_info->sectorsize_bits; +} + bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type); bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 448aa1a682d6..a61c3540d67b 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -109,7 +109,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, u64 inode_objectid, u64 ref_objectid, u64 *index) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_inode_extref *extref; struct extent_buffer *leaf; @@ -129,9 +129,9 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) - ret = -ENOENT; + return -ENOENT; if (ret < 0) - goto out; + return ret; /* * Sanity check - did we find the right item for this name? @@ -142,8 +142,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, ref_objectid, name); if (!extref) { btrfs_abort_transaction(trans, -ENOENT); - ret = -ENOENT; - goto out; + return -ENOENT; } leaf = path->nodes[0]; @@ -152,12 +151,8 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, *index = btrfs_inode_extref_index(leaf, extref); if (del_len == item_size) { - /* - * Common case only one ref in the item, remove the - * whole item. - */ - ret = btrfs_del_item(trans, root, path); - goto out; + /* Common case only one ref in the item, remove the whole item. 
*/ + return btrfs_del_item(trans, root, path); } ptr = (unsigned long)extref; @@ -168,9 +163,6 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, btrfs_truncate_item(trans, path, item_size - del_len, 1); -out: - btrfs_free_path(path); - return ret; } @@ -191,8 +183,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, int del_len = name->len + sizeof(*ref); key.objectid = inode_objectid; - key.offset = ref_objectid; key.type = BTRFS_INODE_REF_KEY; + key.offset = ref_objectid; path = btrfs_alloc_path(); if (!path) @@ -260,7 +252,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, int ret; int ins_len = name->len + sizeof(*extref); unsigned long ptr; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *leaf; @@ -279,13 +271,13 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, path->slots[0], ref_objectid, name)) - goto out; + return ret; btrfs_extend_item(trans, path, ins_len); ret = 0; } if (ret < 0) - goto out; + return ret; leaf = path->nodes[0]; ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char); @@ -298,9 +290,8 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, ptr = (unsigned long)&extref->name; write_extent_buffer(path->nodes[0], name->name, ptr, name->len); -out: - btrfs_free_path(path); - return ret; + + return 0; } /* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */ @@ -317,8 +308,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, int ins_len = name->len + sizeof(*ref); key.objectid = inode_objectid; - key.offset = ref_objectid; key.type = BTRFS_INODE_REF_KEY; + key.offset = ref_objectid; path = btrfs_alloc_path(); if (!path) @@ -493,8 +484,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, path->reada = READA_BACK; key.objectid = control->ino; - key.offset = (u64)-1; key.type = (u8)-1; + key.offset = (u64)-1; search_again: /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a9322601ab5c..c0c778243bf1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -489,8 +489,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, size_t datasize; key.objectid = btrfs_ino(inode); - key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; datasize = btrfs_file_extent_calc_inline_size(cur_size); ret = btrfs_insert_empty_item(trans, root, path, &key, @@ -566,23 +566,14 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode, if (offset != 0) return false; - /* - * Due to the page size limit, for subpage we can only trigger the - * writeback for the dirty sectors of page, that means data writeback - * is doing more writeback than what we want. - * - * This is especially unexpected for some call sites like fallocate, - * where we only increase i_size after everything is done. - * This means we can trigger inline extent even if we didn't want to. - * So here we skip inline extent creation completely. - */ - if (fs_info->sectorsize != PAGE_SIZE) - return false; - /* Inline extents are limited to sectorsize. */ if (size > fs_info->sectorsize) return false; + /* We do not allow a non-compressed extent to be as large as block size. */ + if (data_len >= fs_info->sectorsize) + return false; + /* We cannot exceed the maximum inline data size. */ if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) return false; @@ -672,7 +663,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. 
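+	 * (Editor's note, not part of the original patch: the change right below
+	 * frees fs_info->sectorsize bytes instead of PAGE_SIZE, consistent with the
+	 * PAGE_SIZE assumptions dropped elsewhere in this patch, so the "one page"
+	 * wording above is no longer literally accurate on all configurations.)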
*/ - btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL); + btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; @@ -695,12 +686,12 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, if (!can_cow_file_range_inline(inode, offset, size, compressed_size)) return 1; - lock_extent(&inode->io_tree, offset, end, &cached); + btrfs_lock_extent(&inode->io_tree, offset, end, &cached); ret = __cow_file_range_inline(inode, size, compressed_size, compress_type, compressed_folio, update_i_size); if (ret > 0) { - unlock_extent(&inode->io_tree, offset, end, &cached); + btrfs_unlock_extent(&inode->io_tree, offset, end, &cached); return ret; } @@ -786,26 +777,9 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, struct btrfs_fs_info *fs_info = inode->root->fs_info; if (!btrfs_inode_can_compress(inode)) { - WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), - KERN_ERR "BTRFS: unexpected compression for ino %llu\n", - btrfs_ino(inode)); + DEBUG_WARN("BTRFS: unexpected compression for ino %llu", btrfs_ino(inode)); return 0; } - /* - * Only enable sector perfect compression for experimental builds. - * - * This is a big feature change for subpage cases, and can hit - * different corner cases, so only limit this feature for - * experimental build for now. - * - * ETA for moving this out of experimental builds is 6.15. - */ - if (fs_info->sectorsize < PAGE_SIZE && - !IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { - if (!PAGE_ALIGNED(start) || - !PAGE_ALIGNED(end + 1)) - return 0; - } /* force compress */ if (btrfs_test_opt(fs_info, FORCE_COMPRESS)) @@ -832,7 +806,7 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, btrfs_add_inode_defrag(inode, small_write); } -static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end) { unsigned long end_index = end >> PAGE_SHIFT; struct folio *folio; @@ -840,13 +814,13 @@ static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 e for (unsigned long index = start >> PAGE_SHIFT; index <= end_index; index++) { - folio = filemap_get_folio(inode->i_mapping, index); + folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); if (IS_ERR(folio)) { if (!ret) ret = PTR_ERR(folio); continue; } - btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start, + btrfs_folio_clamp_clear_dirty(inode->root->fs_info, folio, start, end + 1 - start); folio_put(folio); } @@ -886,6 +860,7 @@ static void compress_file_range(struct btrfs_work *work) unsigned int poff; int i; int compress_type = fs_info->compress_type; + int compress_level = fs_info->compress_level; inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); @@ -894,7 +869,7 @@ static void compress_file_range(struct btrfs_work *work) * Otherwise applications with the file mmap'd can wander in and change * the page contents while we are compressing them. */ - ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + ret = extent_range_clear_dirty_for_io(inode, start, end); /* * All the folios should have been locked thus no failure. 
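Restating the inline-extent eligibility rules that can_cow_file_range_inline() enforces after this patch (see the hunk near the top of the inode.c changes above) as a compact sketch; the helper name and the flattened data_len parameter are made up for illustration, since the excerpt does not show how data_len is derived:

/* Illustrative restatement of the inline-extent eligibility checks above. */
static bool example_can_inline(const struct btrfs_fs_info *fs_info,
			       u64 offset, u64 size, u64 data_len)
{
	/* Inline extents may only exist at file offset 0. */
	if (offset != 0)
		return false;
	/* The logical size is limited to one block. */
	if (size > fs_info->sectorsize)
		return false;
	/* The stored data must stay below the block size... */
	if (data_len >= fs_info->sectorsize)
		return false;
	/* ...and within the maximum inline item size. */
	if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
		return false;
	return true;
}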
@@ -968,13 +943,15 @@ again: goto cleanup_and_bail_uncompressed; } - if (inode->defrag_compress) + if (inode->defrag_compress) { compress_type = inode->defrag_compress; - else if (inode->prop_compress) + compress_level = inode->defrag_compress_level; + } else if (inode->prop_compress) { compress_type = inode->prop_compress; + } /* Compression level is applied here. */ - ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4), + ret = btrfs_compress_folios(compress_type, compress_level, mapping, start, folios, &nr_folios, &total_in, &total_compressed); if (ret) @@ -1090,7 +1067,6 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { - btrfs_cleanup_ordered_extents(inode, start, end - start + 1); if (locked_folio) btrfs_folio_end_lock(inode->root->fs_info, locked_folio, start, async_extent->ram_size); @@ -1116,6 +1092,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, struct extent_state *cached = NULL; struct extent_map *em; int ret = 0; + bool free_pages = false; u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; @@ -1136,7 +1113,10 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, } if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { + ASSERT(!async_extent->folios); + ASSERT(async_extent->nr_folios == 0); submit_uncompressed_range(inode, async_extent, locked_folio); + free_pages = true; goto done; } @@ -1152,10 +1132,11 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, * fall back to uncompressed. */ submit_uncompressed_range(inode, async_extent, locked_folio); + free_pages = true; goto done; } - lock_extent(io_tree, start, end, &cached); + btrfs_lock_extent(io_tree, start, end, &cached); /* Here we're doing allocation and writeback of the compressed pages */ file_extent.disk_bytenr = ins.objectid; @@ -1170,10 +1151,10 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, ret = PTR_ERR(em); goto out_free_reserve; } - free_extent_map(em); + btrfs_free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, - 1 << BTRFS_ORDERED_COMPRESSED); + 1U << BTRFS_ORDERED_COMPRESSED); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); @@ -1193,12 +1174,14 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, done: if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); + if (free_pages) + free_async_extent_pages(async_extent); kfree(async_extent); return; out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); mapping_set_error(inode->vfs_inode.i_mapping, -EIO); extent_clear_unlock_delalloc(inode, start, end, NULL, &cached, @@ -1225,7 +1208,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, u64 alloc_hint = 0; read_lock(&em_tree->lock); - em = search_extent_mapping(em_tree, start, num_bytes); + em = btrfs_search_extent_mapping(em_tree, start, num_bytes); if (em) { /* * if block start isn't an actual block number then find the @@ -1233,15 +1216,15 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * block is also bogus then just don't worry about it. 
*/ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { - free_extent_map(em); - em = search_extent_mapping(em_tree, 0, 0); + btrfs_free_extent_map(em); + em = btrfs_search_extent_mapping(em_tree, 0, 0); if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE) - alloc_hint = extent_map_block_start(em); + alloc_hint = btrfs_extent_map_block_start(em); if (em) - free_extent_map(em); + btrfs_free_extent_map(em); } else { - alloc_hint = extent_map_block_start(em); - free_extent_map(em); + alloc_hint = btrfs_extent_map_block_start(em); + btrfs_free_extent_map(em); } } read_unlock(&em_tree->lock); @@ -1272,10 +1255,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * - Else all pages except for @locked_folio are unlocked. * * When a failure happens in the second or later iteration of the - * while-loop, the ordered extents created in previous iterations are kept - * intact. So, the caller must clean them up by calling - * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for - * example. + * while-loop, the ordered extents created in previous iterations are cleaned up. */ static noinline int cow_file_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, @@ -1382,8 +1362,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode, continue; } if (done_offset) { - *done_offset = start - 1; - return 0; + /* + * Move @end to the end of the processed range, + * and exit the loop to unlock the processed extents. + */ + end = start - 1; + ret = 0; + break; } ret = -ENOSPC; } @@ -1402,24 +1387,24 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * Locked range will be released either during error clean up or * after the whole range is finished. */ - lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, - &cached); + btrfs_lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, + &cached); em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_REGULAR); if (IS_ERR(em)) { - unlock_extent(&inode->io_tree, start, - start + cur_alloc_size - 1, &cached); + btrfs_unlock_extent(&inode->io_tree, start, + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(em); goto out_reserve; } - free_extent_map(em); + btrfs_free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, - 1 << BTRFS_ORDERED_REGULAR); + 1U << BTRFS_ORDERED_REGULAR); if (IS_ERR(ordered)) { - unlock_extent(&inode->io_tree, start, - start + cur_alloc_size - 1, &cached); + btrfs_unlock_extent(&inode->io_tree, start, + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(ordered); goto out_drop_extent_cache; } @@ -1474,7 +1459,7 @@ out_drop_extent_cache: btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); out_unlock: /* * Now, we have three regions to clean up: @@ -1487,11 +1472,9 @@ out_unlock: /* * For the range (1). We have already instantiated the ordered extents - * for this region. They are cleaned up by - * btrfs_cleanup_ordered_extents() in e.g, - * btrfs_run_delalloc_range(). + * for this region, thus we need to cleanup those ordered extents. * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV - * are also handled by the cleanup function. + * are also handled by the ordered extents cleanup. 
* * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and * finish the writeback of the involved folios, which will be never submitted. @@ -1502,6 +1485,8 @@ out_unlock: if (!locked_folio) mapping_set_error(inode->vfs_inode.i_mapping, ret); + + btrfs_cleanup_ordered_extents(inode, orig_start, start - orig_start); extent_clear_unlock_delalloc(inode, orig_start, start - 1, locked_folio, NULL, clear_bits, page_ops); } @@ -1583,8 +1568,8 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_ PAGE_SHIFT; while (!list_empty(&async_chunk->extents)) { - async_extent = list_entry(async_chunk->extents.next, - struct async_extent, list); + async_extent = list_first_entry(&async_chunk->extents, + struct async_extent, list); list_del(&async_extent->list); submit_one_async_extent(async_chunk, async_extent, &alloc_hint); } @@ -1754,9 +1739,9 @@ static int fallback_to_cow(struct btrfs_inode *inode, * group that contains that extent to RO mode and therefore force COW * when starting writeback. */ - lock_extent(io_tree, start, end, &cached_state); - count = count_range_bits(io_tree, &range_start, end, range_bytes, - EXTENT_NORESERVE, 0, NULL); + btrfs_lock_extent(io_tree, start, end, &cached_state); + count = btrfs_count_range_bits(io_tree, &range_start, end, range_bytes, + EXTENT_NORESERVE, 0, NULL); if (count > 0 || is_space_ino || is_reloc_ino) { u64 bytes = count; struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1770,10 +1755,9 @@ static int fallback_to_cow(struct btrfs_inode *inode, spin_unlock(&sinfo->lock); if (count > 0) - clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, - NULL); + btrfs_clear_extent_bits(io_tree, start, end, EXTENT_NORESERVE); } - unlock_extent(io_tree, start, end, &cached_state); + btrfs_unlock_extent(io_tree, start, end, &cached_state); /* * Don't try to create inline extents, as a mix of inline extent that @@ -1971,6 +1955,65 @@ static void cleanup_dirty_folios(struct btrfs_inode *inode, mapping_set_error(mapping, error); } +static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio, + struct extent_state **cached, + struct can_nocow_file_extent_args *nocow_args, + u64 file_pos, bool is_prealloc) +{ + struct btrfs_ordered_extent *ordered; + u64 len = nocow_args->file_extent.num_bytes; + u64 end = file_pos + len - 1; + int ret = 0; + + btrfs_lock_extent(&inode->io_tree, file_pos, end, cached); + + if (is_prealloc) { + struct extent_map *em; + + em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent, + BTRFS_ORDERED_PREALLOC); + if (IS_ERR(em)) { + btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached); + return PTR_ERR(em); + } + btrfs_free_extent_map(em); + } + + ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent, + is_prealloc + ? (1U << BTRFS_ORDERED_PREALLOC) + : (1U << BTRFS_ORDERED_NOCOW)); + if (IS_ERR(ordered)) { + if (is_prealloc) + btrfs_drop_extent_map_range(inode, file_pos, end, false); + btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached); + return PTR_ERR(ordered); + } + + if (btrfs_is_data_reloc_root(inode->root)) + /* + * Errors are handled later, as we must prevent + * extent_clear_unlock_delalloc() in error handler from freeing + * metadata of the created ordered extent. 
+ */ + ret = btrfs_reloc_clone_csums(ordered); + btrfs_put_ordered_extent(ordered); + + extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_CLEAR_DATA_RESV, + PAGE_UNLOCK | PAGE_SET_ORDERED); + /* + * On error, we need to cleanup the ordered extents we created. + * + * We do not clear the folio Dirty flags because they are set and + * cleared by the caller. + */ + if (ret < 0) + btrfs_cleanup_ordered_extents(inode, file_pos, end); + return ret; +} + /* * when nowcow writeback call back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. @@ -2015,15 +2058,12 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, while (cur_offset <= end) { struct btrfs_block_group *nocow_bg = NULL; - struct btrfs_ordered_extent *ordered; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; struct extent_state *cached_state = NULL; u64 extent_end; - u64 nocow_end; int extent_type; - bool is_prealloc; ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); @@ -2078,12 +2118,13 @@ next_slot: /* * If the found extent starts after requested offset, then
- */ - ret = btrfs_reloc_clone_csums(ordered); - btrfs_put_ordered_extent(ordered); - - extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, - locked_folio, &cached_state, - EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_CLEAR_DATA_RESV, - PAGE_UNLOCK | PAGE_SET_ORDERED); - cur_offset = extent_end; - - /* - * btrfs_reloc_clone_csums() error, now we're OK to call error - * handler, as metadata for created ordered extent will only - * be freed by btrfs_finish_ordered_io(). - */ - if (ret) - goto error; } btrfs_release_path(path); @@ -2226,11 +2213,11 @@ must_cow: if (cow_start != (u64)-1) { ret = fallback_to_cow(inode, locked_folio, cow_start, end); - cow_start = (u64)-1; if (ret) { cow_end = end; goto error; } + cow_start = (u64)-1; } btrfs_free_path(path); @@ -2244,27 +2231,44 @@ error: * start cur_offset end * |/////////////| | * + * In this case, cow_start should be (u64)-1. + * * For range [start, cur_offset) the folios are already unlocked (except * @locked_folio), EXTENT_DELALLOC already removed. - * Only need to clear the dirty flag as they will never be submitted. - * Ordered extent and extent maps are handled by - * btrfs_mark_ordered_io_finished() inside run_delalloc_range(). + * Need to clear the dirty flags and finish the ordered extents. + * + * 2) Failed with error before calling fallback_to_cow() + * + * start cow_start end + * |/////////////| | + * + * In this case, only @cow_start is set, @cur_offset is between + * [cow_start, end). + * + * It's mostly the same as case 1), just replace @cur_offset with + * @cow_start. * - * 2) Failed with error from fallback_to_cow() - * start cur_offset cow_end end + * 3) Failed with error from fallback_to_cow() + * + * start cow_start cow_end end * |/////////////|-----------| | * - * For range [start, cur_offset) it's the same as case 1). - * But for range [cur_offset, cow_end), the folios have dirty flag - * cleared and unlocked, EXTENT_DEALLLOC cleared by cow_file_range(). + * In this case, both @cow_start and @cow_end are set. * - * Thus we should not call extent_clear_unlock_delalloc() on range - * [cur_offset, cow_end), as the folios are already unlocked. + * For range [start, cow_start) it's the same as case 1). + * But for range [cow_start, cow_end), all the cleanup is handled by + * cow_file_range(), so we should not touch anything in that range. * - * So clear the folio dirty flags for [start, cur_offset) first. + * So for all above cases, if @cow_start is set, cleanup ordered extents + * for range [start, @cow_start), otherwise cleanup range [start, @cur_offset).
*/ - if (cur_offset > start) + if (cow_start != (u64)-1) + cur_offset = cow_start; + + if (cur_offset > start) { + btrfs_cleanup_ordered_extents(inode, start, cur_offset - start); cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret); + } /* * If an error happened while a COW region is outstanding, cur_offset @@ -2281,7 +2285,7 @@ error: if (cur_offset < end) { struct extent_state *cached = NULL; - lock_extent(&inode->io_tree, cur_offset, end, &cached); + btrfs_lock_extent(&inode->io_tree, cur_offset, end, &cached); extent_clear_unlock_delalloc(inode, cur_offset, end, locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC | @@ -2303,7 +2307,7 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) { if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) { if (inode->defrag_bytes && - test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG)) + btrfs_test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG)) return false; return true; } @@ -2329,7 +2333,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol if (should_nocow(inode, start, end)) { ret = run_delalloc_nocow(inode, locked_folio, start, end); - goto out; + return ret; } if (btrfs_inode_can_compress(inode) && @@ -2343,10 +2347,6 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol else ret = cow_file_range(inode, locked_folio, start, end, NULL, false, false); - -out: - if (ret < 0) - btrfs_cleanup_ordered_extents(inode, start, end - start + 1); return ret; } @@ -2596,7 +2596,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, !btrfs_is_free_space_inode(inode) && !(state->state & EXTENT_NORESERVE) && (bits & EXTENT_CLEAR_DATA_RESV)) - btrfs_free_reserved_data_space_noquota(fs_info, len); + btrfs_free_reserved_data_space_noquota(inode, len); percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, fs_info->delalloc_batch); @@ -2680,12 +2680,12 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, if (em_len > search_len) em_len = search_len; - ret = set_extent_bit(&inode->io_tree, search_start, - search_start + em_len - 1, - EXTENT_DELALLOC_NEW, cached_state); + ret = btrfs_set_extent_bit(&inode->io_tree, search_start, + search_start + em_len - 1, + EXTENT_DELALLOC_NEW, cached_state); next: - search_start = extent_map_end(em); - free_extent_map(em); + search_start = btrfs_extent_map_end(em); + btrfs_free_extent_map(em); if (ret) return ret; } @@ -2715,8 +2715,8 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, return ret; } - return set_extent_bit(&inode->io_tree, start, end, - EXTENT_DELALLOC | extra_bits, cached_state); + return btrfs_set_extent_bit(&inode->io_tree, start, end, + EXTENT_DELALLOC | extra_bits, cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ @@ -2791,7 +2791,7 @@ again: if (ret) goto out_page; - lock_extent(&inode->io_tree, page_start, page_end, &cached_state); + btrfs_lock_extent(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? 
We're done */ if (folio_test_ordered(folio)) @@ -2799,8 +2799,8 @@ again: ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); if (ordered) { - unlock_extent(&inode->io_tree, page_start, page_end, - &cached_state); + btrfs_unlock_extent(&inode->io_tree, page_start, page_end, + &cached_state); folio_unlock(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); @@ -2826,7 +2826,7 @@ out_reserved: if (free_delalloc_space) btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); - unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); out_page: if (ret) { /* @@ -2873,6 +2873,21 @@ int btrfs_writepage_cow_fixup(struct folio *folio) return 0; /* + * For experimental build, we error out instead of EAGAIN. + * + * We should not hit such out-of-band dirty folios anymore. + */ + if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { + DEBUG_WARN(); + btrfs_err_rl(fs_info, + "root %lld ino %llu folio %llu is marked dirty without notifying the fs", + BTRFS_I(inode)->root->root_key.objectid, + btrfs_ino(BTRFS_I(inode)), + folio_pos(folio)); + return -EUCLEAN; + } + + /* * folio_checked is set below when we create a fixup worker for this * folio, don't try to create another one if we're already * folio_test_checked. @@ -2891,7 +2906,7 @@ int btrfs_writepage_cow_fixup(struct folio *folio) * We are already holding a reference to this inode from * write_cache_pages. We need to hold it because the space reservation * takes place outside of the folio lock, and we can't trust - * page->mapping outside of the folio lock. + * folio->mapping outside of the folio lock. */ ihold(inode); btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); @@ -2912,7 +2927,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; const u64 sectorsize = root->fs_info->sectorsize; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key ins; u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); @@ -2947,8 +2962,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, if (!drop_args.extent_inserted) { ins.objectid = btrfs_ino(inode); - ins.offset = file_pos; ins.type = BTRFS_EXTENT_DATA_KEY; + ins.offset = file_pos; ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*stack_fi)); @@ -2983,8 +2998,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found); ins.objectid = disk_bytenr; - ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = disk_num_bytes; ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); if (ret) @@ -2994,8 +3009,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, file_pos - offset, qgroup_reserved, &ins); out: - btrfs_free_path(path); - return ret; } @@ -3111,8 +3124,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) * depending on their current state). 
*/ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { - clear_bits |= EXTENT_LOCKED; - lock_extent(io_tree, start, end, &cached_state); + clear_bits |= EXTENT_LOCKED | EXTENT_FINISHING_ORDERED; + btrfs_lock_extent_bits(io_tree, start, end, + EXTENT_LOCKED | EXTENT_FINISHING_ORDERED, + &cached_state); } if (freespace_inode) @@ -3176,8 +3191,8 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } - ret = unpin_extent_cache(inode, ordered_extent->file_offset, - ordered_extent->num_bytes, trans->transid); + ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset, + ordered_extent->num_bytes, trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; @@ -3196,9 +3211,9 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) */ if ((clear_bits & EXTENT_DELALLOC_NEW) && !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) - clear_extent_bit(&inode->io_tree, start, end, - EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, - &cached_state); + btrfs_clear_extent_bit(&inode->io_tree, start, end, + EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, + &cached_state); btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, inode); @@ -3207,15 +3222,13 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } out: - clear_extent_bit(&inode->io_tree, start, end, clear_bits, - &cached_state); + btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, + &cached_state); if (trans) btrfs_end_transaction(trans); if (ret || truncated) { - u64 unwritten_start = start; - /* * If we failed to finish this ordered extent for any reason we * need to make sure BTRFS_ORDERED_IOERR is set on the ordered @@ -3227,10 +3240,6 @@ out: if (ret) btrfs_mark_ordered_extent_error(ordered_extent); - if (truncated) - unwritten_start += logical_len; - clear_extent_uptodate(io_tree, unwritten_start, end, NULL); - /* * Drop extent maps for the part of the extent we didn't write. * @@ -3245,9 +3254,15 @@ out: * we don't mess with the extent map tree in the NOCOW case, but * for now simply skip this if we are the free space inode. */ - if (!btrfs_is_free_space_inode(inode)) + if (!btrfs_is_free_space_inode(inode)) { + u64 unwritten_start = start; + + if (truncated) + unwritten_start += logical_len; + btrfs_drop_extent_map_range(inode, unwritten_start, end, false); + } /* * If the ordered extent had an IOERR or something else went @@ -3274,7 +3289,7 @@ out: NULL); btrfs_free_reserved_extent(fs_info, ordered_extent->disk_bytenr, - ordered_extent->disk_num_bytes, 1); + ordered_extent->disk_num_bytes, true); /* * Actually free the qgroup rsv which was released when * the ordered extent was created. @@ -3311,20 +3326,16 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) /* * Verify the checksum for a single sector without any extra action that depend * on the type of I/O. + * + * @kaddr must be a properly kmapped address. 
*/ -int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, - u32 pgoff, u8 *csum, const u8 * const csum_expected) +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum, + const u8 * const csum_expected) { SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - char *kaddr; - - ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE); shash->tfm = fs_info->csum_shash; - - kaddr = kmap_local_page(page) + pgoff; crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); - kunmap_local(kaddr); if (memcmp(csum, csum_expected, fs_info->csum_size)) return -EIO; @@ -3353,6 +3364,7 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u64 end = file_offset + bv->bv_len - 1; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; + void *kaddr; ASSERT(bv->bv_len == fs_info->sectorsize); @@ -3360,19 +3372,22 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, return true; if (btrfs_is_data_reloc_root(inode->root) && - test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, - NULL)) { + btrfs_test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, + NULL)) { /* Skip the range without csum for data reloc inode */ - clear_extent_bits(&inode->io_tree, file_offset, end, - EXTENT_NODATASUM); + btrfs_clear_extent_bits(&inode->io_tree, file_offset, end, + EXTENT_NODATASUM); return true; } csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * fs_info->csum_size; - if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, - csum_expected)) + kaddr = bvec_kmap_local(bv); + if (btrfs_check_sector_csum(fs_info, kaddr, csum, csum_expected)) { + kunmap_local(kaddr); goto zeroit; + } + kunmap_local(kaddr); return true; zeroit: @@ -3402,6 +3417,7 @@ void btrfs_add_delayed_iput(struct btrfs_inode *inode) if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) return; + WARN_ON_ONCE(test_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state)); atomic_inc(&fs_info->nr_delayed_iputs); /* * Need to be irq safe here because we can be called from either an irq @@ -3518,11 +3534,10 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans, int btrfs_orphan_cleanup(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key key, found_key; struct btrfs_trans_handle *trans; - struct inode *inode; u64 last_objectid = 0; int ret = 0, nr_unlink = 0; @@ -3541,6 +3556,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) key.offset = (u64)-1; while (1) { + struct btrfs_inode *inode; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; @@ -3664,10 +3681,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * deleted but wasn't. The inode number may have been reused, * but either way, we can delete the orphan item. 
*/ - if (!inode || inode->i_nlink) { + if (!inode || inode->vfs_inode.i_nlink) { if (inode) { - ret = btrfs_drop_verity_items(BTRFS_I(inode)); - iput(inode); + ret = btrfs_drop_verity_items(inode); + iput(&inode->vfs_inode); inode = NULL; if (ret) goto out; @@ -3690,7 +3707,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) nr_unlink++; /* this will do delete_inode and everything for us */ - iput(inode); + iput(&inode->vfs_inode); } /* release the path since we're done with it */ btrfs_release_path(path); @@ -3707,19 +3724,22 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) out: if (ret) btrfs_err(fs_info, "could not do orphan cleanup %d", ret); - btrfs_free_path(path); return ret; } /* - * very simple check to peek ahead in the leaf looking for xattrs. If we - * don't find any xattrs, we know there can't be any acls. + * Look ahead in the leaf for xattrs. If we don't find any then we know there + * can't be any ACLs. + * + * @leaf: the eb leaf where to search + * @slot: the slot the inode is in + * @objectid: the objectid of the inode * - * slot is the slot the inode is in, objectid is the objectid of the inode + * Return true if there is xattr/ACL, false otherwise. */ -static noinline int acls_after_inode_item(struct extent_buffer *leaf, - int slot, u64 objectid, - int *first_xattr_slot) +static noinline bool acls_after_inode_item(struct extent_buffer *leaf, + int slot, u64 objectid, + int *first_xattr_slot) { u32 nritems = btrfs_header_nritems(leaf); struct btrfs_key found_key; @@ -3739,45 +3759,50 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, while (slot < nritems) { btrfs_item_key_to_cpu(leaf, &found_key, slot); - /* we found a different objectid, there must not be acls */ + /* We found a different objectid, there must be no ACLs. */ if (found_key.objectid != objectid) - return 0; + return false; - /* we found an xattr, assume we've got an acl */ + /* We found an xattr, assume we've got an ACL. */ if (found_key.type == BTRFS_XATTR_ITEM_KEY) { if (*first_xattr_slot == -1) *first_xattr_slot = slot; if (found_key.offset == xattr_access || found_key.offset == xattr_default) - return 1; + return true; } /* - * we found a key greater than an xattr key, there can't - * be any acls later on + * We found a key greater than an xattr key, there can't be any + * ACLs later on. */ if (found_key.type > BTRFS_XATTR_ITEM_KEY) - return 0; + return false; slot++; scanned++; /* - * it goes inode, inode backrefs, xattrs, extents, - * so if there are a ton of hard links to an inode there can - * be a lot of backrefs. Don't waste time searching too hard, - * this is just an optimization + * The item order goes like: + * - inode + * - inode backrefs + * - xattrs + * - extents, + * + * so if there are lots of hard links to an inode there can be + * a lot of backrefs. Don't waste time searching too hard, + * this is just an optimization. */ if (scanned >= 8) break; } - /* we hit the end of the leaf before we found an xattr or - * something larger than an xattr. We have to assume the inode - * has acls + /* + * We hit the end of the leaf before we found an xattr or something + * larger than an xattr. We have to assume the inode has ACLs. 
*/ if (*first_xattr_slot == -1) *first_xattr_slot = slot; - return 1; + return true; } static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) @@ -3797,7 +3822,8 @@ static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) if (!inode->file_extent_tree) return -ENOMEM; - extent_io_tree_init(fs_info, inode->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); + btrfs_extent_io_tree_init(fs_info, inode->file_extent_tree, + IO_TREE_INODE_FILE_EXTENT); /* Lockdep class is set only for the file extent tree. */ lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class); @@ -3840,12 +3866,13 @@ static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) * * On failure clean up the inode. */ -static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) +static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path *path) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct inode *vfs_inode = &inode->vfs_inode; struct btrfs_key location; unsigned long ptr; int maybe_acls; @@ -3854,7 +3881,7 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) bool filled = false; int first_xattr_slot; - ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); + ret = btrfs_init_file_extent_tree(inode); if (ret) goto out; @@ -3864,7 +3891,7 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) ASSERT(path); - btrfs_get_inode_key(BTRFS_I(inode), &location); + btrfs_get_inode_key(inode, &location); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { @@ -3884,41 +3911,41 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - inode->i_mode = btrfs_inode_mode(leaf, inode_item); - set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); - i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); - i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); - btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); - btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, - round_up(i_size_read(inode), fs_info->sectorsize)); - - inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime), + vfs_inode->i_mode = btrfs_inode_mode(leaf, inode_item); + set_nlink(vfs_inode, btrfs_inode_nlink(leaf, inode_item)); + i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item)); + i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item)); + btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); + btrfs_inode_set_file_extent_range(inode, 0, + round_up(i_size_read(vfs_inode), fs_info->sectorsize)); + + inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime), btrfs_timespec_nsec(leaf, &inode_item->atime)); - inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime), + inode_set_mtime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->mtime), btrfs_timespec_nsec(leaf, &inode_item->mtime)); - inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime), + inode_set_ctime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->ctime), btrfs_timespec_nsec(leaf, &inode_item->ctime)); - BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); - BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, 
&inode_item->otime); + inode->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); + inode->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); - inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); - BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); - BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); + inode_set_bytes(vfs_inode, btrfs_inode_nbytes(leaf, inode_item)); + inode->generation = btrfs_inode_generation(leaf, inode_item); + inode->last_trans = btrfs_inode_transid(leaf, inode_item); - inode_set_iversion_queried(inode, - btrfs_inode_sequence(leaf, inode_item)); - inode->i_generation = BTRFS_I(inode)->generation; - inode->i_rdev = 0; + inode_set_iversion_queried(vfs_inode, btrfs_inode_sequence(leaf, inode_item)); + vfs_inode->i_generation = inode->generation; + vfs_inode->i_rdev = 0; rdev = btrfs_inode_rdev(leaf, inode_item); - if (S_ISDIR(inode->i_mode)) - BTRFS_I(inode)->index_cnt = (u64)-1; + if (S_ISDIR(vfs_inode->i_mode)) + inode->index_cnt = (u64)-1; btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), - &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); + &inode->flags, &inode->ro_flags); + btrfs_update_inode_mapping_flags(inode); cache_index: /* @@ -3930,9 +3957,8 @@ cache_index: * This is required for both inode re-read from disk and delayed inode * in the delayed_nodes xarray. */ - if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info)) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + if (inode->last_trans == btrfs_get_fs_generation(fs_info)) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); /* * We don't persist the id of the transaction where an unlink operation @@ -3961,7 +3987,7 @@ cache_index: * transaction commits on fsync if our inode is a directory, or if our * inode is not a directory, logging its parent unnecessarily. */ - BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + inode->last_unlink_trans = inode->last_trans; /* * Same logic as for last_unlink_trans. We don't persist the generation @@ -3969,15 +3995,15 @@ cache_index: * operation, so after eviction and reloading the inode we must be * pessimistic and assume the last transaction that modified the inode. 
*/ - BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans; + inode->last_reflink_trans = inode->last_trans; path->slots[0]++; - if (inode->i_nlink != 1 || + if (vfs_inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) goto cache_acl; btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); - if (location.objectid != btrfs_ino(BTRFS_I(inode))) + if (location.objectid != btrfs_ino(inode)) goto cache_acl; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); @@ -3985,13 +4011,12 @@ cache_index: struct btrfs_inode_ref *ref; ref = (struct btrfs_inode_ref *)ptr; - BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); + inode->dir_index = btrfs_inode_ref_index(leaf, ref); } else if (location.type == BTRFS_INODE_EXTREF_KEY) { struct btrfs_inode_extref *extref; extref = (struct btrfs_inode_extref *)ptr; - BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, - extref); + inode->dir_index = btrfs_inode_extref_index(leaf, extref); } cache_acl: /* @@ -3999,50 +4024,49 @@ cache_acl: * any xattrs or acls */ maybe_acls = acls_after_inode_item(leaf, path->slots[0], - btrfs_ino(BTRFS_I(inode)), &first_xattr_slot); + btrfs_ino(inode), &first_xattr_slot); if (first_xattr_slot != -1) { path->slots[0] = first_xattr_slot; ret = btrfs_load_inode_props(inode, path); if (ret) btrfs_err(fs_info, "error loading props for ino %llu (root %llu): %d", - btrfs_ino(BTRFS_I(inode)), - btrfs_root_id(root), ret); + btrfs_ino(inode), btrfs_root_id(root), ret); } if (!maybe_acls) - cache_no_acl(inode); + cache_no_acl(vfs_inode); - switch (inode->i_mode & S_IFMT) { + switch (vfs_inode->i_mode & S_IFMT) { case S_IFREG: - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; + vfs_inode->i_mapping->a_ops = &btrfs_aops; + vfs_inode->i_fop = &btrfs_file_operations; + vfs_inode->i_op = &btrfs_file_inode_operations; break; case S_IFDIR: - inode->i_fop = &btrfs_dir_file_operations; - inode->i_op = &btrfs_dir_inode_operations; + vfs_inode->i_fop = &btrfs_dir_file_operations; + vfs_inode->i_op = &btrfs_dir_inode_operations; break; case S_IFLNK: - inode->i_op = &btrfs_symlink_inode_operations; - inode_nohighmem(inode); - inode->i_mapping->a_ops = &btrfs_aops; + vfs_inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(vfs_inode); + vfs_inode->i_mapping->a_ops = &btrfs_aops; break; default: - inode->i_op = &btrfs_special_inode_operations; - init_special_inode(inode, inode->i_mode, rdev); + vfs_inode->i_op = &btrfs_special_inode_operations; + init_special_inode(vfs_inode, vfs_inode->i_mode, rdev); break; } btrfs_sync_inode_flags_to_i_flags(inode); - ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); + ret = btrfs_add_inode_to_root(inode, true); if (ret) goto out; return 0; out: - iget_failed(inode); + iget_failed(vfs_inode); return ret; } @@ -4102,7 +4126,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { struct btrfs_inode_item *inode_item; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key key; int ret; @@ -4116,7 +4140,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, if (ret) { if (ret > 0) ret = -ENOENT; - goto failed; + return ret; } leaf = path->nodes[0]; @@ -4125,10 +4149,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); btrfs_set_inode_last_trans(trans, inode); - ret = 0; 
-failed: - btrfs_free_path(path); - return ret; + return 0; } /* @@ -4462,7 +4483,7 @@ out: static noinline int may_destroy_subvol(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *di; struct btrfs_key key; struct fscrypt_str name = FSTR_INIT("default", 7); @@ -4484,7 +4505,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) btrfs_err(fs_info, "deleting default subvolume %llu is not allowed", key.objectid); - goto out; + return ret; } btrfs_release_path(path); } @@ -4495,14 +4516,13 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. */ - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } ret = 0; @@ -4512,8 +4532,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY) ret = -ENOTEMPTY; } -out: - btrfs_free_path(path); + return ret; } @@ -4756,20 +4775,80 @@ out_notrans: return ret; } +static bool is_inside_block(u64 bytenr, u64 blockstart, u32 blocksize) +{ + ASSERT(IS_ALIGNED(blockstart, blocksize), "blockstart=%llu blocksize=%u", + blockstart, blocksize); + + if (blockstart <= bytenr && bytenr <= blockstart + blocksize - 1) + return true; + return false; +} + +static int truncate_block_zero_beyond_eof(struct btrfs_inode *inode, u64 start) +{ + const pgoff_t index = (start >> PAGE_SHIFT); + struct address_space *mapping = inode->vfs_inode.i_mapping; + struct folio *folio; + u64 zero_start; + u64 zero_end; + int ret = 0; + +again: + folio = filemap_lock_folio(mapping, index); + /* No folio present. */ + if (IS_ERR(folio)) + return 0; + + if (!folio_test_uptodate(folio)) { + ret = btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; + } + if (!folio_test_uptodate(folio)) { + ret = -EIO; + goto out_unlock; + } + } + folio_wait_writeback(folio); + + /* + * We do not need to lock extents nor wait for OE, as it's already + * beyond EOF. + */ + + zero_start = max_t(u64, folio_pos(folio), start); + zero_end = folio_pos(folio) + folio_size(folio) - 1; + folio_zero_range(folio, zero_start - folio_pos(folio), + zero_end - zero_start + 1); + +out_unlock: + folio_unlock(folio); + folio_put(folio); + return ret; +} + /* - * Read, zero a chunk and write a block. + * Handle the truncation of a fs block. + * + * @inode - inode that we're zeroing + * @offset - the file offset of the block to truncate + * The value must be inside [@start, @end], and the function will do + * extra checks if the block that covers @offset needs to be zeroed. + * @start - the start file offset of the range we want to zero + * @end - the end (inclusive) file offset of the range we want to zero. * - * @inode - inode that we're zeroing - * @from - the offset to start zeroing - * @len - the length to zero, 0 to zero the entire range respective to the - * offset - * @front - zero up to the offset instead of from the offset on + * If the range is not block aligned, read out the folio that covers @offset, + * and if needed zero blocks that are inside the folio and covered by [@start, @end). + * If @start or @end + 1 lands inside a block, that block will be marked dirty + * for writeback. 
 * - * This will find the block for the "from" offset and cow the block and zero the - * part we want to zero. This is used with truncate and hole punching. + * This is utilized by hole punch, zero range, file expansion. */ -int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, - int front) +int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; @@ -4779,20 +4858,56 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, struct extent_changeset *data_reserved = NULL; bool only_release_metadata = false; u32 blocksize = fs_info->sectorsize; - pgoff_t index = from >> PAGE_SHIFT; - unsigned offset = from & (blocksize - 1); + pgoff_t index = (offset >> PAGE_SHIFT); struct folio *folio; gfp_t mask = btrfs_alloc_write_mask(mapping); size_t write_bytes = blocksize; int ret = 0; + const bool in_head_block = is_inside_block(offset, round_down(start, blocksize), + blocksize); + const bool in_tail_block = is_inside_block(offset, round_down(end, blocksize), + blocksize); + bool need_truncate_head = false; + bool need_truncate_tail = false; + u64 zero_start; + u64 zero_end; u64 block_start; u64 block_end; - if (IS_ALIGNED(offset, blocksize) && - (!len || IS_ALIGNED(len, blocksize))) + /* @offset should be inside the range. */ + ASSERT(start <= offset && offset <= end, "offset=%llu start=%llu end=%llu", + offset, start, end); + + /* The range is aligned at both ends. */ + if (IS_ALIGNED(start, blocksize) && IS_ALIGNED(end + 1, blocksize)) { + /* + * For block size < page size case, we may have polluted blocks + * beyond EOF. So we also need to zero them out. + */ + if (end == (u64)-1 && blocksize < PAGE_SIZE) + ret = truncate_block_zero_beyond_eof(inode, start); + goto out; + } + + /* + * @offset may not be inside the head or tail block. In that case we + * don't need to do anything. + */ + if (!in_head_block && !in_tail_block) goto out; - block_start = round_down(from, blocksize); + /* + * Skip the truncation if the range in the target block is already aligned. + * The seemingly complex check will also handle the same block case.
+ */ + if (in_head_block && !IS_ALIGNED(start, blocksize)) + need_truncate_head = true; + if (in_tail_block && !IS_ALIGNED(end + 1, blocksize)) + need_truncate_tail = true; + if (!need_truncate_head && !need_truncate_tail) + goto out; + + block_start = round_down(offset, blocksize); block_end = block_start + blocksize - 1; ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, @@ -4816,10 +4931,13 @@ again: folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); if (IS_ERR(folio)) { - btrfs_delalloc_release_space(inode, data_reserved, block_start, - blocksize, true); + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, blocksize, true); + else + btrfs_delalloc_release_space(inode, data_reserved, + block_start, blocksize, true); btrfs_delalloc_release_extents(inode, blocksize); - ret = -ENOMEM; + ret = PTR_ERR(folio); goto out; } @@ -4849,11 +4967,11 @@ again: folio_wait_writeback(folio); - lock_extent(io_tree, block_start, block_end, &cached_state); + btrfs_lock_extent(io_tree, block_start, block_end, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { - unlock_extent(io_tree, block_start, block_end, &cached_state); + btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); folio_unlock(folio); folio_put(folio); btrfs_start_ordered_extent(ordered); @@ -4861,37 +4979,46 @@ again: goto again; } - clear_extent_bit(&inode->io_tree, block_start, block_end, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - &cached_state); + btrfs_clear_extent_bit(&inode->io_tree, block_start, block_end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + &cached_state); ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, &cached_state); if (ret) { - unlock_extent(io_tree, block_start, block_end, &cached_state); + btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); goto out_unlock; } - if (offset != blocksize) { - if (!len) - len = blocksize - offset; - if (front) - folio_zero_range(folio, block_start - folio_pos(folio), - offset); - else - folio_zero_range(folio, - (block_start - folio_pos(folio)) + offset, - len); + if (end == (u64)-1) { + /* + * We're truncating beyond EOF, the remaining blocks normally are + * already holes thus no need to zero again, but it's possible for + * fs block size < page size cases to have memory mapped writes + * to pollute ranges beyond EOF. + * + * In that case although such polluted blocks beyond EOF will + * not reach disk, it still affects our page caches. 
+ */ + zero_start = max_t(u64, folio_pos(folio), start); + zero_end = min_t(u64, folio_pos(folio) + folio_size(folio) - 1, + end); + } else { + zero_start = max_t(u64, block_start, start); + zero_end = min_t(u64, block_end, end); } + folio_zero_range(folio, zero_start - folio_pos(folio), + zero_end - zero_start + 1); + btrfs_folio_clear_checked(fs_info, folio, block_start, block_end + 1 - block_start); btrfs_folio_set_dirty(fs_info, folio, block_start, block_end + 1 - block_start); - unlock_extent(io_tree, block_start, block_end, &cached_state); + btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) - set_extent_bit(&inode->io_tree, block_start, block_end, - EXTENT_NORESERVE, NULL); + btrfs_set_extent_bit(&inode->io_tree, block_start, block_end, + EXTENT_NORESERVE, NULL); out_unlock: if (ret) { @@ -4984,7 +5111,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) * rest of the block before we expand the i_size, otherwise we could * expose stale data. */ - ret = btrfs_truncate_block(inode, oldsize, 0, 0); + ret = btrfs_truncate_block(inode, oldsize, oldsize, -1); if (ret) return ret; @@ -5001,7 +5128,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) em = NULL; break; } - last_byte = min(extent_map_end(em), block_end); + last_byte = min(btrfs_extent_map_end(em), block_end); last_byte = ALIGN(last_byte, fs_info->sectorsize); hole_size = last_byte - cur_offset; @@ -5017,7 +5144,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) if (ret) break; - hole_em = alloc_extent_map(); + hole_em = btrfs_alloc_extent_map(); if (!hole_em) { btrfs_drop_extent_map_range(inode, cur_offset, cur_offset + hole_size - 1, @@ -5034,7 +5161,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) hole_em->generation = btrfs_get_fs_generation(fs_info); ret = btrfs_replace_extent_map_range(inode, hole_em, true); - free_extent_map(hole_em); + btrfs_free_extent_map(hole_em); } else { ret = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); @@ -5042,14 +5169,14 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) break; } next: - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; cur_offset = last_byte; if (cur_offset >= block_end) break; } - free_extent_map(em); - unlock_extent(io_tree, hole_start, block_end - 1, &cached_state); + btrfs_free_extent_map(em); + btrfs_unlock_extent(io_tree, hole_start, block_end - 1, &cached_state); return ret; } @@ -5233,7 +5360,7 @@ static void evict_inode_truncate_pages(struct inode *inode) state_flags = state->state; spin_unlock(&io_tree->lock); - lock_extent(io_tree, start, end, &cached_state); + btrfs_lock_extent(io_tree, start, end, &cached_state); /* * If still has DELALLOC flag, the extent didn't reach disk, @@ -5247,9 +5374,9 @@ static void evict_inode_truncate_pages(struct inode *inode) btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, end - start + 1, NULL); - clear_extent_bit(io_tree, start, end, - EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, - &cached_state); + btrfs_clear_extent_bit(io_tree, start, end, + EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, + &cached_state); cond_resched(); spin_lock(&io_tree->lock); @@ -5435,7 +5562,7 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type) { struct btrfs_dir_item *di; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = 
dir->root; int ret = 0; struct fscrypt_name fname; @@ -5446,7 +5573,7 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); if (ret < 0) - goto out; + return ret; /* * fscrypt_setup_filename() should never return a positive value, but * gcc on sparc/parisc thinks it can, so assert that doesn't happen. @@ -5475,7 +5602,6 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, *type = btrfs_dir_ftype(path->nodes[0], di); out: fscrypt_free_filename(&fname); - btrfs_free_path(path); return ret; } @@ -5490,7 +5616,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, struct btrfs_key *location, struct btrfs_root **sub_root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *new_root; struct btrfs_root_ref *ref; struct extent_buffer *leaf; @@ -5546,7 +5672,6 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, location->offset = 0; err = 0; out: - btrfs_free_path(path); fscrypt_free_filename(&fname); return err; } @@ -5597,7 +5722,7 @@ static int btrfs_find_actor(struct inode *inode, void *opaque) args->root == BTRFS_I(inode)->root; } -static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) +static struct btrfs_inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; @@ -5609,40 +5734,42 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor, btrfs_init_locked_inode, (void *)&args); - return inode; + if (!inode) + return NULL; + return BTRFS_I(inode); } /* * Get an inode object given its inode number and corresponding root. Path is * preallocated to prevent recursing back to iget through allocator. */ -struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, - struct btrfs_path *path) +struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, + struct btrfs_path *path) { - struct inode *inode; + struct btrfs_inode *inode; int ret; inode = btrfs_iget_locked(ino, root); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode->vfs_inode.i_state & I_NEW)) return inode; ret = btrfs_read_locked_inode(inode, path); if (ret) return ERR_PTR(ret); - unlock_new_inode(inode); + unlock_new_inode(&inode->vfs_inode); return inode; } /* * Get an inode object given its inode number and corresponding root. 
*/ -struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) +struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root) { - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_path *path; int ret; @@ -5650,55 +5777,60 @@ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode->vfs_inode.i_state & I_NEW)) return inode; path = btrfs_alloc_path(); - if (!path) + if (!path) { + iget_failed(&inode->vfs_inode); return ERR_PTR(-ENOMEM); + } ret = btrfs_read_locked_inode(inode, path); btrfs_free_path(path); if (ret) return ERR_PTR(ret); - unlock_new_inode(inode); + unlock_new_inode(&inode->vfs_inode); return inode; } -static struct inode *new_simple_dir(struct inode *dir, - struct btrfs_key *key, - struct btrfs_root *root) +static struct btrfs_inode *new_simple_dir(struct inode *dir, + struct btrfs_key *key, + struct btrfs_root *root) { struct timespec64 ts; - struct inode *inode = new_inode(dir->i_sb); + struct inode *vfs_inode; + struct btrfs_inode *inode; - if (!inode) + vfs_inode = new_inode(dir->i_sb); + if (!vfs_inode) return ERR_PTR(-ENOMEM); - BTRFS_I(inode)->root = btrfs_grab_root(root); - BTRFS_I(inode)->ref_root_id = key->objectid; - set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags); - set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); + inode = BTRFS_I(vfs_inode); + inode->root = btrfs_grab_root(root); + inode->ref_root_id = key->objectid; + set_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags); + set_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags); - btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); + btrfs_set_inode_number(inode, BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); /* * We only need lookup, the rest is read-only and there's no inode * associated with the dentry */ - inode->i_op = &simple_dir_inode_operations; - inode->i_opflags &= ~IOP_XATTR; - inode->i_fop = &simple_dir_operations; - inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; + vfs_inode->i_op = &simple_dir_inode_operations; + vfs_inode->i_opflags &= ~IOP_XATTR; + vfs_inode->i_fop = &simple_dir_operations; + vfs_inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - ts = inode_set_ctime_current(inode); - inode_set_mtime_to_ts(inode, ts); - inode_set_atime_to_ts(inode, inode_get_atime(dir)); - BTRFS_I(inode)->i_otime_sec = ts.tv_sec; - BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec; + ts = inode_set_ctime_current(vfs_inode); + inode_set_mtime_to_ts(vfs_inode, ts); + inode_set_atime_to_ts(vfs_inode, inode_get_atime(dir)); + inode->i_otime_sec = ts.tv_sec; + inode->i_otime_nsec = ts.tv_nsec; - inode->i_uid = dir->i_uid; - inode->i_gid = dir->i_gid; + vfs_inode->i_uid = dir->i_uid; + vfs_inode->i_gid = dir->i_gid; return inode; } @@ -5712,15 +5844,15 @@ static_assert(BTRFS_FT_FIFO == FT_FIFO); static_assert(BTRFS_FT_SOCK == FT_SOCK); static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK); -static inline u8 btrfs_inode_type(struct inode *inode) +static inline u8 btrfs_inode_type(const struct btrfs_inode *inode) { - return fs_umode_to_ftype(inode->i_mode); + return fs_umode_to_ftype(inode->vfs_inode.i_mode); } struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) { struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *sub_root = root; struct btrfs_key location = { 0 }; @@ -5737,18 +5869,18 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct 
dentry *dentry) if (location.type == BTRFS_INODE_ITEM_KEY) { inode = btrfs_iget(location.objectid, root); if (IS_ERR(inode)) - return inode; + return ERR_CAST(inode); /* Do extra check against inode mode with di_type */ if (btrfs_inode_type(inode) != di_type) { btrfs_crit(fs_info, "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u", - inode->i_mode, btrfs_inode_type(inode), + inode->vfs_inode.i_mode, btrfs_inode_type(inode), di_type); - iput(inode); + iput(&inode->vfs_inode); return ERR_PTR(-EUCLEAN); } - return inode; + return &inode->vfs_inode; } ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry, @@ -5763,19 +5895,22 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) btrfs_put_root(sub_root); if (IS_ERR(inode)) - return inode; + return ERR_CAST(inode); down_read(&fs_info->cleanup_work_sem); - if (!sb_rdonly(inode->i_sb)) + if (!sb_rdonly(inode->vfs_inode.i_sb)) ret = btrfs_orphan_cleanup(sub_root); up_read(&fs_info->cleanup_work_sem); if (ret) { - iput(inode); + iput(&inode->vfs_inode); inode = ERR_PTR(ret); } } - return inode; + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return &inode->vfs_inode; } static int btrfs_dentry_delete(const struct dentry *dentry) @@ -5815,7 +5950,7 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; struct btrfs_key key, found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; int ret; @@ -5829,15 +5964,14 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; /* FIXME: we should be able to handle this */ if (ret == 0) - goto out; - ret = 0; + return ret; if (path->slots[0] == 0) { inode->index_cnt = BTRFS_DIR_START_INDEX; - goto out; + return 0; } path->slots[0]--; @@ -5848,13 +5982,12 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) if (found_key.objectid != btrfs_ino(inode) || found_key.type != BTRFS_DIR_INDEX_KEY) { inode->index_cnt = BTRFS_DIR_START_INDEX; - goto out; + return 0; } inode->index_cnt = found_key.offset + 1; -out: - btrfs_free_path(path); - return ret; + + return 0; } static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index) @@ -5957,7 +6090,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); void *addr; LIST_HEAD(ins_list); LIST_HEAD(del_list); @@ -6070,7 +6203,6 @@ nopos: err: if (put) btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list); - btrfs_free_path(path); return ret; } @@ -6248,7 +6380,7 @@ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode * inode->flags |= BTRFS_INODE_NODATASUM; } - btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + btrfs_sync_inode_flags_to_i_flags(inode); } int btrfs_create_new_inode(struct btrfs_trans_handle *trans, @@ -6334,6 +6466,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (btrfs_test_opt(fs_info, NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; + btrfs_update_inode_mapping_flags(BTRFS_I(inode)); } ret = btrfs_insert_inode_locked(inode); @@ -6427,7 +6560,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, path = NULL; if (args->subvol) { - struct inode *parent; + struct btrfs_inode *parent; /* * Subvolumes inherit properties 
from their parent subvolume, @@ -6437,11 +6570,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (IS_ERR(parent)) { ret = PTR_ERR(parent); } else { - ret = btrfs_inode_inherit_props(trans, inode, parent); - iput(parent); + ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode), + parent); + iput(&parent->vfs_inode); } } else { - ret = btrfs_inode_inherit_props(trans, inode, dir); + ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode), + BTRFS_I(dir)); } if (ret) { btrfs_err(fs_info, @@ -6539,7 +6674,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, return ret; ret = btrfs_insert_dir_item(trans, name, parent_inode, &key, - btrfs_inode_type(&inode->vfs_inode), index); + btrfs_inode_type(inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; else if (ret) { @@ -6739,18 +6874,18 @@ fail: return err; } -static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; inode = new_inode(dir->i_sb); if (!inode) - return -ENOMEM; + return ERR_PTR(-ENOMEM); inode_init_owner(idmap, inode, dir, S_IFDIR | mode); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - return btrfs_create_common(dir, dentry, inode); + return ERR_PTR(btrfs_create_common(dir, dentry, inode)); } static noinline int uncompress_inline(struct btrfs_path *path, @@ -6759,6 +6894,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, { int ret; struct extent_buffer *leaf = path->nodes[0]; + const u32 blocksize = leaf->fs_info->sectorsize; char *tmp; size_t max_size; unsigned long inline_size; @@ -6775,7 +6911,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); - max_size = min_t(unsigned long, PAGE_SIZE, max_size); + max_size = min_t(unsigned long, blocksize, max_size); ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size, max_size); @@ -6787,14 +6923,15 @@ static noinline int uncompress_inline(struct btrfs_path *path, * cover that region here. 
*/ - if (max_size < PAGE_SIZE) - folio_zero_range(folio, max_size, PAGE_SIZE - max_size); + if (max_size < blocksize) + folio_zero_range(folio, max_size, blocksize - max_size); kfree(tmp); return ret; } static int read_inline_extent(struct btrfs_path *path, struct folio *folio) { + const u32 blocksize = path->nodes[0]->fs_info->sectorsize; struct btrfs_file_extent_item *fi; void *kaddr; size_t copy_size; @@ -6809,14 +6946,14 @@ static int read_inline_extent(struct btrfs_path *path, struct folio *folio) if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) return uncompress_inline(path, folio, fi); - copy_size = min_t(u64, PAGE_SIZE, + copy_size = min_t(u64, blocksize, btrfs_file_extent_ram_bytes(path->nodes[0], fi)); kaddr = kmap_local_folio(folio, 0); read_extent_buffer(path->nodes[0], kaddr, btrfs_file_extent_inline_start(fi), copy_size); kunmap_local(kaddr); - if (copy_size < PAGE_SIZE) - folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size); + if (copy_size < blocksize) + folio_zero_range(folio, copy_size, blocksize - copy_size); return 0; } @@ -6855,18 +6992,18 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct extent_map_tree *em_tree = &inode->extent_tree; read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); + em = btrfs_lookup_extent_mapping(em_tree, start, len); read_unlock(&em_tree->lock); if (em) { if (em->start > start || em->start + em->len <= start) - free_extent_map(em); + btrfs_free_extent_map(em); else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio) - free_extent_map(em); + btrfs_free_extent_map(em); else goto out; } - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { ret = -ENOMEM; goto out; @@ -7003,7 +7140,7 @@ not_found: insert: ret = 0; btrfs_release_path(path); - if (em->start > start || extent_map_end(em) <= start) { + if (em->start > start || btrfs_extent_map_end(em) <= start) { btrfs_err(fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", em->start, em->len, start, len); @@ -7020,7 +7157,7 @@ out: trace_btrfs_get_extent(root, inode, em); if (ret) { - free_extent_map(em); + btrfs_free_extent_map(em); return ERR_PTR(ret); } return em; @@ -7057,17 +7194,17 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) * NOTE: This only checks the file extents, caller is responsible to wait for * any ordered extents. 
*/ -noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, +noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, bool nowait) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct can_nocow_file_extent_args nocow_args = { 0 }; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; struct extent_buffer *leaf; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_file_extent_item *fi; struct btrfs_key key; int found_type; @@ -7077,35 +7214,34 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, return -ENOMEM; path->nowait = nowait; - ret = btrfs_lookup_file_extent(NULL, root, path, - btrfs_ino(BTRFS_I(inode)), offset, 0); + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), + offset, 0); if (ret < 0) - goto out; + return ret; if (ret == 1) { if (path->slots[0] == 0) { - /* can't find the item, must cow */ - ret = 0; - goto out; + /* Can't find the item, must COW. */ + return 0; } path->slots[0]--; } ret = 0; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != btrfs_ino(BTRFS_I(inode)) || + if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) { - /* not our file or wrong item type, must cow */ - goto out; + /* Not our file or wrong item type, must COW. */ + return 0; } if (key.offset > offset) { - /* Wrong offset, must cow */ - goto out; + /* Wrong offset, must COW. */ + return 0; } if (btrfs_file_extent_end(path) <= offset) - goto out; + return 0; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); found_type = btrfs_file_extent_type(leaf, fi); @@ -7114,43 +7250,38 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, nocow_args.end = offset + *len - 1; nocow_args.free_path = true; - ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args); + ret = can_nocow_file_extent(path, &key, inode, &nocow_args); /* can_nocow_file_extent() has freed the path. */ path = NULL; if (ret != 1) { /* Treat errors as not being able to NOCOW. 
*/ - ret = 0; - goto out; + return 0; } - ret = 0; if (btrfs_extent_readonly(fs_info, nocow_args.file_extent.disk_bytenr + nocow_args.file_extent.offset)) - goto out; + return 0; - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + if (!(inode->flags & BTRFS_INODE_NODATACOW) && found_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 range_end; range_end = round_up(offset + nocow_args.file_extent.num_bytes, root->fs_info->sectorsize) - 1; - ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC); - if (ret) { - ret = -EAGAIN; - goto out; - } + ret = btrfs_test_range_bit_exists(io_tree, offset, range_end, + EXTENT_DELALLOC); + if (ret) + return -EAGAIN; } if (file_extent) memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent)); *len = nocow_args.file_extent.num_bytes; - ret = 1; -out: - btrfs_free_path(path); - return ret; + + return 1; } /* The callers of this must take lock_extent() */ @@ -7198,7 +7329,7 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, break; } - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) return ERR_PTR(-ENOMEM); @@ -7211,15 +7342,15 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, em->offset = file_extent->offset; em->flags |= EXTENT_FLAG_PINNED; if (type == BTRFS_ORDERED_COMPRESSED) - extent_map_set_compression(em, file_extent->compression); + btrfs_extent_map_set_compression(em, file_extent->compression); ret = btrfs_replace_extent_map_range(inode, em, true); if (ret) { - free_extent_map(em); + btrfs_free_extent_map(em); return ERR_PTR(ret); } - /* em got 2 refs now, callers needs to do free_extent_map once. */ + /* em got 2 refs now, callers needs to do btrfs_free_extent_map once. */ return em; } @@ -7235,7 +7366,7 @@ static void wait_subpage_spinlock(struct folio *folio) struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_is_subpage(fs_info, folio)) return; ASSERT(folio_test_private(folio) && folio_get_private(folio)); @@ -7259,7 +7390,7 @@ static void wait_subpage_spinlock(struct folio *folio) static int btrfs_launder_folio(struct folio *folio) { return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio), - PAGE_SIZE, NULL); + folio_size(folio), NULL); } static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) @@ -7346,7 +7477,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, } if (!inode_evicting) - lock_extent(tree, page_start, page_end, &cached_state); + btrfs_lock_extent(tree, page_start, page_end, &cached_state); cur = page_start; while (cur < page_end) { @@ -7402,10 +7533,10 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * btrfs_finish_ordered_io(). */ if (!inode_evicting) - clear_extent_bit(tree, cur, range_end, - EXTENT_DELALLOC | - EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, &cached_state); + btrfs_clear_extent_bit(tree, cur, range_end, + EXTENT_DELALLOC | + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, &cached_state); spin_lock_irq(&inode->ordered_tree_lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); @@ -7447,12 +7578,11 @@ next: * Since the IO will never happen for this page. 
*/ btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL); - if (!inode_evicting) { - clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | - EXTENT_DELALLOC | EXTENT_UPTODATE | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG | - extra_flags, &cached_state); - } + if (!inode_evicting) + btrfs_clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG | extra_flags, + &cached_state); cur = range_end + 1; } /* @@ -7556,7 +7686,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); control.new_size = new_size; - lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); + btrfs_lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); /* * We want to drop from the next block forward in case this new * size is not block aligned since we will be keeping the last @@ -7571,7 +7701,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); btrfs_inode_safe_disk_i_size_write(inode, control.last_size); - unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); + btrfs_unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) @@ -7615,7 +7745,8 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); - ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0); + ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, + inode->vfs_inode.i_size, (u64)-1); if (ret) goto out; trans = btrfs_start_transaction(root, 1); @@ -7727,10 +7858,10 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->i_otime_nsec = 0; inode = &ei->vfs_inode; - extent_map_tree_init(&ei->extent_tree); + btrfs_extent_map_tree_init(&ei->extent_tree); /* This io tree sets the valid inode. 
*/ - extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); + btrfs_extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); ei->io_tree.inode = ei; ei->file_extent_tree = NULL; @@ -8494,8 +8625,6 @@ static int start_delalloc_inodes(struct btrfs_root *root, struct writeback_control *wbc, bool snapshot, bool in_reclaim_context) { - struct btrfs_inode *binode; - struct inode *inode; struct btrfs_delalloc_work *work, *next; LIST_HEAD(works); LIST_HEAD(splice); @@ -8506,30 +8635,30 @@ static int start_delalloc_inodes(struct btrfs_root *root, spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { - binode = list_entry(splice.next, struct btrfs_inode, - delalloc_inodes); + struct btrfs_inode *inode; + struct inode *tmp_inode; + + inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes); - list_move_tail(&binode->delalloc_inodes, - &root->delalloc_inodes); + list_move_tail(&inode->delalloc_inodes, &root->delalloc_inodes); if (in_reclaim_context && - test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags)) + test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags)) continue; - inode = igrab(&binode->vfs_inode); - if (!inode) { + tmp_inode = igrab(&inode->vfs_inode); + if (!tmp_inode) { cond_resched_lock(&root->delalloc_lock); continue; } spin_unlock(&root->delalloc_lock); if (snapshot) - set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, - &binode->runtime_flags); + set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags); if (full_flush) { - work = btrfs_alloc_delalloc_work(inode); + work = btrfs_alloc_delalloc_work(&inode->vfs_inode); if (!work) { - iput(inode); + iput(&inode->vfs_inode); ret = -ENOMEM; goto out; } @@ -8537,8 +8666,8 @@ static int start_delalloc_inodes(struct btrfs_root *root, btrfs_queue_work(root->fs_info->flush_workers, &work->work); } else { - ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc); + btrfs_add_delayed_iput(inode); if (ret || wbc->nr_to_write <= 0) goto out; } @@ -8655,7 +8784,12 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct extent_buffer *leaf; name_len = strlen(symname); - if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) + /* + * Symlinks utilize uncompressed inline extent data, which should not + * reach block size. 
+ */ + if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) || + name_len >= fs_info->sectorsize) return -ENAMETOOLONG; inode = new_inode(dir->i_sb); @@ -8694,8 +8828,8 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, goto out; } key.objectid = btrfs_ino(BTRFS_I(inode)); - key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; datasize = btrfs_file_extent_calc_inline_size(name_len); err = btrfs_insert_empty_item(trans, root, path, &key, datasize); @@ -8868,11 +9002,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_free_reserved_extent(fs_info, ins.objectid, - ins.offset, 0); + ins.offset, false); break; } - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset, cur_offset + ins.offset - 1, false); @@ -8890,7 +9024,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, em->generation = trans->transid; ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true); - free_extent_map(em); + btrfs_free_extent_map(em); next: num_bytes -= ins.offset; cur_offset += ins.offset; @@ -9062,7 +9196,7 @@ static ssize_t btrfs_encoded_read_inline( struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_file_extent_item *item; u64 ram_bytes; @@ -9072,10 +9206,8 @@ static ssize_t btrfs_encoded_read_inline( const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; path->nowait = nowait; @@ -9084,9 +9216,9 @@ static ssize_t btrfs_encoded_read_inline( if (ret) { if (ret > 0) { /* The extent item disappeared? 
*/ - ret = -EIO; + return -EIO; } - goto out; + return ret; } leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -9099,17 +9231,16 @@ static ssize_t btrfs_encoded_read_inline( ret = btrfs_encoded_io_compression_from_extent(fs_info, btrfs_file_extent_compression(leaf, item)); if (ret < 0) - goto out; + return ret; encoded->compression = ret; if (encoded->compression) { size_t inline_size; inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); - if (inline_size > count) { - ret = -ENOBUFS; - goto out; - } + if (inline_size > count) + return -ENOBUFS; + count = inline_size; encoded->unencoded_len = ram_bytes; encoded->unencoded_offset = iocb->ki_pos - extent_start; @@ -9121,13 +9252,12 @@ static ssize_t btrfs_encoded_read_inline( } tmp = kmalloc(count, GFP_NOFS); - if (!tmp) { - ret = -ENOMEM; - goto out; - } + if (!tmp) + return -ENOMEM; + read_extent_buffer(leaf, tmp, ptr, count); btrfs_release_path(path); - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; @@ -9135,13 +9265,12 @@ static ssize_t btrfs_encoded_read_inline( if (ret != count) ret = -EFAULT; kfree(tmp); -out: - btrfs_free_path(path); + return ret; } struct btrfs_encoded_read_private { - struct completion done; + struct completion *sync_reads; void *uring_ctx; refcount_t pending_refs; blk_status_t status; @@ -9153,11 +9282,10 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) if (bbio->bio.bi_status) { /* - * The memory barrier implied by the atomic_dec_return() here - * pairs with the memory barrier implied by the - * atomic_dec_return() or io_wait_event() in - * btrfs_encoded_read_regular_fill_pages() to ensure that this - * write is observed before the load of status in + * The memory barrier implied by the refcount_dec_and_test() here + * pairs with the memory barrier implied by the refcount_dec_and_test() + * in btrfs_encoded_read_regular_fill_pages() to ensure that + * this write is observed before the load of status in * btrfs_encoded_read_regular_fill_pages(). */ WRITE_ONCE(priv->status, bbio->bio.bi_status); @@ -9169,7 +9297,7 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) btrfs_uring_read_extent_endio(priv->uring_ctx, err); kfree(priv); } else { - complete(&priv->done); + complete(priv->sync_reads); } } bio_put(&bbio->bio); @@ -9180,16 +9308,26 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, struct page **pages, void *uring_ctx) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_encoded_read_private *priv; + struct btrfs_encoded_read_private *priv, sync_priv; + struct completion sync_reads; unsigned long i = 0; struct btrfs_bio *bbio; int ret; - priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); - if (!priv) - return -ENOMEM; + /* + * Fast path for synchronous reads which completes in this call, io_uring + * needs longer time span. 
+ */ + if (uring_ctx) { + priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); + if (!priv) + return -ENOMEM; + } else { + priv = &sync_priv; + init_completion(&sync_reads); + priv->sync_reads = &sync_reads; + } - init_completion(&priv->done); refcount_set(&priv->pending_refs, 1); priv->status = 0; priv->uring_ctx = uring_ctx; @@ -9232,11 +9370,9 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, return -EIOCBQUEUED; } else { if (!refcount_dec_and_test(&priv->pending_refs)) - wait_for_completion_io(&priv->done); + wait_for_completion_io(&sync_reads); /* See btrfs_encoded_read_endio() for ordering. */ - ret = blk_status_to_errno(READ_ONCE(priv->status)); - kfree(priv); - return ret; + return blk_status_to_errno(READ_ONCE(priv->status)); } } @@ -9269,7 +9405,7 @@ ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, if (ret) goto out; - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; @@ -9346,7 +9482,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, goto out_unlock_inode; } - if (!try_lock_extent(io_tree, start, lockend, cached_state)) { + if (!btrfs_try_lock_extent(io_tree, start, lockend, cached_state)) { ret = -EAGAIN; goto out_unlock_inode; } @@ -9355,7 +9491,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, lockend - start + 1); if (ordered) { btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); ret = -EAGAIN; goto out_unlock_inode; } @@ -9368,13 +9504,13 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, if (ret) goto out_unlock_inode; - lock_extent(io_tree, start, lockend, cached_state); + btrfs_lock_extent(io_tree, start, lockend, cached_state); ordered = btrfs_lookup_ordered_range(inode, start, lockend - start + 1); if (!ordered) break; btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); cond_resched(); } } @@ -9392,7 +9528,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, * For inline extents we get everything we need out of the * extent item. */ - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, cached_state, extent_start, @@ -9404,7 +9540,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, * We only want to return up to EOF even if the extent extends beyond * that. 
*/ - encoded->len = min_t(u64, extent_map_end(em), + encoded->len = min_t(u64, btrfs_extent_map_end(em), inode->vfs_inode.i_size) - iocb->ki_pos; if (em->disk_bytenr == EXTENT_MAP_HOLE || (em->flags & EXTENT_FLAG_PREALLOC)) { @@ -9412,7 +9548,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, count = min_t(u64, count, encoded->len); encoded->len = count; encoded->unencoded_len = count; - } else if (extent_map_is_compressed(em)) { + } else if (btrfs_extent_map_is_compressed(em)) { *disk_bytenr = em->disk_bytenr; /* * Bail if the buffer isn't large enough to return the whole @@ -9427,12 +9563,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, encoded->unencoded_len = em->ram_bytes; encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset); ret = btrfs_encoded_io_compression_from_extent(fs_info, - extent_map_compression(em)); + btrfs_extent_map_compression(em)); if (ret < 0) goto out_em; encoded->compression = ret; } else { - *disk_bytenr = extent_map_block_start(em) + (start - em->start); + *disk_bytenr = btrfs_extent_map_block_start(em) + (start - em->start); if (encoded->len > count) encoded->len = count; /* @@ -9445,11 +9581,11 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, encoded->unencoded_len = count; *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize); } - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; if (*disk_bytenr == EXTENT_MAP_HOLE) { - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); unlocked = true; ret = iov_iter_zero(count, iter); @@ -9461,11 +9597,11 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, } out_em: - free_extent_map(em); + btrfs_free_extent_map(em); out_unlock_extent: /* Leave inode and extent locked if we need to do a read. 
*/ if (!unlocked && ret != -EIOCBQUEUED) - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); out_unlock_inode: if (!unlocked && ret != -EIOCBQUEUED) btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); @@ -9612,14 +9748,14 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, end >> PAGE_SHIFT); if (ret) goto out_folios; - lock_extent(io_tree, start, end, &cached_state); + btrfs_lock_extent(io_tree, start, end, &cached_state); ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); if (!ordered && !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end)) break; if (ordered) btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, end, &cached_state); + btrfs_unlock_extent(io_tree, start, end, &cached_state); cond_resched(); } @@ -9669,11 +9805,11 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, ret = PTR_ERR(em); goto out_free_reserved; } - free_extent_map(em); + btrfs_free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, - (1 << BTRFS_ORDERED_ENCODED) | - (1 << BTRFS_ORDERED_COMPRESSED)); + (1U << BTRFS_ORDERED_ENCODED) | + (1U << BTRFS_ORDERED_COMPRESSED)); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); @@ -9684,7 +9820,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (start + encoded->len > inode->vfs_inode.i_size) i_size_write(&inode->vfs_inode, start + encoded->len); - unlock_extent(io_tree, start, end, &cached_state); + btrfs_unlock_extent(io_tree, start, end, &cached_state); btrfs_delalloc_release_extents(inode, num_bytes); @@ -9694,7 +9830,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, out_free_reserved: btrfs_dec_block_group_reservations(fs_info, ins.objectid); - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); out_delalloc_release: btrfs_delalloc_release_extents(inode, num_bytes); btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); @@ -9707,9 +9843,9 @@ out_free_data_space: * bytes_may_use. */ if (!extent_reserved) - btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); + btrfs_free_reserved_data_space_noquota(inode, disk_num_bytes); out_unlock: - unlock_extent(io_tree, start, end, &cached_state); + btrfs_unlock_extent(io_tree, start, end, &cached_state); out_folios: for (i = 0; i < nr_folios; i++) { if (folios[i]) @@ -9974,7 +10110,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); - lock_extent(io_tree, 0, isize - 1, &cached_state); + btrfs_lock_extent(io_tree, 0, isize - 1, &cached_state); while (prev_extent_end < isize) { struct btrfs_key key; struct extent_buffer *leaf; @@ -10152,7 +10288,7 @@ out: if (!IS_ERR_OR_NULL(map)) btrfs_free_chunk_map(map); - unlock_extent(io_tree, 0, isize - 1, &cached_state); + btrfs_unlock_extent(io_tree, 0, isize - 1, &cached_state); if (ret) btrfs_swap_deactivate(file); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6c18bad53cd3..913acef3f0a9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -118,8 +118,8 @@ struct btrfs_ioctl_encoded_io_args_32 { #endif /* Mask out flags that are inappropriate for the given type of inode. 
*/ -static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, - unsigned int flags) +static unsigned int btrfs_mask_fsflags_for_type(const struct inode *inode, + unsigned int flags) { if (S_ISDIR(inode->i_mode)) return flags; @@ -133,11 +133,11 @@ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS * ioctl. */ -static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode) +static unsigned int btrfs_inode_flags_to_fsflags(const struct btrfs_inode *inode) { unsigned int iflags = 0; - u32 flags = binode->flags; - u32 ro_flags = binode->ro_flags; + u32 flags = inode->flags; + u32 ro_flags = inode->ro_flags; if (flags & BTRFS_INODE_SYNC) iflags |= FS_SYNC_FL; @@ -167,25 +167,24 @@ static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode) /* * Update inode->i_flags based on the btrfs internal flags. */ -void btrfs_sync_inode_flags_to_i_flags(struct inode *inode) +void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode) { - struct btrfs_inode *binode = BTRFS_I(inode); unsigned int new_fl = 0; - if (binode->flags & BTRFS_INODE_SYNC) + if (inode->flags & BTRFS_INODE_SYNC) new_fl |= S_SYNC; - if (binode->flags & BTRFS_INODE_IMMUTABLE) + if (inode->flags & BTRFS_INODE_IMMUTABLE) new_fl |= S_IMMUTABLE; - if (binode->flags & BTRFS_INODE_APPEND) + if (inode->flags & BTRFS_INODE_APPEND) new_fl |= S_APPEND; - if (binode->flags & BTRFS_INODE_NOATIME) + if (inode->flags & BTRFS_INODE_NOATIME) new_fl |= S_NOATIME; - if (binode->flags & BTRFS_INODE_DIRSYNC) + if (inode->flags & BTRFS_INODE_DIRSYNC) new_fl |= S_DIRSYNC; - if (binode->ro_flags & BTRFS_INODE_RO_VERITY) + if (inode->ro_flags & BTRFS_INODE_RO_VERITY) new_fl |= S_VERITY; - set_mask_bits(&inode->i_flags, + set_mask_bits(&inode->vfs_inode.i_flags, S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC | S_VERITY, new_fl); } @@ -219,7 +218,7 @@ static int check_fsflags(unsigned int old_flags, unsigned int flags) return 0; } -static int check_fsflags_compatible(struct btrfs_fs_info *fs_info, +static int check_fsflags_compatible(const struct btrfs_fs_info *fs_info, unsigned int flags) { if (btrfs_is_zoned(fs_info) && (flags & FS_NOCOW_FL)) @@ -248,24 +247,23 @@ static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_ */ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) { - struct btrfs_inode *binode = BTRFS_I(d_inode(dentry)); + const struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); - fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode)); + fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(inode)); return 0; } int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { - struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_inode *binode = BTRFS_I(inode); - struct btrfs_root *root = binode->root; + struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; unsigned int fsflags, old_fsflags; int ret; const char *comp = NULL; - u32 binode_flags; + u32 inode_flags; if (btrfs_root_readonly(root)) return -EROFS; @@ -273,8 +271,8 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; - fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags); - old_fsflags = btrfs_inode_flags_to_fsflags(binode); + fsflags = 
btrfs_mask_fsflags_for_type(&inode->vfs_inode, fa->flags); + old_fsflags = btrfs_inode_flags_to_fsflags(inode); ret = check_fsflags(old_fsflags, fsflags); if (ret) return ret; @@ -283,27 +281,27 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, if (ret) return ret; - binode_flags = binode->flags; + inode_flags = inode->flags; if (fsflags & FS_SYNC_FL) - binode_flags |= BTRFS_INODE_SYNC; + inode_flags |= BTRFS_INODE_SYNC; else - binode_flags &= ~BTRFS_INODE_SYNC; + inode_flags &= ~BTRFS_INODE_SYNC; if (fsflags & FS_IMMUTABLE_FL) - binode_flags |= BTRFS_INODE_IMMUTABLE; + inode_flags |= BTRFS_INODE_IMMUTABLE; else - binode_flags &= ~BTRFS_INODE_IMMUTABLE; + inode_flags &= ~BTRFS_INODE_IMMUTABLE; if (fsflags & FS_APPEND_FL) - binode_flags |= BTRFS_INODE_APPEND; + inode_flags |= BTRFS_INODE_APPEND; else - binode_flags &= ~BTRFS_INODE_APPEND; + inode_flags &= ~BTRFS_INODE_APPEND; if (fsflags & FS_NODUMP_FL) - binode_flags |= BTRFS_INODE_NODUMP; + inode_flags |= BTRFS_INODE_NODUMP; else - binode_flags &= ~BTRFS_INODE_NODUMP; + inode_flags &= ~BTRFS_INODE_NODUMP; if (fsflags & FS_NOATIME_FL) - binode_flags |= BTRFS_INODE_NOATIME; + inode_flags |= BTRFS_INODE_NOATIME; else - binode_flags &= ~BTRFS_INODE_NOATIME; + inode_flags &= ~BTRFS_INODE_NOATIME; /* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */ if (!fa->flags_valid) { @@ -315,32 +313,32 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, } if (fsflags & FS_DIRSYNC_FL) - binode_flags |= BTRFS_INODE_DIRSYNC; + inode_flags |= BTRFS_INODE_DIRSYNC; else - binode_flags &= ~BTRFS_INODE_DIRSYNC; + inode_flags &= ~BTRFS_INODE_DIRSYNC; if (fsflags & FS_NOCOW_FL) { - if (S_ISREG(inode->i_mode)) { + if (S_ISREG(inode->vfs_inode.i_mode)) { /* * It's safe to turn csums off here, no extents exist. * Otherwise we want the flag to reflect the real COW * status of the file and will not set it. */ - if (inode->i_size == 0) - binode_flags |= BTRFS_INODE_NODATACOW | - BTRFS_INODE_NODATASUM; + if (inode->vfs_inode.i_size == 0) + inode_flags |= BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM; } else { - binode_flags |= BTRFS_INODE_NODATACOW; + inode_flags |= BTRFS_INODE_NODATACOW; } } else { /* * Revert back under same assumptions as above */ - if (S_ISREG(inode->i_mode)) { - if (inode->i_size == 0) - binode_flags &= ~(BTRFS_INODE_NODATACOW | - BTRFS_INODE_NODATASUM); + if (S_ISREG(inode->vfs_inode.i_mode)) { + if (inode->vfs_inode.i_size == 0) + inode_flags &= ~(BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM); } else { - binode_flags &= ~BTRFS_INODE_NODATACOW; + inode_flags &= ~BTRFS_INODE_NODATACOW; } } @@ -350,21 +348,21 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, * things smaller. 
*/ if (fsflags & FS_NOCOMP_FL) { - binode_flags &= ~BTRFS_INODE_COMPRESS; - binode_flags |= BTRFS_INODE_NOCOMPRESS; + inode_flags &= ~BTRFS_INODE_COMPRESS; + inode_flags |= BTRFS_INODE_NOCOMPRESS; } else if (fsflags & FS_COMPR_FL) { - if (IS_SWAPFILE(inode)) + if (IS_SWAPFILE(&inode->vfs_inode)) return -ETXTBSY; - binode_flags |= BTRFS_INODE_COMPRESS; - binode_flags &= ~BTRFS_INODE_NOCOMPRESS; + inode_flags |= BTRFS_INODE_COMPRESS; + inode_flags &= ~BTRFS_INODE_NOCOMPRESS; comp = btrfs_compress_type2str(fs_info->compress_type); if (!comp || comp[0] == 0) comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB); } else { - binode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); + inode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } /* @@ -376,15 +374,14 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, return PTR_ERR(trans); if (comp) { - ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression", + ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp, strlen(comp), 0); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } } else { - ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression", - NULL, 0, 0); + ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, 0, 0); if (ret && ret != -ENODATA) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -392,18 +389,19 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, } update_flags: - binode->flags = binode_flags; + inode->flags = inode_flags; + btrfs_update_inode_mapping_flags(inode); btrfs_sync_inode_flags_to_i_flags(inode); - inode_inc_iversion(inode); - inode_set_ctime_current(inode); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + inode_inc_iversion(&inode->vfs_inode); + inode_set_ctime_current(&inode->vfs_inode); + ret = btrfs_update_inode(trans, inode); out_end_trans: btrfs_end_transaction(trans); return ret; } -static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg) +static int btrfs_ioctl_getversion(const struct inode *inode, int __user *arg) { return put_user(inode->i_generation, arg); } @@ -475,7 +473,7 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, * Calculate the number of transaction items to reserve for creating a subvolume * or snapshot, not including the inode, directory entries, or parent directory. 
*/ -static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit) +static unsigned int create_subvol_num_items(const struct btrfs_qgroup_inherit *inherit) { /* * 1 to add root block @@ -617,8 +615,8 @@ static noinline int create_subvol(struct mnt_idmap *idmap, btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID); key.objectid = objectid; - key.offset = 0; key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = 0; ret = btrfs_insert_root(trans, fs_info->tree_root, &key, root_item); if (ret) { @@ -878,7 +876,7 @@ static int btrfs_may_delete(struct mnt_idmap *idmap, /* copy of may_create in fs/namei.c() */ static inline int btrfs_may_create(struct mnt_idmap *idmap, - struct inode *dir, struct dentry *child) + struct inode *dir, const struct dentry *child) { if (d_really_is_positive(child)) return -EEXIST; @@ -911,7 +909,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, if (error == -EINTR) return error; - dentry = lookup_one(idmap, name, parent->dentry, namelen); + dentry = lookup_one(idmap, &QSTR_LEN(name, namelen), parent->dentry); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_unlock; @@ -1033,17 +1031,14 @@ static noinline int btrfs_ioctl_resize(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_fs_info *fs_info = root->fs_info; u64 new_size; u64 old_size; u64 devid = 1; - struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_vol_args *vol_args; - struct btrfs_trans_handle *trans; struct btrfs_device *device = NULL; char *sizestr; - char *retptr; char *devstr = NULL; int ret = 0; int mod = 0; @@ -1111,6 +1106,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (!strcmp(sizestr, "max")) new_size = bdev_nr_bytes(device->bdev); else { + char *retptr; + if (sizestr[0] == '-') { mod = -1; sizestr++; @@ -1158,6 +1155,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size = round_down(new_size, fs_info->sectorsize); if (new_size > old_size) { + struct btrfs_trans_handle *trans; + trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -1336,15 +1335,15 @@ free_args: return ret; } -static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode, +static noinline int btrfs_ioctl_subvol_getflags(struct btrfs_inode *inode, void __user *arg) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; u64 flags = 0; - if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) return -EINVAL; down_read(&fs_info->subvol_sem); @@ -1447,8 +1446,8 @@ out: return ret; } -static noinline int key_in_sk(struct btrfs_key *key, - struct btrfs_ioctl_search_key *sk) +static noinline bool key_in_sk(const struct btrfs_key *key, + const struct btrfs_ioctl_search_key *sk) { struct btrfs_key test; int ret; @@ -1459,7 +1458,7 @@ static noinline int key_in_sk(struct btrfs_key *key, ret = btrfs_comp_cpu_keys(key, &test); if (ret < 0) - return 0; + return false; test.objectid = sk->max_objectid; test.type = sk->max_type; @@ -1467,13 +1466,13 @@ static noinline int key_in_sk(struct btrfs_key *key, ret = btrfs_comp_cpu_keys(key, &test); if (ret > 0) - return 0; - return 1; + return false; + return true; } static 
noinline int copy_to_sk(struct btrfs_path *path, struct btrfs_key *key, - struct btrfs_ioctl_search_key *sk, + const struct btrfs_ioctl_search_key *sk, u64 *buf_size, char __user *ubuf, unsigned long *sk_offset, @@ -1530,8 +1529,8 @@ static noinline int copy_to_sk(struct btrfs_path *path, } sh.objectid = key->objectid; - sh.offset = key->offset; sh.type = key->type; + sh.offset = key->offset; sh.len = item_len; sh.transid = found_transid; @@ -1604,13 +1603,12 @@ out: return ret; } -static noinline int search_ioctl(struct inode *inode, +static noinline int search_ioctl(struct btrfs_root *root, struct btrfs_ioctl_search_key *sk, u64 *buf_size, char __user *ubuf) { - struct btrfs_fs_info *info = inode_to_fs_info(inode); - struct btrfs_root *root; + struct btrfs_fs_info *info = root->fs_info; struct btrfs_key key; struct btrfs_path *path; int ret; @@ -1627,9 +1625,10 @@ static noinline int search_ioctl(struct inode *inode, return -ENOMEM; if (sk->tree_id == 0) { - /* search the root of the inode that was passed */ - root = btrfs_grab_root(BTRFS_I(inode)->root); + /* Search the root that we got passed. */ + root = btrfs_grab_root(root); } else { + /* Look up the root from the arguments. */ root = btrfs_get_fs_root(info, sk->tree_id, true); if (IS_ERR(root)) { btrfs_free_path(path); @@ -1642,21 +1641,19 @@ static noinline int search_ioctl(struct inode *inode, key.offset = sk->min_offset; while (1) { - ret = -EFAULT; /* * Ensure that the whole user buffer is faulted in at sub-page * granularity, otherwise the loop may live-lock. */ - if (fault_in_subpage_writeable(ubuf + sk_offset, - *buf_size - sk_offset)) + if (fault_in_subpage_writeable(ubuf + sk_offset, *buf_size - sk_offset)) { + ret = -EFAULT; break; + } ret = btrfs_search_forward(root, &key, path, sk->min_transid); - if (ret != 0) { - if (ret > 0) - ret = 0; - goto err; - } + if (ret) + break; + ret = copy_to_sk(path, &key, sk, buf_size, ubuf, &sk_offset, &num_found); btrfs_release_path(path); @@ -1664,16 +1661,17 @@ static noinline int search_ioctl(struct inode *inode, break; } + /* Normalize return values from btrfs_search_forward() and copy_to_sk(). 
*/ if (ret > 0) ret = 0; -err: + sk->nr_items = num_found; btrfs_put_root(root); btrfs_free_path(path); return ret; } -static noinline int btrfs_ioctl_tree_search(struct inode *inode, +static noinline int btrfs_ioctl_tree_search(struct btrfs_root *root, void __user *argp) { struct btrfs_ioctl_search_args __user *uargs = argp; @@ -1689,7 +1687,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode, buf_size = sizeof(uargs->buf); - ret = search_ioctl(inode, &sk, &buf_size, uargs->buf); + ret = search_ioctl(root, &sk, &buf_size, uargs->buf); /* * In the origin implementation an overflow is handled by returning a @@ -1703,7 +1701,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode, return ret; } -static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, +static noinline int btrfs_ioctl_tree_search_v2(struct btrfs_root *root, void __user *argp) { struct btrfs_ioctl_search_args_v2 __user *uarg = argp; @@ -1725,7 +1723,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, if (buf_size > buf_limit) buf_size = buf_limit; - ret = search_ioctl(inode, &args.key, &buf_size, + ret = search_ioctl(root, &args.key, &buf_size, (char __user *)(&uarg->buf[0])); if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) ret = -EFAULT; @@ -1833,7 +1831,6 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, struct btrfs_path *path; struct btrfs_key key, key2; struct extent_buffer *leaf; - struct inode *temp_inode; char *ptr; int slot; int len; @@ -1861,6 +1858,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; while (1) { + struct btrfs_inode *temp_inode; + ret = btrfs_search_backwards(root, &key, path); if (ret < 0) goto out_put; @@ -1915,9 +1914,9 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, ret = PTR_ERR(temp_inode); goto out_put; } - ret = inode_permission(idmap, temp_inode, + ret = inode_permission(idmap, &temp_inode->vfs_inode, MAY_READ | MAY_EXEC); - iput(temp_inode); + iput(&temp_inode->vfs_inode); if (ret) { ret = -EACCES; goto out_put; @@ -2289,7 +2288,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL; struct mnt_idmap *idmap = file_mnt_idmap(file); char *subvol_name, *subvol_name_ptr = NULL; - int subvol_namelen; int ret = 0; bool destroy_parent = false; @@ -2412,10 +2410,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out; } - subvol_namelen = strlen(subvol_name); - if (strchr(subvol_name, '/') || - strncmp(subvol_name, "..", subvol_namelen) == 0) { + strcmp(subvol_name, "..") == 0) { ret = -EINVAL; goto free_subvol_name; } @@ -2428,7 +2424,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); if (ret == -EINTR) goto free_subvol_name; - dentry = lookup_one(idmap, subvol_name, parent, subvol_namelen); + dentry = lookup_one(idmap, &QSTR(subvol_name), parent); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto out_unlock_dir; @@ -2571,7 +2567,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) /* the rest are all set to zero by kzalloc */ range.len = (u64)-1; } - ret = btrfs_defrag_file(file_inode(file), &file->f_ra, + ret = btrfs_defrag_file(BTRFS_I(file_inode(file)), &file->f_ra, &range, BTRFS_OLDEST_GENERATION, 0); if (ret > 0) ret = 0; @@ -2763,7 +2759,7 @@ out_free: return ret; } -static long btrfs_ioctl_fs_info(struct 
btrfs_fs_info *fs_info, +static long btrfs_ioctl_fs_info(const struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_fs_info_args *fi_args; @@ -2817,7 +2813,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, return ret; } -static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, +static long btrfs_ioctl_dev_info(const struct btrfs_fs_info *fs_info, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); @@ -4248,7 +4244,7 @@ static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info, return 0; } -static int check_feature_bits(struct btrfs_fs_info *fs_info, +static int check_feature_bits(const struct btrfs_fs_info *fs_info, enum btrfs_feature_set set, u64 change_mask, u64 flags, u64 supported_flags, u64 safe_set, u64 safe_clear) @@ -4384,7 +4380,7 @@ out_drop_write: return ret; } -static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool compat) +static int _btrfs_ioctl_send(struct btrfs_root *root, void __user *argp, bool compat) { struct btrfs_ioctl_send_args *arg; int ret; @@ -4415,7 +4411,7 @@ static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool if (IS_ERR(arg)) return PTR_ERR(arg); } - ret = btrfs_ioctl_send(inode, arg); + ret = btrfs_ioctl_send(root, arg); kfree(arg); return ret; } @@ -4511,7 +4507,7 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, args.compression, &unlocked); if (!unlocked) { - unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); } } @@ -4700,7 +4696,7 @@ static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int iss ret = priv->count; out: - unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); + btrfs_unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); io_uring_cmd_done(cmd, ret, 0, issue_flags); @@ -4789,7 +4785,7 @@ static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter, return -EIOCBQUEUED; out_fail: - unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); kfree(priv); return ret; @@ -4903,6 +4899,8 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue ret = btrfs_encoded_read(&kiocb, &data->iter, &data->args, &cached_state, &disk_bytenr, &disk_io_size); + if (ret == -EAGAIN) + goto out_acct; if (ret < 0 && ret != -EIOCBQUEUED) goto out_free; @@ -4912,7 +4910,7 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue (const char *)&data->args + copy_end_kernel, sizeof(data->args) - copy_end_kernel)) { if (ret == -EIOCBQUEUED) { - unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); } ret = -EFAULT; @@ -5242,7 +5240,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SNAP_DESTROY_V2: return btrfs_ioctl_snap_destroy(file, argp, true); case BTRFS_IOC_SUBVOL_GETFLAGS: - return btrfs_ioctl_subvol_getflags(inode, argp); + return btrfs_ioctl_subvol_getflags(BTRFS_I(inode), argp); case BTRFS_IOC_SUBVOL_SETFLAGS: return btrfs_ioctl_subvol_setflags(file, argp); case BTRFS_IOC_DEFAULT_SUBVOL: @@ -5264,9 +5262,9 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(fs_info, argp); 
case BTRFS_IOC_TREE_SEARCH: - return btrfs_ioctl_tree_search(inode, argp); + return btrfs_ioctl_tree_search(root, argp); case BTRFS_IOC_TREE_SEARCH_V2: - return btrfs_ioctl_tree_search_v2(inode, argp); + return btrfs_ioctl_tree_search_v2(root, argp); case BTRFS_IOC_INO_LOOKUP: return btrfs_ioctl_ino_lookup(root, argp); case BTRFS_IOC_INO_PATHS: @@ -5314,10 +5312,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_set_received_subvol_32(file, argp); #endif case BTRFS_IOC_SEND: - return _btrfs_ioctl_send(BTRFS_I(inode), argp, false); + return _btrfs_ioctl_send(root, argp, false); #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) case BTRFS_IOC_SEND_32: - return _btrfs_ioctl_send(BTRFS_I(inode), argp, true); + return _btrfs_ioctl_send(root, argp, true); #endif case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(fs_info, argp); diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index ce915fcda43b..e08ea446cf48 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -9,6 +9,8 @@ struct file; struct dentry; struct mnt_idmap; struct fileattr; +struct io_uring_cmd; +struct btrfs_inode; struct btrfs_fs_info; struct btrfs_ioctl_balance_args; @@ -18,7 +20,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int btrfs_ioctl_get_supported_features(void __user *arg); -void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); +void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 9a7a7b723305..a3e6d9616e60 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -9,7 +9,6 @@ #include <linux/page-flags.h> #include <asm/bug.h> #include <trace/events/btrfs.h> -#include "misc.h" #include "ctree.h" #include "extent_io.h" #include "locking.h" @@ -150,15 +149,15 @@ void btrfs_tree_read_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesti /* * Try-lock for read. 
* - * Return 1 if the rwlock has been taken, 0 otherwise + * Return true if the rwlock has been taken, false otherwise */ -int btrfs_try_tree_read_lock(struct extent_buffer *eb) +bool btrfs_try_tree_read_lock(struct extent_buffer *eb) { if (down_read_trylock(&eb->lock)) { trace_btrfs_try_tree_read_lock(eb); - return 1; + return true; } - return 0; + return false; } /* diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index c69e57ff804b..af29df98ac14 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -189,7 +189,7 @@ static inline void btrfs_tree_read_lock(struct extent_buffer *eb) } void btrfs_tree_read_unlock(struct extent_buffer *eb); -int btrfs_try_tree_read_lock(struct extent_buffer *eb); +bool btrfs_try_tree_read_lock(struct extent_buffer *eb); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index a45bc11f8665..d403641889ca 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -252,9 +252,8 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, /* Compress at most one sector of data each time */ in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off); ASSERT(in_len); - data_in = kmap_local_folio(folio_in, 0); - ret = lzo1x_1_compress(data_in + - offset_in_page(cur_in), in_len, + data_in = kmap_local_folio(folio_in, offset_in_folio(folio_in, cur_in)); + ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, &out_len, workspace->mem); kunmap_local(data_in); diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 08a9272399d2..6abf81bb00c2 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -4,6 +4,7 @@ #define BTRFS_MESSAGES_H #include <linux/types.h> +#include <linux/types.h> #include <linux/printk.h> #include <linux/bug.h> @@ -170,15 +171,83 @@ do { \ #ifdef CONFIG_BTRFS_ASSERT -#define btrfs_assertfail(expr, file, line) ({ \ - pr_err("assertion failed: %s, in %s:%d\n", (expr), (file), (line)); \ - BUG(); \ -}) +__printf(1, 2) +static inline void verify_assert_printk_format(const char *fmt, ...) { + /* Stub to verify the assertion format string. */ +} + +/* Take the first token if any. */ +#define __FIRST_ARG(_, ...) _ +/* + * Skip the first token and return the rest, if it's empty the comma is dropped. + * As ##__VA_ARGS__ cannot be at the beginning of the macro the __VA_OPT__ is needed + * and supported since GCC 8 and Clang 12. + */ +#define __REST_ARGS(_, ... ) __VA_OPT__(,) __VA_ARGS__ + +#if defined(CONFIG_CC_IS_CLANG) || GCC_VERSION >= 80000 +/* + * Assertion with optional printk() format. 
+ * + * Accepted syntax: + * ASSERT(condition); + * ASSERT(condition, "string"); + * ASSERT(condition, "variable=%d", variable); + * + * How it works: + * - if there's no format string, ""[0] evaluates at compile time to 0 and the + * true branch is executed + * - any non-empty format string with the "" prefix evaluates to != 0 at + * compile time and the false branch is executed + * - stringified condition is printed as %s so we don't accidentally mix format + * strings (the % operator) + * - there can be only one printk() call, so the format strings and arguments are + * spliced together: + * DEFAULT_FMT [USER_FMT], DEFAULT_ARGS [, USER_ARGS] + * - comma between DEFAULT_ARGS and USER_ARGS is handled by preprocessor + * (requires __VA_OPT__ support) + * - otherwise we could use __VA_OPT(,) __VA_ARGS__ for the 2nd+ argument of args, + */ +#define ASSERT(cond, args...) \ +do { \ + verify_assert_printk_format("check the format string" args); \ + if (!likely(cond)) { \ + if (("" __FIRST_ARG(args) [0]) == 0) { \ + pr_err("assertion failed: %s :: %ld, in %s:%d\n", \ + #cond, (long)(cond), __FILE__, __LINE__); \ + } else { \ + pr_err("assertion failed: %s :: %ld, in %s:%d (" __FIRST_ARG(args) ")\n", \ + #cond, (long)(cond), __FILE__, __LINE__ __REST_ARGS(args)); \ + } \ + BUG(); \ + } \ +} while(0) + +#else + +/* For GCC < 8.x only the simple output. */ + +#define ASSERT(cond, args...) \ +do { \ + verify_assert_printk_format("check the format string" args); \ + if (!likely(cond)) { \ + pr_err("assertion failed: %s :: %ld, in %s:%d\n", \ + #cond, (long)(cond), __FILE__, __LINE__); \ + BUG(); \ + } \ +} while(0) + +#endif + +#else +#define ASSERT(cond, args...) (void)(cond) +#endif -#define ASSERT(expr) \ - (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__)) +#ifdef CONFIG_BTRFS_DEBUG +/* Verbose warning only under debug build. */ +#define DEBUG_WARN(args...) WARN(1, KERN_ERR args) #else -#define ASSERT(expr) (void)(expr) +#define DEBUG_WARN(...) do {} while(0) #endif __printf(5, 6) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 4aca7475fd82..9212ce110cde 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -153,25 +153,30 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( struct btrfs_ordered_extent *entry; int ret; u64 qgroup_rsv = 0; + const bool is_nocow = (flags & + ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC))); - if (flags & - ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) { - /* For nocow write, we can release the qgroup rsv right now */ + /* + * For a NOCOW write we can free the qgroup reserve right now. For a COW + * one we transfer the reserved space from the inode's iotree into the + * ordered extent by calling btrfs_qgroup_release_data() and tracking + * the qgroup reserved amount in the ordered extent, so that later after + * completing the ordered extent, when running the data delayed ref it + * creates, we free the reserved data with btrfs_qgroup_free_refroot(). + */ + if (is_nocow) ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv); - if (ret < 0) - return ERR_PTR(ret); - } else { - /* - * The ordered extent has reserved qgroup space, release now - * and pass the reserved number for qgroup_record to free. 
- */ + else ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv); - if (ret < 0) - return ERR_PTR(ret); - } + + if (ret < 0) + return ERR_PTR(ret); + entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); - if (!entry) - return ERR_PTR(-ENOMEM); + if (!entry) { + entry = ERR_PTR(-ENOMEM); + goto out; + } entry->file_offset = file_offset; entry->num_bytes = num_bytes; @@ -180,7 +185,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( entry->disk_num_bytes = disk_num_bytes; entry->offset = offset; entry->bytes_left = num_bytes; - entry->inode = BTRFS_I(igrab(&inode->vfs_inode)); + if (WARN_ON_ONCE(!igrab(&inode->vfs_inode))) { + kmem_cache_free(btrfs_ordered_extent_cache, entry); + entry = ERR_PTR(-ESTALE); + goto out; + } + entry->inode = inode; entry->compress_type = compress_type; entry->truncated_len = (u64)-1; entry->qgroup_rsv = qgroup_rsv; @@ -203,6 +213,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( btrfs_mod_outstanding_extents(inode, 1); spin_unlock(&inode->lock); +out: + if (IS_ERR(entry) && !is_nocow) + btrfs_qgroup_free_refroot(inode->root->fs_info, + btrfs_root_id(inode->root), + qgroup_rsv, BTRFS_QGROUP_RSV_DATA); + return entry; } @@ -253,7 +269,7 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry) * @disk_bytenr: Offset of extent on disk. * @disk_num_bytes: Size of extent on disk. * @offset: Offset into unencoded data where file data starts. - * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*). + * @flags: Flags specifying type of extent (1U << BTRFS_ORDERED_*). * @compress_type: Compression algorithm used for data. * * Most of these parameters correspond to &struct btrfs_file_extent_item. The @@ -607,23 +623,18 @@ out: */ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) { - struct list_head *cur; - struct btrfs_ordered_sum *sum; - trace_btrfs_ordered_extent_put(entry->inode, entry); if (refcount_dec_and_test(&entry->refs)) { + struct btrfs_ordered_sum *sum; + struct btrfs_ordered_sum *tmp; + ASSERT(list_empty(&entry->root_extent_list)); ASSERT(list_empty(&entry->log_list)); ASSERT(RB_EMPTY_NODE(&entry->rb_node)); - if (entry->inode) - btrfs_add_delayed_iput(entry->inode); - while (!list_empty(&entry->list)) { - cur = entry->list.next; - sum = list_entry(cur, struct btrfs_ordered_sum, list); - list_del(&sum->list); + btrfs_add_delayed_iput(entry->inode); + list_for_each_entry_safe(sum, tmp, &entry->list, list) kvfree(sum); - } kmem_cache_free(btrfs_ordered_extent_cache, entry); } } @@ -842,10 +853,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, /* * Start IO and wait for a given ordered extent to finish. * - * Wait on page writeback for all the pages in the extent and the IO completion - * code to insert metadata into the btree corresponding to the extent. + * Wait on page writeback for all the pages in the extent but not in + * [@nowriteback_start, @nowriteback_start + @nowriteback_len) and the + * IO completion code to insert metadata into the btree corresponding to the extent. 
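The btrfs_start_ordered_extent_nowriteback() hunk that follows starts writeback for the ordered extent's range while skipping the caller's excluded window. A standalone sketch of just that range split, where a printf() stand-in (fdatawrite()) replaces filemap_fdatawrite_range(); all names here are illustrative.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;
typedef uint32_t u32;

/* Stand-in for filemap_fdatawrite_range(): record the range instead of writing. */
static void fdatawrite(u64 start, u64 end)
{
	printf("writeback [%llu, %llu]\n",
	       (unsigned long long)start, (unsigned long long)end);
}

/* Mirror of the split: skip [nw_start, nw_start + nw_len) if a window is given. */
static void start_writeback(u64 start, u64 end, u64 nw_start, u32 nw_len)
{
	if (!nw_len) {
		fdatawrite(start, end);
		return;
	}
	if (start < nw_start)
		fdatawrite(start, nw_start - 1);
	if (nw_start + nw_len < end)
		fdatawrite(nw_start + nw_len, end);
}

int main(void)
{
	/* Ordered extent [0, 262143], excluding the folio at [65536, 131071]. */
	start_writeback(0, 262143, 65536, 65536);
	return 0;
}

With those numbers it issues writeback for [0, 65535] and [131072, 262143], matching the two filemap_fdatawrite_range() calls in the patch; a zero nowriteback_len keeps the old whole-range behaviour, which is what the new btrfs_start_ordered_extent() inline wrapper relies on.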
*/ -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) +void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry, + u64 nowriteback_start, u32 nowriteback_len) { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; @@ -865,8 +878,19 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) * start IO on any dirty ones so the wait doesn't stall waiting * for the flusher thread to find them */ - if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) - filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); + if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) { + if (!nowriteback_len) { + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); + } else { + if (start < nowriteback_start) + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, + nowriteback_start - 1); + if (nowriteback_start + nowriteback_len < end) + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, + nowriteback_start + nowriteback_len, + end); + } + } if (!freespace_inode) btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); @@ -1160,7 +1184,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, cachedp = cached_state; while (1) { - lock_extent(&inode->io_tree, start, end, cachedp); + btrfs_lock_extent(&inode->io_tree, start, end, cachedp); ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); if (!ordered) { @@ -1173,7 +1197,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, refcount_dec(&cache->refs); break; } - unlock_extent(&inode->io_tree, start, end, cachedp); + btrfs_unlock_extent(&inode->io_tree, start, end, cachedp); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } @@ -1191,7 +1215,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, { struct btrfs_ordered_extent *ordered; - if (!try_lock_extent(&inode->io_tree, start, end, cached_state)) + if (!btrfs_try_lock_extent(&inode->io_tree, start, end, cached_state)) return false; ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); @@ -1199,7 +1223,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, return true; btrfs_put_ordered_extent(ordered); - unlock_extent(&inode->io_tree, start, end, cached_state); + btrfs_unlock_extent(&inode->io_tree, start, end, cached_state); return false; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 4e152736d06c..1e6b0b182b29 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -17,6 +17,7 @@ struct inode; struct page; struct extent_state; +struct btrfs_block_group; struct btrfs_inode; struct btrfs_root; struct btrfs_fs_info; @@ -191,7 +192,13 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset); -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry); +void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry, + u64 nowriteback_start, u32 nowriteback_len); +static inline void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) +{ + return btrfs_start_ordered_extent_nowriteback(entry, 0, 0); +} + int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); diff --git a/fs/btrfs/print-tree.h 
b/fs/btrfs/print-tree.h index 8504bf1702c7..d0e620bf5f5a 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -6,6 +6,8 @@ #ifndef BTRFS_PRINT_TREE_H #define BTRFS_PRINT_TREE_H +#include <linux/types.h> + /* Buffer size to contain tree name and possibly additional data (offset) */ #define BTRFS_ROOT_NAME_BUF_LEN 48 diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index b8fa34e16abb..adc956432d2f 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -26,8 +26,8 @@ struct prop_handler { const char *xattr_name; int (*validate)(const struct btrfs_inode *inode, const char *value, size_t len); - int (*apply)(struct inode *inode, const char *value, size_t len); - const char *(*extract)(const struct inode *inode); + int (*apply)(struct btrfs_inode *inode, const char *value, size_t len); + const char *(*extract)(const struct btrfs_inode *inode); bool (*ignore)(const struct btrfs_inode *inode); int inheritable; }; @@ -121,7 +121,7 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, if (ret) return ret; - ret = handler->apply(&inode->vfs_inode, NULL, 0); + ret = handler->apply(inode, NULL, 0); ASSERT(ret == 0); return ret; @@ -131,7 +131,7 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, value_len, flags); if (ret) return ret; - ret = handler->apply(&inode->vfs_inode, value, value_len); + ret = handler->apply(inode, value, value_len); if (ret) { btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, NULL, 0, flags); @@ -263,7 +263,7 @@ static void inode_prop_iterator(void *ctx, struct btrfs_root *root = BTRFS_I(inode)->root; int ret; - ret = handler->apply(inode, value, len); + ret = handler->apply(BTRFS_I(inode), value, len); if (unlikely(ret)) btrfs_warn(root->fs_info, "error applying prop %s to ino %llu (root %llu): %d", @@ -273,12 +273,13 @@ static void inode_prop_iterator(void *ctx, set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); } -int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) +int btrfs_load_inode_props(struct btrfs_inode *inode, struct btrfs_path *path) { - struct btrfs_root *root = BTRFS_I(inode)->root; - u64 ino = btrfs_ino(BTRFS_I(inode)); + struct btrfs_root *root = inode->root; + u64 ino = btrfs_ino(inode); - return iterate_object_props(root, path, ino, inode_prop_iterator, inode); + return iterate_object_props(root, path, ino, inode_prop_iterator, + &inode->vfs_inode); } static int prop_compression_validate(const struct btrfs_inode *inode, @@ -300,26 +301,26 @@ static int prop_compression_validate(const struct btrfs_inode *inode, return -EINVAL; } -static int prop_compression_apply(struct inode *inode, const char *value, +static int prop_compression_apply(struct btrfs_inode *inode, const char *value, size_t len) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; int type; /* Reset to defaults */ if (len == 0) { - BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; + inode->flags &= ~BTRFS_INODE_COMPRESS; + inode->flags &= ~BTRFS_INODE_NOCOMPRESS; + inode->prop_compress = BTRFS_COMPRESS_NONE; return 0; } /* Set NOCOMPRESS flag */ if ((len == 2 && strncmp("no", value, 2) == 0) || (len == 4 && strncmp("none", value, 4) == 0)) { - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; + inode->flags 
|= BTRFS_INODE_NOCOMPRESS; + inode->flags &= ~BTRFS_INODE_COMPRESS; + inode->prop_compress = BTRFS_COMPRESS_NONE; return 0; } @@ -336,9 +337,9 @@ static int prop_compression_apply(struct inode *inode, const char *value, return -EINVAL; } - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->prop_compress = type; + inode->flags &= ~BTRFS_INODE_NOCOMPRESS; + inode->flags |= BTRFS_INODE_COMPRESS; + inode->prop_compress = type; return 0; } @@ -359,13 +360,13 @@ static bool prop_compression_ignore(const struct btrfs_inode *inode) return false; } -static const char *prop_compression_extract(const struct inode *inode) +static const char *prop_compression_extract(const struct btrfs_inode *inode) { - switch (BTRFS_I(inode)->prop_compress) { + switch (inode->prop_compress) { case BTRFS_COMPRESS_ZLIB: case BTRFS_COMPRESS_LZO: case BTRFS_COMPRESS_ZSTD: - return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress); + return btrfs_compress_type2str(inode->prop_compress); default: break; } @@ -385,16 +386,16 @@ static struct prop_handler prop_handlers[] = { }; int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, - struct inode *inode, const struct inode *parent) + struct btrfs_inode *inode, + const struct btrfs_inode *parent) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; int ret; int i; bool need_reserve = false; - if (!test_bit(BTRFS_INODE_HAS_PROPS, - &BTRFS_I(parent)->runtime_flags)) + if (!test_bit(BTRFS_INODE_HAS_PROPS, &parent->runtime_flags)) return 0; for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { @@ -405,7 +406,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, if (!h->inheritable) continue; - if (h->ignore(BTRFS_I(inode))) + if (h->ignore(inode)) continue; value = h->extract(parent); @@ -416,7 +417,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, * This is not strictly necessary as the property should be * valid, but in case it isn't, don't propagate it further. 
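The prop_compression_apply() conversion above keeps the existing semantics of the btrfs.compression property while operating on struct btrfs_inode directly: an empty value resets the flags, "no" or "none" forces no compression, and a recognized algorithm name forces compression of that type. A rough userspace model of that mapping, assuming the zlib/lzo/zstd names visible in prop_compression_extract(); the struct and constant names here are illustrative, not the kernel's, and the real code works on a length-delimited value rather than a C string.

#include <stdio.h>
#include <string.h>

enum compress_type { COMPRESS_NONE, COMPRESS_ZLIB, COMPRESS_LZO, COMPRESS_ZSTD };

struct inode_model {
	int force_compress;	/* models BTRFS_INODE_COMPRESS   */
	int no_compress;	/* models BTRFS_INODE_NOCOMPRESS */
	enum compress_type prop_compress;
};

static int apply_compression_prop(struct inode_model *im, const char *value)
{
	if (value == NULL || value[0] == '\0') {		/* reset to defaults */
		im->force_compress = 0;
		im->no_compress = 0;
		im->prop_compress = COMPRESS_NONE;
		return 0;
	}
	if (!strcmp(value, "no") || !strcmp(value, "none")) {	/* force off */
		im->no_compress = 1;
		im->force_compress = 0;
		im->prop_compress = COMPRESS_NONE;
		return 0;
	}
	if (!strcmp(value, "zlib"))
		im->prop_compress = COMPRESS_ZLIB;
	else if (!strcmp(value, "lzo"))
		im->prop_compress = COMPRESS_LZO;
	else if (!strcmp(value, "zstd"))
		im->prop_compress = COMPRESS_ZSTD;
	else
		return -1;					/* -EINVAL in the kernel */
	im->no_compress = 0;
	im->force_compress = 1;
	return 0;
}

int main(void)
{
	struct inode_model im = { 0 };

	apply_compression_prop(&im, "zstd");
	printf("force=%d no=%d type=%d\n", im.force_compress, im.no_compress, im.prop_compress);
	return 0;
}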
*/ - ret = h->validate(BTRFS_I(inode), value, strlen(value)); + ret = h->validate(inode, value, strlen(value)); if (ret) continue; @@ -436,16 +437,15 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, return ret; } - ret = btrfs_setxattr(trans, inode, h->xattr_name, value, + ret = btrfs_setxattr(trans, &inode->vfs_inode, h->xattr_name, value, strlen(value), 0); if (!ret) { ret = h->apply(inode, value, strlen(value)); if (ret) - btrfs_setxattr(trans, inode, h->xattr_name, + btrfs_setxattr(trans, &inode->vfs_inode, h->xattr_name, NULL, 0, 0); else - set_bit(BTRFS_INODE_HAS_PROPS, - &BTRFS_I(inode)->runtime_flags); + set_bit(BTRFS_INODE_HAS_PROPS, &inode->runtime_flags); } if (need_reserve) { diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 63546d0a9444..15d9a025c923 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -6,9 +6,9 @@ #ifndef BTRFS_PROPS_H #define BTRFS_PROPS_H +#include <linux/types.h> #include <linux/compiler_types.h> -struct inode; struct btrfs_inode; struct btrfs_path; struct btrfs_trans_handle; @@ -22,10 +22,10 @@ int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name, const char *value, size_t value_len); bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name); -int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path); +int btrfs_load_inode_props(struct btrfs_inode *inode, struct btrfs_path *path); int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, - struct inode *inode, - const struct inode *dir); + struct btrfs_inode *inode, + const struct btrfs_inode *dir); #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index f9d3766c809b..b3176edbde82 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -83,7 +83,7 @@ static void qgroup_rsv_add(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup, u64 num_bytes, enum btrfs_qgroup_rsv_type type) { - trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type); + trace_btrfs_qgroup_update_reserve(fs_info, qgroup, num_bytes, type); qgroup->rsv.values[type] += num_bytes; } @@ -91,7 +91,7 @@ static void qgroup_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup, u64 num_bytes, enum btrfs_qgroup_rsv_type type) { - trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type); + trace_btrfs_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type); if (qgroup->rsv.values[type] >= num_bytes) { qgroup->rsv.values[type] -= num_bytes; return; @@ -956,8 +956,8 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, return -ENOMEM; key.objectid = 0; - key.offset = 0; key.type = 0; + key.offset = 0; while (1) { ret = btrfs_search_slot(trans, root, &key, path, -1, 1); @@ -1823,7 +1823,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] || qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] || qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) { - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); btrfs_warn_rl(fs_info, "to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu", btrfs_qgroup_level(qgroup->qgroupid), @@ -1843,7 +1843,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) { if (qgroup->rfer || qgroup->excl || qgroup->rfer_cmpr || qgroup->excl_cmpr) { - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); btrfs_warn_rl(fs_info, "to be deleted qgroup %u/%llu has non-zero 
numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu", btrfs_qgroup_level(qgroup->qgroupid), @@ -2837,8 +2837,8 @@ static void qgroup_update_counters(struct btrfs_fs_info *fs_info, cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); - trace_qgroup_update_counters(fs_info, qg, cur_old_count, - cur_new_count); + trace_btrfs_qgroup_update_counters(fs_info, qg, cur_old_count, + cur_new_count); /* Rfer update part */ if (cur_old_count == 0 && cur_new_count > 0) { @@ -3100,8 +3100,7 @@ cleanup: kfree(record); } - trace_qgroup_num_dirty_extents(fs_info, trans->transid, - num_dirty_extents); + trace_btrfs_qgroup_num_dirty_extents(fs_info, trans->transid, num_dirty_extents); return ret; } @@ -4129,8 +4128,8 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode, * Now the entry is in [start, start + len), revert the * EXTENT_QGROUP_RESERVED bit. */ - clear_ret = clear_extent_bits(&inode->io_tree, entry_start, - entry_end, EXTENT_QGROUP_RESERVED); + clear_ret = btrfs_clear_extent_bits(&inode->io_tree, entry_start, + entry_end, EXTENT_QGROUP_RESERVED); if (!ret && clear_ret < 0) ret = clear_ret; @@ -4232,8 +4231,9 @@ static int qgroup_reserve_data(struct btrfs_inode *inode, reserved = *reserved_ret; /* Record already reserved space */ orig_reserved = reserved->bytes_changed; - ret = set_record_extent_bits(&inode->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, reserved); + ret = btrfs_set_record_extent_bits(&inode->io_tree, start, + start + len - 1, EXTENT_QGROUP_RESERVED, + reserved); /* Newly reserved space */ to_reserve = reserved->bytes_changed - orig_reserved; @@ -4326,9 +4326,10 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode, * EXTENT_QGROUP_RESERVED, we won't double free. * So not need to rush. 
*/ - ret = clear_record_extent_bits(&inode->io_tree, free_start, - free_start + free_len - 1, - EXTENT_QGROUP_RESERVED, &changeset); + ret = btrfs_clear_record_extent_bits(&inode->io_tree, free_start, + free_start + free_len - 1, + EXTENT_QGROUP_RESERVED, + &changeset); if (ret < 0) goto out; freed += changeset.bytes_changed; @@ -4352,9 +4353,9 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, int ret; if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) { - return clear_record_extent_bits(&inode->io_tree, start, - start + len - 1, - EXTENT_QGROUP_RESERVED, NULL); + return btrfs_clear_record_extent_bits(&inode->io_tree, start, + start + len - 1, + EXTENT_QGROUP_RESERVED, NULL); } /* In release case, we shouldn't have @reserved */ @@ -4362,8 +4363,8 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, if (free && reserved) return qgroup_free_reserved_data(inode, reserved, start, len, released); extent_changeset_init(&changeset); - ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1, - EXTENT_QGROUP_RESERVED, &changeset); + ret = btrfs_clear_record_extent_bits(&inode->io_tree, start, start + len - 1, + EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) goto out; @@ -4472,7 +4473,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - trace_qgroup_meta_reserve(root, (s64)num_bytes, type); + trace_btrfs_qgroup_meta_reserve(root, (s64)num_bytes, type); ret = qgroup_reserve(root, num_bytes, enforce, type); if (ret < 0) return ret; @@ -4517,7 +4518,7 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) return; /* TODO: Update trace point to handle such free */ - trace_qgroup_meta_free_all_pertrans(root); + trace_btrfs_qgroup_meta_free_all_pertrans(root); /* Special value -1 means to free all reserved space */ btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), (u64)-1, BTRFS_QGROUP_RSV_META_PERTRANS); @@ -4539,7 +4540,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, */ num_bytes = sub_root_meta_rsv(root, num_bytes, type); BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - trace_qgroup_meta_reserve(root, -(s64)num_bytes, type); + trace_btrfs_qgroup_meta_reserve(root, -(s64)num_bytes, type); btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, type); } @@ -4593,7 +4594,7 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) /* Same as btrfs_qgroup_free_meta_prealloc() */ num_bytes = sub_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC); - trace_qgroup_meta_convert(root, num_bytes); + trace_btrfs_qgroup_meta_convert(root, num_bytes); qgroup_convert_meta(fs_info, btrfs_root_id(root), num_bytes); if (!sb_rdonly(fs_info->sb)) add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS); @@ -4611,8 +4612,8 @@ void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode) int ret; extent_changeset_init(&changeset); - ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1, - EXTENT_QGROUP_RESERVED, &changeset); + ret = btrfs_clear_record_extent_bits(&inode->io_tree, 0, (u64)-1, + EXTENT_QGROUP_RESERVED, &changeset); WARN_ON(ret < 0); if (WARN_ON(changeset.bytes_changed)) { @@ -4766,7 +4767,7 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, * Marking qgroup inconsistent should be enough * for end users. 
*/ - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN("duplicated but mismatched entry found"); ret = -EEXIST; } kfree(block); diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index e233cc79af18..a979fd59a4da 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -22,6 +22,9 @@ struct btrfs_ioctl_quota_ctl_args; struct btrfs_trans_handle; struct btrfs_delayed_ref_root; struct btrfs_inode; +struct btrfs_transaction; +struct btrfs_block_group; +struct btrfs_qgroup_swapped_blocks; /* * Btrfs qgroup overview diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h index 541836421778..69942ad43140 100644 --- a/fs/btrfs/raid-stripe-tree.h +++ b/fs/btrfs/raid-stripe-tree.h @@ -9,6 +9,7 @@ #include <linux/types.h> #include <uapi/linux/btrfs_tree.h> #include "fs.h" +#include "accessors.h" #define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \ BTRFS_BLOCK_GROUP_RAID1_MASK | \ diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index cdd373c27784..3ff2bedfb3a4 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -134,14 +134,17 @@ struct btrfs_stripe_hash_table { }; /* - * A bvec like structure to present a sector inside a page. - * - * Unlike bvec we don't need bvlen, as it's fixed to sectorsize. + * A structure to present a sector inside a page, the length is fixed to + * sectorsize; */ struct sector_ptr { - struct page *page; - unsigned int pgoff:24; - unsigned int uptodate:8; + /* + * Blocks from the bio list can still be highmem. + * So here we use physical address to present a page and the offset inside it. + */ + phys_addr_t paddr; + bool has_paddr; + bool uptodate; }; static void rmw_rbio_work(struct work_struct *work); @@ -200,8 +203,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) struct btrfs_stripe_hash_table *x; struct btrfs_stripe_hash *cur; struct btrfs_stripe_hash *h; - int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; - int i; + unsigned int num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS; if (info->stripe_hash_table) return 0; @@ -222,7 +224,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) h = table->table; - for (i = 0; i < num_entries; i++) { + for (unsigned int i = 0; i < num_entries; i++) { cur = h + i; INIT_LIST_HEAD(&cur->hash_list); spin_lock_init(&cur->lock); @@ -233,6 +235,14 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) return 0; } +static void memcpy_sectors(const struct sector_ptr *dst, + const struct sector_ptr *src, u32 blocksize) +{ + memcpy_page(phys_to_page(dst->paddr), offset_in_page(dst->paddr), + phys_to_page(src->paddr), offset_in_page(src->paddr), + blocksize); +} + /* * caching an rbio means to copy anything from the * bio_sectors array into the stripe_pages array. 
We @@ -253,7 +263,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) for (i = 0; i < rbio->nr_sectors; i++) { /* Some range not covered by bio (partial write), skip it */ - if (!rbio->bio_sectors[i].page) { + if (!rbio->bio_sectors[i].has_paddr) { /* * Even if the sector is not covered by bio, if it is * a data sector it should still be uptodate as it is @@ -264,12 +274,8 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) continue; } - ASSERT(rbio->stripe_sectors[i].page); - memcpy_page(rbio->stripe_sectors[i].page, - rbio->stripe_sectors[i].pgoff, - rbio->bio_sectors[i].page, - rbio->bio_sectors[i].pgoff, - rbio->bioc->fs_info->sectorsize); + memcpy_sectors(&rbio->stripe_sectors[i], &rbio->bio_sectors[i], + rbio->bioc->fs_info->sectorsize); rbio->stripe_sectors[i].uptodate = 1; } set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); @@ -326,8 +332,13 @@ static void index_stripe_sectors(struct btrfs_raid_bio *rbio) int page_index = offset >> PAGE_SHIFT; ASSERT(page_index < rbio->nr_pages); - rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index]; - rbio->stripe_sectors[i].pgoff = offset_in_page(offset); + if (!rbio->stripe_pages[page_index]) + continue; + + rbio->stripe_sectors[i].has_paddr = true; + rbio->stripe_sectors[i].paddr = + page_to_phys(rbio->stripe_pages[page_index]) + + offset_in_page(offset); } } @@ -507,9 +518,8 @@ static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) spin_lock(&table->cache_lock); while (!list_empty(&table->stripe_cache)) { - rbio = list_entry(table->stripe_cache.next, - struct btrfs_raid_bio, - stripe_cache); + rbio = list_first_entry(&table->stripe_cache, + struct btrfs_raid_bio, stripe_cache); __remove_rbio_from_cache(rbio); } spin_unlock(&table->cache_lock); @@ -567,9 +577,9 @@ static void cache_rbio(struct btrfs_raid_bio *rbio) if (table->cache_size > RBIO_CACHE_SIZE) { struct btrfs_raid_bio *found; - found = list_entry(table->stripe_cache.prev, - struct btrfs_raid_bio, - stripe_cache); + found = list_last_entry(&table->stripe_cache, + struct btrfs_raid_bio, + stripe_cache); if (found != rbio) __remove_rbio_from_cache(found); @@ -882,14 +892,14 @@ done_nolock: remove_rbio_from_cache(rbio); } -static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) +static void rbio_endio_bio_list(struct bio *cur, blk_status_t status) { struct bio *next; while (cur) { next = cur->bi_next; cur->bi_next = NULL; - cur->bi_status = err; + cur->bi_status = status; bio_endio(cur); cur = next; } @@ -899,7 +909,7 @@ static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) * this frees the rbio and runs through all the bios in the * bio_list and calls end_io on them */ -static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) +static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status) { struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *extra; @@ -928,9 +938,9 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) extra = bio_list_get(&rbio->bio_list); free_raid_bio(rbio); - rbio_endio_bio_list(cur, err); + rbio_endio_bio_list(cur, status); if (extra) - rbio_endio_bio_list(extra, err); + rbio_endio_bio_list(extra, status); } /* @@ -962,9 +972,9 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, spin_lock(&rbio->bio_list_lock); sector = &rbio->bio_sectors[index]; - if (sector->page || bio_list_only) { + if (sector->has_paddr || bio_list_only) { /* Don't return sector without a valid page pointer */ - if (!sector->page) + if 
(!sector->has_paddr) sector = NULL; spin_unlock(&rbio->bio_list_lock); return sector; @@ -1142,7 +1152,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, rbio, stripe_nr); ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors, rbio, sector_nr); - ASSERT(sector->page); + ASSERT(sector->has_paddr); stripe = &rbio->bioc->stripes[stripe_nr]; disk_start = stripe->physical + sector_nr * sectorsize; @@ -1173,8 +1183,8 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, */ if (last_end == disk_start && !last->bi_status && last->bi_bdev == stripe->dev->bdev) { - ret = bio_add_page(last, sector->page, sectorsize, - sector->pgoff); + ret = bio_add_page(last, phys_to_page(sector->paddr), + sectorsize, offset_in_page(sector->paddr)); if (ret == sectorsize) return 0; } @@ -1187,7 +1197,8 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; bio->bi_private = rbio; - __bio_add_page(bio, sector->page, sectorsize, sector->pgoff); + __bio_add_page(bio, phys_to_page(sector->paddr), sectorsize, + offset_in_page(sector->paddr)); bio_list_add(bio_list, bio); return 0; } @@ -1195,23 +1206,20 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - struct bio_vec bvec; - struct bvec_iter iter; + const u32 sectorsize_bits = rbio->bioc->fs_info->sectorsize_bits; + struct bvec_iter iter = bio->bi_iter; u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - rbio->bioc->full_stripe_logical; - bio_for_each_segment(bvec, bio, iter) { - u32 bvec_offset; - - for (bvec_offset = 0; bvec_offset < bvec.bv_len; - bvec_offset += sectorsize, offset += sectorsize) { - int index = offset / sectorsize; - struct sector_ptr *sector = &rbio->bio_sectors[index]; + while (iter.bi_size) { + unsigned int index = (offset >> sectorsize_bits); + struct sector_ptr *sector = &rbio->bio_sectors[index]; + struct bio_vec bv = bio_iter_iovec(bio, iter); - sector->page = bvec.bv_page; - sector->pgoff = bvec.bv_offset + bvec_offset; - ASSERT(sector->pgoff < PAGE_SIZE); - } + sector->has_paddr = true; + sector->paddr = bvec_phys(&bv); + bio_advance_iter_single(bio, &iter, sectorsize); + offset += sectorsize; } } @@ -1289,6 +1297,15 @@ static void assert_rbio(struct btrfs_raid_bio *rbio) ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio); } +static inline void *kmap_local_sector(const struct sector_ptr *sector) +{ + /* The sector pointer must have a page mapped to it. */ + ASSERT(sector->has_paddr); + + return kmap_local_page(phys_to_page(sector->paddr)) + + offset_in_page(sector->paddr); +} + /* Generate PQ for one vertical stripe. 
*/ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) { @@ -1301,14 +1318,13 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) /* First collect one sector from each data stripe */ for (stripe = 0; stripe < rbio->nr_data; stripe++) { sector = sector_in_rbio(rbio, stripe, sectornr, 0); - pointers[stripe] = kmap_local_page(sector->page) + - sector->pgoff; + pointers[stripe] = kmap_local_sector(sector); } /* Then add the parity stripe */ sector = rbio_pstripe_sector(rbio, sectornr); sector->uptodate = 1; - pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; + pointers[stripe++] = kmap_local_sector(sector); if (has_qstripe) { /* @@ -1317,8 +1333,7 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) */ sector = rbio_qstripe_sector(rbio, sectornr); sector->uptodate = 1; - pointers[stripe++] = kmap_local_page(sector->page) + - sector->pgoff; + pointers[stripe++] = kmap_local_sector(sector); assert_rbio(rbio); raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, @@ -1477,15 +1492,14 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) * stripe_pages[], thus we need to locate the sector. */ static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, - struct page *page, - unsigned int pgoff) + phys_addr_t paddr) { int i; for (i = 0; i < rbio->nr_sectors; i++) { struct sector_ptr *sector = &rbio->stripe_sectors[i]; - if (sector->page == page && sector->pgoff == pgoff) + if (sector->has_paddr && sector->paddr == paddr) return sector; } return NULL; @@ -1505,11 +1519,10 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) bio_for_each_segment_all(bvec, bio, iter_all) { struct sector_ptr *sector; - int pgoff; + phys_addr_t paddr = bvec_phys(bvec); - for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len; - pgoff += sectorsize) { - sector = find_stripe_sector(rbio, bvec->bv_page, pgoff); + for (u32 off = 0; off < bvec->bv_len; off += sectorsize) { + sector = find_stripe_sector(rbio, paddr + off); ASSERT(sector); if (sector) sector->uptodate = 1; @@ -1519,17 +1532,14 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) { - struct bio_vec *bv = bio_first_bvec_all(bio); + phys_addr_t bvec_paddr = bvec_phys(bio_first_bvec_all(bio)); int i; for (i = 0; i < rbio->nr_sectors; i++) { - struct sector_ptr *sector; - - sector = &rbio->stripe_sectors[i]; - if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + if (rbio->stripe_sectors[i].paddr == bvec_paddr) break; - sector = &rbio->bio_sectors[i]; - if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + if (rbio->bio_sectors[i].has_paddr && + rbio->bio_sectors[i].paddr == bvec_paddr) break; } ASSERT(i < rbio->nr_sectors); @@ -1575,11 +1585,11 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, return; bio_for_each_segment_all(bvec, bio, iter_all) { - int bv_offset; + void *kaddr; - for (bv_offset = bvec->bv_offset; - bv_offset < bvec->bv_offset + bvec->bv_len; - bv_offset += fs_info->sectorsize, total_sector_nr++) { + kaddr = bvec_kmap_local(bvec); + for (u32 off = 0; off < bvec->bv_len; + off += fs_info->sectorsize, total_sector_nr++) { u8 csum_buf[BTRFS_CSUM_SIZE]; u8 *expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size; @@ -1589,11 +1599,12 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, if 
(!test_bit(total_sector_nr, rbio->csum_bitmap)) continue; - ret = btrfs_check_sector_csum(fs_info, bvec->bv_page, - bv_offset, csum_buf, expected_csum); + ret = btrfs_check_sector_csum(fs_info, kaddr + off, + csum_buf, expected_csum); if (ret < 0) set_bit(total_sector_nr, rbio->error_bitmap); } + kunmap_local(kaddr); } } @@ -1689,8 +1700,8 @@ static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule) list_sort(NULL, &plug->rbio_list, plug_cmp); while (!list_empty(&plug->rbio_list)) { - cur = list_entry(plug->rbio_list.next, - struct btrfs_raid_bio, plug_list); + cur = list_first_entry(&plug->rbio_list, + struct btrfs_raid_bio, plug_list); list_del_init(&cur->plug_list); if (rbio_is_full(cur)) { @@ -1791,6 +1802,7 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, struct sector_ptr *sector; u8 csum_buf[BTRFS_CSUM_SIZE]; u8 *csum_expected; + void *kaddr; int ret; if (!rbio->csum_bitmap || !rbio->csum_buf) @@ -1809,13 +1821,12 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); } - ASSERT(sector->page); - csum_expected = rbio->csum_buf + (stripe_nr * rbio->stripe_nsectors + sector_nr) * fs_info->csum_size; - ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff, - csum_buf, csum_expected); + kaddr = kmap_local_sector(sector); + ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, csum_expected); + kunmap_local(kaddr); return ret; } @@ -1872,9 +1883,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, } else { sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); } - ASSERT(sector->page); - pointers[stripe_nr] = kmap_local_page(sector->page) + - sector->pgoff; + pointers[stripe_nr] = kmap_local_sector(sector); unmap_array[stripe_nr] = pointers[stripe_nr]; } @@ -2282,9 +2291,8 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) static void raid_wait_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - blk_status_t err = bio->bi_status; - if (err) + if (bio->bi_status) rbio_update_error_bitmap(rbio, bio); bio_put(bio); if (atomic_dec_and_test(&rbio->stripes_pending)) @@ -2326,7 +2334,7 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) * thus this rbio can not be cached one, as cached one must * have all its data sectors present and uptodate. 
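The sector_ptr rework in the raid56.c hunks above replaces the (page, pgoff) pair with one physical address plus a has_paddr flag; the page and the in-page offset are recomputed only where needed, via phys_to_page() and offset_in_page(), and kmap_local_sector() wraps the mapping step. A tiny userspace model of the underlying arithmetic, assuming 4 KiB pages; page_index() and offset_in_page() below are local helpers, not the kernel macros.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

/* One address encodes both pieces of the old (page, offset) pair. */
static unsigned long page_index(uint64_t paddr)     { return paddr >> PAGE_SHIFT; }
static unsigned long offset_in_page(uint64_t paddr) { return paddr & (PAGE_SIZE - 1); }

int main(void)
{
	/* Sector at 2 KiB into (pseudo) page frame 123. */
	uint64_t paddr = ((uint64_t)123 << PAGE_SHIFT) + 2048;

	printf("page %lu, offset %lu\n", page_index(paddr), offset_in_page(paddr));
	return 0;
}

As the updated struct comment notes, the motivation is that pages coming from the bio list can still be highmem, so the code keeps only the physical address and maps it with kmap_local_page() just around the accesses.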
*/ - if (!sector->page || !sector->uptodate) + if (!sector->has_paddr || !sector->uptodate) return true; } return false; @@ -2516,6 +2524,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) int stripe; int sectornr; bool has_qstripe; + struct page *page; struct sector_ptr p_sector = { 0 }; struct sector_ptr q_sector = { 0 }; struct bio_list bio_list; @@ -2547,29 +2556,33 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) */ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - p_sector.page = alloc_page(GFP_NOFS); - if (!p_sector.page) + page = alloc_page(GFP_NOFS); + if (!page) return -ENOMEM; - p_sector.pgoff = 0; + p_sector.has_paddr = true; + p_sector.paddr = page_to_phys(page); p_sector.uptodate = 1; + page = NULL; if (has_qstripe) { /* RAID6, allocate and map temp space for the Q stripe */ - q_sector.page = alloc_page(GFP_NOFS); - if (!q_sector.page) { - __free_page(p_sector.page); - p_sector.page = NULL; + page = alloc_page(GFP_NOFS); + if (!page) { + __free_page(phys_to_page(p_sector.paddr)); + p_sector.has_paddr = false; return -ENOMEM; } - q_sector.pgoff = 0; + q_sector.has_paddr = true; + q_sector.paddr = page_to_phys(page); q_sector.uptodate = 1; - pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page); + page = NULL; + pointers[rbio->real_stripes - 1] = kmap_local_sector(&q_sector); } bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* Map the parity stripe just once */ - pointers[nr_data] = kmap_local_page(p_sector.page); + pointers[nr_data] = kmap_local_sector(&p_sector); for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; @@ -2578,8 +2591,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) /* first collect one page from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { sector = sector_in_rbio(rbio, stripe, sectornr, 0); - pointers[stripe] = kmap_local_page(sector->page) + - sector->pgoff; + pointers[stripe] = kmap_local_sector(sector); } if (has_qstripe) { @@ -2595,7 +2607,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) /* Check scrubbing parity and repair it */ sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); - parity = kmap_local_page(sector->page) + sector->pgoff; + parity = kmap_local_sector(sector); if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0) memcpy(parity, pointers[rbio->scrubp], sectorsize); else @@ -2608,12 +2620,11 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) } kunmap_local(pointers[nr_data]); - __free_page(p_sector.page); - p_sector.page = NULL; - if (q_sector.page) { - kunmap_local(pointers[rbio->real_stripes - 1]); - __free_page(q_sector.page); - q_sector.page = NULL; + __free_page(phys_to_page(p_sector.paddr)); + p_sector.has_paddr = false; + if (q_sector.has_paddr) { + __free_page(phys_to_page(q_sector.paddr)); + q_sector.has_paddr = false; } /* diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index f0824c948cb7..62161beca559 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -87,7 +87,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, btrfs_alloc_write_mask(mapping)); if (IS_ERR(folio)) { - ret = -ENOMEM; + ret = PTR_ERR(folio); goto out_unlock; } @@ -95,9 +95,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode, if (ret < 0) goto out_unlock; - clear_extent_bit(&inode->io_tree, file_offset, range_end, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - NULL); + btrfs_clear_extent_bits(&inode->io_tree, 
file_offset, range_end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG); ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); if (ret) goto out_unlock; @@ -165,7 +164,7 @@ out: * the source inode to destination inode when possible. When not possible we * copy the inline extent's data into the respective page of the inode. */ -static int clone_copy_inline_extent(struct inode *dst, +static int clone_copy_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_key *new_key, const u64 drop_start, @@ -175,8 +174,8 @@ static int clone_copy_inline_extent(struct inode *dst, char *inline_data, struct btrfs_trans_handle **trans_out) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(dst); - struct btrfs_root *root = BTRFS_I(dst)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; const u64 aligned_end = ALIGN(new_key->offset + datal, fs_info->sectorsize); struct btrfs_trans_handle *trans = NULL; @@ -185,12 +184,12 @@ static int clone_copy_inline_extent(struct inode *dst, struct btrfs_key key; if (new_key->offset > 0) { - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + ret = copy_inline_to_page(inode, new_key->offset, inline_data, size, datal, comp_type); goto out; } - key.objectid = btrfs_ino(BTRFS_I(dst)); + key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -205,7 +204,7 @@ static int clone_copy_inline_extent(struct inode *dst, goto copy_inline_extent; } btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == btrfs_ino(BTRFS_I(dst)) && + if (key.objectid == btrfs_ino(inode) && key.type == BTRFS_EXTENT_DATA_KEY) { /* * There's an implicit hole at file offset 0, copy the @@ -214,7 +213,7 @@ static int clone_copy_inline_extent(struct inode *dst, ASSERT(key.offset > 0); goto copy_to_page; } - } else if (i_size_read(dst) <= datal) { + } else if (i_size_read(&inode->vfs_inode) <= datal) { struct btrfs_file_extent_item *ei; ei = btrfs_item_ptr(path->nodes[0], path->slots[0], @@ -236,7 +235,7 @@ copy_inline_extent: * We have no extent items, or we have an extent at offset 0 which may * or may not be inlined. All these cases are dealt the same way. 
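The copy_inline_to_page() change at the top of the reflink.c diff returns PTR_ERR(folio) instead of a hard-coded -ENOMEM, so whatever error __filemap_get_folio() encoded into the pointer is propagated. A minimal userspace model of the ERR_PTR()/IS_ERR()/PTR_ERR() convention behind that one-liner; get_folio() is a made-up stand-in for the page-cache lookup.

#include <stdio.h>
#include <stdint.h>

#define MAX_ERRNO 4095

/* Userspace copies of the kernel's pointer-encoded error convention. */
static void *ERR_PTR(long error)      { return (void *)error; }
static long PTR_ERR(const void *ptr)  { return (long)(intptr_t)ptr; }
static int  IS_ERR(const void *ptr)   { return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO; }

/* Made-up stand-in: fails with an arbitrary errno, not only -ENOMEM. */
static void *get_folio(long error)
{
	static char folio;

	return error ? ERR_PTR(error) : (void *)&folio;
}

int main(void)
{
	void *folio = get_folio(-11);	/* e.g. -EAGAIN */
	long ret = 0;

	if (IS_ERR(folio))
		ret = PTR_ERR(folio);	/* keep the real error instead of masking it */
	printf("ret=%ld\n", ret);	/* prints -11 */
	return 0;
}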
*/ - if (i_size_read(dst) > datal) { + if (i_size_read(&inode->vfs_inode) > datal) { /* * At the destination offset 0 we have either a hole, a regular * extent or an inline extent larger then the one we want to @@ -270,7 +269,7 @@ copy_inline_extent: drop_args.start = drop_start; drop_args.end = aligned_end; drop_args.drop_cache = true; - ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) goto out; ret = btrfs_insert_empty_item(trans, root, path, new_key, size); @@ -281,9 +280,9 @@ copy_inline_extent: btrfs_item_ptr_offset(path->nodes[0], path->slots[0]), size); - btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found); - btrfs_set_inode_full_sync(BTRFS_I(dst)); - ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end); + btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found); + btrfs_set_inode_full_sync(inode); + ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end); out: if (!ret && !trans) { /* @@ -318,7 +317,7 @@ copy_to_page: */ btrfs_release_path(path); - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + ret = copy_inline_to_page(inode, new_key->offset, inline_data, size, datal, comp_type); goto out; } @@ -526,7 +525,7 @@ process_slot: goto out; } - ret = clone_copy_inline_extent(inode, path, &new_key, + ret = clone_copy_inline_extent(BTRFS_I(inode), path, &new_key, drop_start, datal, size, comp, buf, &trans); if (ret) @@ -617,26 +616,26 @@ out: return ret; } -static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2) +static void btrfs_double_mmap_lock(struct btrfs_inode *inode1, struct btrfs_inode *inode2) { if (inode1 < inode2) swap(inode1, inode2); - down_write(&BTRFS_I(inode1)->i_mmap_lock); - down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING); + down_write(&inode1->i_mmap_lock); + down_write_nested(&inode2->i_mmap_lock, SINGLE_DEPTH_NESTING); } -static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2) +static void btrfs_double_mmap_unlock(struct btrfs_inode *inode1, struct btrfs_inode *inode2) { - up_write(&BTRFS_I(inode1)->i_mmap_lock); - up_write(&BTRFS_I(inode2)->i_mmap_lock); + up_write(&inode1->i_mmap_lock); + up_write(&inode2->i_mmap_lock); } -static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, - struct inode *dst, u64 dst_loff) +static int btrfs_extent_same_range(struct btrfs_inode *src, u64 loff, u64 len, + struct btrfs_inode *dst, u64 dst_loff) { const u64 end = dst_loff + len - 1; struct extent_state *cached_state = NULL; - struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info; + struct btrfs_fs_info *fs_info = src->root->fs_info; const u64 bs = fs_info->sectorsize; int ret; @@ -646,9 +645,10 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, * because we have already locked the inode's i_mmap_lock in exclusive * mode. 
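btrfs_double_mmap_lock() above takes the two i_mmap_lock rwsems in a fixed order, by inode address, with the second acquisition annotated SINGLE_DEPTH_NESTING for lockdep; a consistent order is what prevents an ABBA deadlock when two tasks lock the same pair of inodes. A small pthread sketch of the same pattern, with illustrative names (it happens to lock the lower address first, while the btrfs helper locks the higher one; only consistency matters).

#include <stdio.h>
#include <pthread.h>

struct obj {
	pthread_mutex_t lock;
};

/* Impose a global order by address so the pair is always taken the same way. */
static void double_lock(struct obj *a, struct obj *b)
{
	if (a > b) {
		struct obj *tmp = a;

		a = b;
		b = tmp;
	}
	pthread_mutex_lock(&a->lock);
	if (a != b)
		pthread_mutex_lock(&b->lock);
}

static void double_unlock(struct obj *a, struct obj *b)
{
	pthread_mutex_unlock(&a->lock);
	if (a != b)
		pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	struct obj x = { .lock = PTHREAD_MUTEX_INITIALIZER };
	struct obj y = { .lock = PTHREAD_MUTEX_INITIALIZER };

	double_lock(&x, &y);
	printf("both locks held\n");
	double_unlock(&x, &y);
	return 0;
}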
*/ - lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state); - ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); - unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state); + btrfs_lock_extent(&dst->io_tree, dst_loff, end, &cached_state); + ret = btrfs_clone(&src->vfs_inode, &dst->vfs_inode, loff, len, + ALIGN(len, bs), dst_loff, 1); + btrfs_unlock_extent(&dst->io_tree, dst_loff, end, &cached_state); btrfs_btree_balance_dirty(fs_info); @@ -678,8 +678,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); for (i = 0; i < chunk_count; i++) { - ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, - dst, dst_loff); + ret = btrfs_extent_same_range(BTRFS_I(src), loff, BTRFS_MAX_DEDUPE_LEN, + BTRFS_I(dst), dst_loff); if (ret) goto out; @@ -688,7 +688,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, } if (tail_len > 0) - ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff); + ret = btrfs_extent_same_range(BTRFS_I(src), loff, tail_len, + BTRFS_I(dst), dst_loff); out: spin_lock(&root_dst->root_item_lock); root_dst->dedupe_in_progress--; @@ -747,9 +748,9 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, * mode. */ end = destoff + len - 1; - lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); + btrfs_lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); - unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); /* * We may have copied an inline extent into a page of the destination @@ -775,24 +776,24 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags) { - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - u64 bs = BTRFS_I(inode_out)->root->fs_info->sectorsize; + struct btrfs_inode *inode_in = BTRFS_I(file_inode(file_in)); + struct btrfs_inode *inode_out = BTRFS_I(file_inode(file_out)); + u64 bs = inode_out->root->fs_info->sectorsize; u64 wb_len; int ret; if (!(remap_flags & REMAP_FILE_DEDUP)) { - struct btrfs_root *root_out = BTRFS_I(inode_out)->root; + struct btrfs_root *root_out = inode_out->root; if (btrfs_root_readonly(root_out)) return -EROFS; - ASSERT(inode_in->i_sb == inode_out->i_sb); + ASSERT(inode_in->vfs_inode.i_sb == inode_out->vfs_inode.i_sb); } /* Don't make the dst file partly checksummed */ - if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) { + if ((inode_in->flags & BTRFS_INODE_NODATASUM) != + (inode_out->flags & BTRFS_INODE_NODATASUM)) { return -EINVAL; } @@ -811,7 +812,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, * to complete so that new file extent items are in the fs tree. */ if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP)) - wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs); + wb_len = ALIGN(inode_in->vfs_inode.i_size, bs) - ALIGN_DOWN(pos_in, bs); else wb_len = ALIGN(*len, bs); @@ -832,16 +833,14 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, * Also we don't need to check ASYNC_EXTENT, as async extent will be * CoWed anyway, not affecting nocow part. 
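btrfs_remap_file_range_prep() above rounds the writeback window to the sector size: when len is 0 (clone up to EOF, non-dedupe) it uses ALIGN(i_size, bs) - ALIGN_DOWN(pos_in, bs), otherwise ALIGN(len, bs), and the ordered-range waits that follow start at ALIGN_DOWN(pos_in, bs). A quick standalone check of that arithmetic with userspace copies of the power-of-two alignment helpers; the numbers are made up.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;

/* Power-of-two rounding, equivalent to the kernel's ALIGN()/ALIGN_DOWN(). */
#define ALIGN(x, a)		(((x) + (a) - 1) & ~((u64)(a) - 1))
#define ALIGN_DOWN(x, a)	((x) & ~((u64)(a) - 1))

int main(void)
{
	u64 bs = 4096;		/* sectorsize */
	u64 pos_in = 10000;	/* unaligned source offset */
	u64 i_size = 123456;	/* source inode size */
	u64 wb_len = ALIGN(i_size, bs) - ALIGN_DOWN(pos_in, bs);

	/* Wait for ordered extents from the block containing pos_in to EOF rounded up. */
	printf("wait from %llu for %llu bytes\n",
	       (unsigned long long)ALIGN_DOWN(pos_in, bs),
	       (unsigned long long)wb_len);	/* 8192 and 118784 */
	return 0;
}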
*/ - ret = filemap_flush(inode_in->i_mapping); + ret = filemap_flush(inode_in->vfs_inode.i_mapping); if (ret < 0) return ret; - ret = btrfs_wait_ordered_range(BTRFS_I(inode_in), ALIGN_DOWN(pos_in, bs), - wb_len); + ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), wb_len); if (ret < 0) return ret; - ret = btrfs_wait_ordered_range(BTRFS_I(inode_out), ALIGN_DOWN(pos_out, bs), - wb_len); + ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), wb_len); if (ret < 0) return ret; @@ -863,8 +862,8 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, loff_t len, unsigned int remap_flags) { - struct inode *src_inode = file_inode(src_file); - struct inode *dst_inode = file_inode(dst_file); + struct btrfs_inode *src_inode = BTRFS_I(file_inode(src_file)); + struct btrfs_inode *dst_inode = BTRFS_I(file_inode(dst_file)); bool same_inode = dst_inode == src_inode; int ret; @@ -872,9 +871,9 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, return -EINVAL; if (same_inode) { - btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); + btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP); } else { - lock_two_nondirectories(src_inode, dst_inode); + lock_two_nondirectories(&src_inode->vfs_inode, &dst_inode->vfs_inode); btrfs_double_mmap_lock(src_inode, dst_inode); } @@ -884,16 +883,18 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, goto out_unlock; if (remap_flags & REMAP_FILE_DEDUP) - ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); + ret = btrfs_extent_same(&src_inode->vfs_inode, off, len, + &dst_inode->vfs_inode, destoff); else ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); out_unlock: if (same_inode) { - btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP); } else { btrfs_double_mmap_unlock(src_inode, dst_inode); - unlock_two_nondirectories(src_inode, dst_inode); + unlock_two_nondirectories(&src_inode->vfs_inode, + &dst_inode->vfs_inode); } /* diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index af0969b70b53..02086191630d 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -178,8 +178,9 @@ static void mark_block_processed(struct reloc_control *rc, in_range(node->bytenr, rc->block_group->start, rc->block_group->length)) { blocksize = rc->extent_root->fs_info->nodesize; - set_extent_bit(&rc->processed_blocks, node->bytenr, - node->bytenr + blocksize - 1, EXTENT_DIRTY, NULL); + btrfs_set_extent_bit(&rc->processed_blocks, node->bytenr, + node->bytenr + blocksize - 1, EXTENT_DIRTY, + NULL); } node->processed = 1; } @@ -195,8 +196,8 @@ static struct btrfs_backref_node *walk_up_backref( int idx = *index; while (!list_empty(&node->upper)) { - edge = list_entry(node->upper.next, - struct btrfs_backref_edge, list[LOWER]); + edge = list_first_entry(&node->upper, struct btrfs_backref_edge, + list[LOWER]); edges[idx++] = edge; node = edge->node[UPPER]; } @@ -222,8 +223,8 @@ static struct btrfs_backref_node *walk_down_backref( idx--; continue; } - edge = list_entry(edge->list[LOWER].next, - struct btrfs_backref_edge, list[LOWER]); + edge = list_first_entry(&edge->list[LOWER], struct btrfs_backref_edge, + list[LOWER]); edges[idx - 1] = edge; *index = idx; return edge->node[UPPER]; @@ -347,8 +348,8 @@ static bool handle_useless_nodes(struct reloc_control *rc, struct btrfs_backref_edge *edge; struct btrfs_backref_node *lower; - edge = list_entry(cur->lower.next, - struct btrfs_backref_edge, 
list[UPPER]); + edge = list_first_entry(&cur->lower, struct btrfs_backref_edge, + list[UPPER]); list_del(&edge->list[UPPER]); list_del(&edge->list[LOWER]); lower = edge->node[LOWER]; @@ -910,16 +911,16 @@ int replace_file_extents(struct btrfs_trans_handle *trans, /* Take mmap lock to serialize with reflinks. */ if (!down_read_trylock(&inode->i_mmap_lock)) continue; - ret = try_lock_extent(&inode->io_tree, key.offset, - end, &cached_state); + ret = btrfs_try_lock_extent(&inode->io_tree, key.offset, + end, &cached_state); if (!ret) { up_read(&inode->i_mmap_lock); continue; } btrfs_drop_extent_map_range(inode, key.offset, end, true); - unlock_extent(&inode->io_tree, key.offset, end, - &cached_state); + btrfs_unlock_extent(&inode->io_tree, key.offset, end, + &cached_state); up_read(&inode->i_mmap_lock); } } @@ -1378,9 +1379,9 @@ static int invalidate_extent_cache(struct btrfs_root *root, } /* the lock_extent waits for read_folio to complete */ - lock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_lock_extent(&inode->io_tree, start, end, &cached_state); btrfs_drop_extent_map_range(inode, start, end, true); - unlock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); } return 0; } @@ -1697,8 +1698,8 @@ again: rc->merge_reloc_tree = true; while (!list_empty(&rc->reloc_roots)) { - reloc_root = list_entry(rc->reloc_roots.next, - struct btrfs_root, root_list); + reloc_root = list_first_entry(&rc->reloc_roots, + struct btrfs_root, root_list); list_del_init(&reloc_root->root_list); root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, @@ -1813,8 +1814,7 @@ again: while (!list_empty(&reloc_roots)) { found = 1; - reloc_root = list_entry(reloc_roots.next, - struct btrfs_root, root_list); + reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list); root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); @@ -1930,11 +1930,11 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, * reloc root without a corresponding root this could return ENOENT. 
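A recurring cleanup in this series, visible again in the relocation.c hunks above and earlier in async-thread.c and raid56.c, replaces open-coded list_entry(head->next, ...) and list_entry(head->prev, ...) with list_first_entry() and list_last_entry(), which say what is being fetched. A tiny userspace reconstruction showing the two spellings resolve to the same element; the list helpers below are pared-down copies, not the full kernel ones.

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define list_entry(ptr, type, member)		container_of(ptr, type, member)
#define list_first_entry(head, type, member)	list_entry((head)->next, type, member)

struct item {
	int value;
	struct list_head list;
};

int main(void)
{
	struct item a = { .value = 42 };
	struct list_head head = { .next = &a.list, .prev = &a.list };

	a.list.next = &head;
	a.list.prev = &head;

	/* Same element either way; the new spelling states the intent. */
	printf("%d %d\n",
	       list_entry(head.next, struct item, list)->value,
	       list_first_entry(&head, struct item, list)->value);
	return 0;
}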
*/ if (IS_ERR(root)) { - ASSERT(0); + DEBUG_WARN("error %ld reading root for reloc root", PTR_ERR(root)); return PTR_ERR(root); } if (root->reloc_root != reloc_root) { - ASSERT(0); + DEBUG_WARN("unexpected reloc root found"); btrfs_err(fs_info, "root %llu has two reloc roots associated with it", reloc_root->root_key.offset); @@ -2109,8 +2109,8 @@ static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc, if (list_empty(&next->upper)) break; - edge = list_entry(next->upper.next, - struct btrfs_backref_edge, list[LOWER]); + edge = list_first_entry(&next->upper, struct btrfs_backref_edge, + list[LOWER]); edges[index++] = edge; next = edge->node[UPPER]; } @@ -2356,8 +2356,8 @@ static int finish_pending_nodes(struct btrfs_trans_handle *trans, for (level = 0; level < BTRFS_MAX_LEVEL; level++) { while (!list_empty(&cache->pending[level])) { - node = list_entry(cache->pending[level].next, - struct btrfs_backref_node, list); + node = list_first_entry(&cache->pending[level], + struct btrfs_backref_node, list); list_move_tail(&node->list, &list); BUG_ON(!node->pending); @@ -2395,8 +2395,8 @@ static void update_processed_blocks(struct reloc_control *rc, if (list_empty(&next->upper)) break; - edge = list_entry(next->upper.next, - struct btrfs_backref_edge, list[LOWER]); + edge = list_first_entry(&next->upper, struct btrfs_backref_edge, + list[LOWER]); edges[index++] = edge; next = edge->node[UPPER]; } @@ -2408,8 +2408,8 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc) { u32 blocksize = rc->extent_root->fs_info->nodesize; - if (test_range_bit(&rc->processed_blocks, bytenr, - bytenr + blocksize - 1, EXTENT_DIRTY, NULL)) + if (btrfs_test_range_bit(&rc->processed_blocks, bytenr, + bytenr + blocksize - 1, EXTENT_DIRTY, NULL)) return 1; return 0; } @@ -2706,9 +2706,6 @@ static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control if (ret < 0) return ret; - clear_extent_bits(&inode->io_tree, i_size, - round_up(i_size, PAGE_SIZE) - 1, - EXTENT_UPTODATE); folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT); /* * If page is freed we don't need to do anything then, as we @@ -2738,21 +2735,21 @@ static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control else end = cluster->end - offset; - lock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_lock_extent(&inode->io_tree, start, end, &cached_state); num_bytes = end + 1 - start; ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); cur_offset = end + 1; - unlock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); if (ret) break; } btrfs_inode_unlock(inode, 0); if (cur_offset < prealloc_end) - btrfs_free_reserved_data_space_noquota(inode->root->fs_info, - prealloc_end + 1 - cur_offset); + btrfs_free_reserved_data_space_noquota(inode, + prealloc_end + 1 - cur_offset); return ret; } @@ -2766,7 +2763,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_contr u64 end = rc->cluster.end - offset; int ret = 0; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) return -ENOMEM; @@ -2777,10 +2774,10 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_contr em->ram_bytes = em->len; em->flags |= EXTENT_FLAG_PINNED; - lock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_lock_extent(&inode->io_tree, start, end, &cached_state); ret = btrfs_replace_extent_map_range(inode, em, false); - 
unlock_extent(&inode->io_tree, start, end, &cached_state); - free_extent_map(em); + btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_free_extent_map(em); return ret; } @@ -2902,15 +2899,15 @@ again: goto release_folio; /* Mark the range delalloc and dirty for later writeback */ - lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, - &cached_state); + btrfs_lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, + clamped_end, &cached_state); ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, clamped_end, 0, &cached_state); if (ret) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, - clamped_start, clamped_end, - EXTENT_LOCKED | EXTENT_BOUNDARY, - &cached_state); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, + clamped_start, clamped_end, + EXTENT_LOCKED | EXTENT_BOUNDARY, + &cached_state); btrfs_delalloc_release_metadata(BTRFS_I(inode), clamped_len, true); btrfs_delalloc_release_extents(BTRFS_I(inode), @@ -2932,12 +2929,12 @@ again: u64 boundary_end = boundary_start + fs_info->sectorsize - 1; - set_extent_bit(&BTRFS_I(inode)->io_tree, - boundary_start, boundary_end, - EXTENT_BOUNDARY, NULL); + btrfs_set_extent_bit(&BTRFS_I(inode)->io_tree, + boundary_start, boundary_end, + EXTENT_BOUNDARY, NULL); } - unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, - &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, + &cached_state); btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); cur += clamped_len; @@ -3239,21 +3236,23 @@ out: return ret; } -static int delete_block_group_cache(struct btrfs_fs_info *fs_info, - struct btrfs_block_group *block_group, +static int delete_block_group_cache(struct btrfs_block_group *block_group, struct inode *inode, u64 ino) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; + struct btrfs_inode *btrfs_inode; int ret = 0; if (inode) goto truncate; - inode = btrfs_iget(ino, root); - if (IS_ERR(inode)) + btrfs_inode = btrfs_iget(ino, root); + if (IS_ERR(btrfs_inode)) return -ENOENT; + inode = &btrfs_inode->vfs_inode; truncate: ret = btrfs_check_trunc_cache_free_space(fs_info, @@ -3313,8 +3312,7 @@ static int delete_v1_space_cache(struct extent_buffer *leaf, } if (!found) return -ENOENT; - ret = delete_block_group_cache(leaf->fs_info, block_group, NULL, - space_cache_ino); + ret = delete_block_group_cache(block_group, NULL, space_cache_ino); return ret; } @@ -3434,9 +3432,9 @@ next: goto next; } - block_found = find_first_extent_bit(&rc->processed_blocks, - key.objectid, &start, &end, - EXTENT_DIRTY, NULL); + block_found = btrfs_find_first_extent_bit(&rc->processed_blocks, + key.objectid, &start, &end, + EXTENT_DIRTY, NULL); if (block_found && start <= key.objectid) { btrfs_release_path(path); @@ -3645,7 +3643,7 @@ restart: } btrfs_release_path(path); - clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); + btrfs_clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); if (trans) { btrfs_end_transaction_throttle(trans); @@ -3761,10 +3759,10 @@ out: * the inode is in data relocation tree and its link count is 0 */ static noinline_for_stack struct inode *create_reloc_inode( - struct btrfs_fs_info *fs_info, const struct btrfs_block_group *group) { - struct inode *inode = NULL; + struct btrfs_fs_info *fs_info = group->fs_info; + struct btrfs_inode *inode = NULL; struct btrfs_trans_handle *trans; struct btrfs_root *root; u64 objectid; @@ -3792,18 +3790,19 
@@ static noinline_for_stack struct inode *create_reloc_inode( inode = NULL; goto out; } - BTRFS_I(inode)->reloc_block_group_start = group->start; + inode->reloc_block_group_start = group->start; - ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + ret = btrfs_orphan_add(trans, inode); out: btrfs_put_root(root); btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); if (ret) { - iput(inode); - inode = ERR_PTR(ret); + if (inode) + iput(&inode->vfs_inode); + return ERR_PTR(ret); } - return inode; + return &inode->vfs_inode; } /* @@ -3860,7 +3859,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) btrfs_backref_init_cache(fs_info, &rc->backref_cache, true); rc->reloc_root_tree.rb_root = RB_ROOT; spin_lock_init(&rc->reloc_root_tree.lock); - extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); + btrfs_extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); return rc; } @@ -3977,7 +3976,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) btrfs_free_path(path); if (!IS_ERR(inode)) - ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0); + ret = delete_block_group_cache(rc->block_group, inode, 0); else ret = PTR_ERR(inode); @@ -3986,7 +3985,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) goto out; } - rc->data_inode = create_reloc_inode(fs_info, rc->block_group); + rc->data_inode = create_reloc_inode(rc->block_group); if (IS_ERR(rc->data_inode)) { err = PTR_ERR(rc->data_inode); rc->data_inode = NULL; @@ -4183,8 +4182,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) rc->merge_reloc_tree = true; while (!list_empty(&reloc_roots)) { - reloc_root = list_entry(reloc_roots.next, - struct btrfs_root, root_list); + reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list); list_del(&reloc_root->root_list); if (btrfs_root_refs(&reloc_root->root_item) == 0) { @@ -4277,7 +4275,7 @@ int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered) while (!list_empty(&list)) { struct btrfs_ordered_sum *sums = - list_entry(list.next, struct btrfs_ordered_sum, list); + list_first_entry(&list, struct btrfs_ordered_sum, list); list_del_init(&sums->list); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 531312efee8d..ce36fafc771e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -66,8 +66,6 @@ struct scrub_ctx; /* Represent one sector and its needed info to verify the content. */ struct scrub_sector_verification { - bool is_metadata; - union { /* * Csum pointer for data csum verification. Should point to a @@ -100,6 +98,38 @@ enum scrub_stripe_flags { SCRUB_STRIPE_FLAG_NO_REPORT, }; +/* + * We have multiple bitmaps for one scrub_stripe. + * However each bitmap has at most (BTRFS_STRIPE_LEN / blocksize) bits, + * which is normally 16, and much smaller than BITS_PER_LONG (32 or 64). + * + * So to reduce memory usage for each scrub_stripe, we pack those bitmaps + * into a larger one. + * + * These enum records where the sub-bitmap are inside the larger one. + * Each subbitmap starts at scrub_bitmap_nr_##name * nr_sectors bit. + */ +enum { + /* Which blocks are covered by extent items. */ + scrub_bitmap_nr_has_extent = 0, + + /* Which blocks are meteadata. */ + scrub_bitmap_nr_is_metadata, + + /* + * Which blocks have errors, including IO, csum, and metadata + * errors. + * This sub-bitmap is the OR results of the next few error related + * sub-bitmaps. 
+ */ + scrub_bitmap_nr_error, + scrub_bitmap_nr_io_error, + scrub_bitmap_nr_csum_error, + scrub_bitmap_nr_meta_error, + scrub_bitmap_nr_meta_gen_error, + scrub_bitmap_nr_last, +}; + #define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE) /* @@ -138,36 +168,15 @@ struct scrub_stripe { */ unsigned long state; - /* Indicate which sectors are covered by extent items. */ - unsigned long extent_sector_bitmap; - - /* - * The errors hit during the initial read of the stripe. - * - * Would be utilized for error reporting and repair. - * - * The remaining init_nr_* records the number of errors hit, only used - * by error reporting. - */ - unsigned long init_error_bitmap; - unsigned int init_nr_io_errors; - unsigned int init_nr_csum_errors; - unsigned int init_nr_meta_errors; + /* The large bitmap contains all the sub-bitmaps. */ + unsigned long bitmaps[BITS_TO_LONGS(scrub_bitmap_nr_last * + (BTRFS_STRIPE_LEN / BTRFS_MIN_BLOCKSIZE))]; /* - * The following error bitmaps are all for the current status. - * Every time we submit a new read, these bitmaps may be updated. - * - * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap; - * - * IO and csum errors can happen for both metadata and data. + * For writeback (repair or replace) error reporting. + * This one is protected by a spinlock, thus can not be packed into + * the larger bitmap. */ - unsigned long error_bitmap; - unsigned long io_error_bitmap; - unsigned long csum_error_bitmap; - unsigned long meta_error_bitmap; - - /* For writeback (repair or replace) error reporting. */ unsigned long write_error_bitmap; /* Writeback can be concurrent, thus we need to protect the bitmap. */ @@ -219,6 +228,90 @@ struct scrub_ctx { refcount_t refs; }; +#define scrub_calc_start_bit(stripe, name, block_nr) \ +({ \ + unsigned int __start_bit; \ + \ + ASSERT(block_nr < stripe->nr_sectors, \ + "nr_sectors=%u block_nr=%u", stripe->nr_sectors, block_nr); \ + __start_bit = scrub_bitmap_nr_##name * stripe->nr_sectors + block_nr; \ + __start_bit; \ +}) + +#define IMPLEMENT_SCRUB_BITMAP_OPS(name) \ +static inline void scrub_bitmap_set_##name(struct scrub_stripe *stripe, \ + unsigned int block_nr, \ + unsigned int nr_blocks) \ +{ \ + const unsigned int start_bit = scrub_calc_start_bit(stripe, \ + name, block_nr); \ + \ + bitmap_set(stripe->bitmaps, start_bit, nr_blocks); \ +} \ +static inline void scrub_bitmap_clear_##name(struct scrub_stripe *stripe, \ + unsigned int block_nr, \ + unsigned int nr_blocks) \ +{ \ + const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ + block_nr); \ + \ + bitmap_clear(stripe->bitmaps, start_bit, nr_blocks); \ +} \ +static inline bool scrub_bitmap_test_bit_##name(struct scrub_stripe *stripe, \ + unsigned int block_nr) \ +{ \ + const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ + block_nr); \ + \ + return test_bit(start_bit, stripe->bitmaps); \ +} \ +static inline void scrub_bitmap_set_bit_##name(struct scrub_stripe *stripe, \ + unsigned int block_nr) \ +{ \ + const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ + block_nr); \ + \ + set_bit(start_bit, stripe->bitmaps); \ +} \ +static inline void scrub_bitmap_clear_bit_##name(struct scrub_stripe *stripe, \ + unsigned int block_nr) \ +{ \ + const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ + block_nr); \ + \ + clear_bit(start_bit, stripe->bitmaps); \ +} \ +static inline unsigned long scrub_bitmap_read_##name(struct scrub_stripe *stripe) \ +{ \ + const unsigned int nr_blocks = stripe->nr_sectors; \ + \ + 
ASSERT(nr_blocks > 0 && nr_blocks <= BITS_PER_LONG, \ + "nr_blocks=%u BITS_PER_LONG=%u", \ + nr_blocks, BITS_PER_LONG); \ + \ + return bitmap_read(stripe->bitmaps, nr_blocks * scrub_bitmap_nr_##name, \ + stripe->nr_sectors); \ +} \ +static inline bool scrub_bitmap_empty_##name(struct scrub_stripe *stripe) \ +{ \ + unsigned long bitmap = scrub_bitmap_read_##name(stripe); \ + \ + return bitmap_empty(&bitmap, stripe->nr_sectors); \ +} \ +static inline unsigned int scrub_bitmap_weight_##name(struct scrub_stripe *stripe) \ +{ \ + unsigned long bitmap = scrub_bitmap_read_##name(stripe); \ + \ + return bitmap_weight(&bitmap, stripe->nr_sectors); \ +} +IMPLEMENT_SCRUB_BITMAP_OPS(has_extent); +IMPLEMENT_SCRUB_BITMAP_OPS(is_metadata); +IMPLEMENT_SCRUB_BITMAP_OPS(error); +IMPLEMENT_SCRUB_BITMAP_OPS(io_error); +IMPLEMENT_SCRUB_BITMAP_OPS(csum_error); +IMPLEMENT_SCRUB_BITMAP_OPS(meta_error); +IMPLEMENT_SCRUB_BITMAP_OPS(meta_gen_error); + struct scrub_warning { struct btrfs_path *path; u64 extent_item_size; @@ -228,6 +321,19 @@ struct scrub_warning { struct btrfs_device *dev; }; +struct scrub_error_records { + /* + * Bitmap recording which blocks hit errors (IO/csum/...) during the + * initial read. + */ + unsigned long init_error_bitmap; + + unsigned int nr_io_errors; + unsigned int nr_csum_errors; + unsigned int nr_meta_errors; + unsigned int nr_meta_gen_errors; +}; + static void release_scrub_stripe(struct scrub_stripe *stripe) { if (!stripe) @@ -579,20 +685,15 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) return ret; } -static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr) +static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr) { - struct btrfs_fs_info *fs_info = stripe->bg->fs_info; - int page_index = (sector_nr << fs_info->sectorsize_bits) >> PAGE_SHIFT; + u32 offset = (sector_nr << stripe->bg->fs_info->sectorsize_bits); + const struct page *page = stripe->pages[offset >> PAGE_SHIFT]; - return stripe->pages[page_index]; -} - -static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe, - int sector_nr) -{ - struct btrfs_fs_info *fs_info = stripe->bg->fs_info; - - return offset_in_page(sector_nr << fs_info->sectorsize_bits); + /* stripe->pages[] is allocated by us and no highmem is allowed. */ + ASSERT(page); + ASSERT(!PageHighMem(page)); + return page_address(page) + offset_in_page(offset); } static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr) @@ -600,24 +701,22 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr struct btrfs_fs_info *fs_info = stripe->bg->fs_info; const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits); - const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr); - const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr); + void *first_kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); + struct btrfs_header *header = first_kaddr; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 on_disk_csum[BTRFS_CSUM_SIZE]; u8 calculated_csum[BTRFS_CSUM_SIZE]; - struct btrfs_header *header; /* * Here we don't have a good way to attach the pages (and subpages) * to a dummy extent buffer, thus we have to directly grab the members * from pages. 
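The packed layout is easier to see with concrete numbers. A standalone sketch, assuming a 64K BTRFS_STRIPE_LEN and 4K fs blocks so that nr_sectors is 16; the demo_* names are illustrative only, but the arithmetic is the same as scrub_calc_start_bit() and the generated scrub_bitmap_*() helpers above:

#include <linux/bitmap.h>

#define DEMO_NR_SECTORS		16	/* 64K stripe / 4K blocks, assumed */
#define DEMO_NR_BITMAPS		7	/* scrub_bitmap_nr_last */

/*
 * Sub-bitmap N occupies bits [N * 16, N * 16 + 16) of the packed array:
 *   bits  0..15  has_extent
 *   bits 16..31  is_metadata
 *   bits 32..47  error
 *   bits 48..63  io_error
 *   ...
 */
static unsigned int demo_start_bit(unsigned int sub_bitmap, unsigned int block_nr)
{
	return sub_bitmap * DEMO_NR_SECTORS + block_nr;
}

static void demo(void)
{
	unsigned long bitmaps[BITS_TO_LONGS(DEMO_NR_BITMAPS * DEMO_NR_SECTORS)] = { 0 };
	const unsigned int io_error = 3;	/* scrub_bitmap_nr_io_error */
	unsigned long io_errors;

	/* Flag an I/O error on block 3: sets bit 3 * 16 + 3 = 51. */
	set_bit(demo_start_bit(io_error, 3), bitmaps);

	/* Read the whole io_error window back as one word, the way the
	 * generated scrub_bitmap_read_io_error() does via bitmap_read(). */
	io_errors = bitmap_read(bitmaps, io_error * DEMO_NR_SECTORS, DEMO_NR_SECTORS);
	/* io_errors == 0x8: only block 3 is flagged. */
}

With these numbers the whole packed array is 7 * 16 = 112 bits, i.e. BITS_TO_LONGS(112) = 2 longs on a 64-bit machine, in place of the six separate unsigned long bitmaps removed above.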
*/ - header = (struct btrfs_header *)(page_address(first_page) + first_off); memcpy(on_disk_csum, header->csum, fs_info->csum_size); if (logical != btrfs_stack_header_bytenr(header)) { - bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); - bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad bytenr, has %llu want %llu", logical, stripe->mirror_num, @@ -626,8 +725,8 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr } if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != 0) { - bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); - bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad fsid, has %pU want %pU", logical, stripe->mirror_num, @@ -636,8 +735,8 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr } if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE) != 0) { - bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); - bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", logical, stripe->mirror_num, @@ -648,21 +747,18 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr /* Now check tree block csum. 
*/ shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); - crypto_shash_update(shash, page_address(first_page) + first_off + - BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE); + crypto_shash_update(shash, first_kaddr + BTRFS_CSUM_SIZE, + fs_info->sectorsize - BTRFS_CSUM_SIZE); for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) { - struct page *page = scrub_stripe_get_page(stripe, i); - unsigned int page_off = scrub_stripe_get_page_offset(stripe, i); - - crypto_shash_update(shash, page_address(page) + page_off, + crypto_shash_update(shash, scrub_stripe_get_kaddr(stripe, i), fs_info->sectorsize); } crypto_shash_final(shash, calculated_csum); if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) { - bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); - bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, logical, stripe->mirror_num, @@ -672,8 +768,8 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr } if (stripe->sectors[sector_nr].generation != btrfs_stack_header_generation(header)) { - bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); - bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_set_meta_gen_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad generation, has %llu want %llu", logical, stripe->mirror_num, @@ -681,9 +777,10 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr stripe->sectors[sector_nr].generation); return; } - bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree); - bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); - bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_clear_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_clear_csum_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_clear_meta_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_clear_meta_gen_error(stripe, sector_nr, sectors_per_tree); } static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) @@ -691,23 +788,22 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct scrub_sector_verification *sector = &stripe->sectors[sector_nr]; const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; - struct page *page = scrub_stripe_get_page(stripe, sector_nr); - unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr); + void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); u8 csum_buf[BTRFS_CSUM_SIZE]; int ret; ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors); /* Sector not utilized, skip it. */ - if (!test_bit(sector_nr, &stripe->extent_sector_bitmap)) + if (!scrub_bitmap_test_bit_has_extent(stripe, sector_nr)) return; /* IO error, no need to check. */ - if (test_bit(sector_nr, &stripe->io_error_bitmap)) + if (scrub_bitmap_test_bit_io_error(stripe, sector_nr)) return; /* Metadata, verify the full tree block. */ - if (sector->is_metadata) { + if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) { /* * Check if the tree block crosses the stripe boundary. 
If * crossed the boundary, we cannot verify it but only give a @@ -733,17 +829,17 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) * cases without csum, we have no other choice but to trust it. */ if (!sector->csum) { - clear_bit(sector_nr, &stripe->error_bitmap); + scrub_bitmap_clear_bit_error(stripe, sector_nr); return; } - ret = btrfs_check_sector_csum(fs_info, page, pgoff, csum_buf, sector->csum); + ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, sector->csum); if (ret < 0) { - set_bit(sector_nr, &stripe->csum_error_bitmap); - set_bit(sector_nr, &stripe->error_bitmap); + scrub_bitmap_set_bit_csum_error(stripe, sector_nr); + scrub_bitmap_set_bit_error(stripe, sector_nr); } else { - clear_bit(sector_nr, &stripe->csum_error_bitmap); - clear_bit(sector_nr, &stripe->error_bitmap); + scrub_bitmap_clear_bit_csum_error(stripe, sector_nr); + scrub_bitmap_clear_bit_error(stripe, sector_nr); } } @@ -756,7 +852,7 @@ static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long b for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) { scrub_verify_one_sector(stripe, sector_nr); - if (stripe->sectors[sector_nr].is_metadata) + if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) sector_nr += sectors_per_tree - 1; } } @@ -766,8 +862,7 @@ static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first int i; for (i = 0; i < stripe->nr_sectors; i++) { - if (scrub_stripe_get_page(stripe, i) == first_bvec->bv_page && - scrub_stripe_get_page_offset(stripe, i) == first_bvec->bv_offset) + if (scrub_stripe_get_kaddr(stripe, i) == bvec_virt(first_bvec)) break; } ASSERT(i < stripe->nr_sectors); @@ -795,13 +890,13 @@ static void scrub_repair_read_endio(struct btrfs_bio *bbio) bio_size += bvec->bv_len; if (bbio->bio.bi_status) { - bitmap_set(&stripe->io_error_bitmap, sector_nr, - bio_size >> fs_info->sectorsize_bits); - bitmap_set(&stripe->error_bitmap, sector_nr, - bio_size >> fs_info->sectorsize_bits); + scrub_bitmap_set_io_error(stripe, sector_nr, + bio_size >> fs_info->sectorsize_bits); + scrub_bitmap_set_error(stripe, sector_nr, + bio_size >> fs_info->sectorsize_bits); } else { - bitmap_clear(&stripe->io_error_bitmap, sector_nr, - bio_size >> fs_info->sectorsize_bits); + scrub_bitmap_clear_io_error(stripe, sector_nr, + bio_size >> fs_info->sectorsize_bits); } bio_put(&bbio->bio); if (atomic_dec_and_test(&stripe->pending_io)) @@ -814,27 +909,39 @@ static int calc_next_mirror(int mirror, int num_copies) return (mirror + 1 > num_copies) ? 1 : mirror + 1; } +static void scrub_bio_add_sector(struct btrfs_bio *bbio, struct scrub_stripe *stripe, + int sector_nr) +{ + void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); + int ret; + + ret = bio_add_page(&bbio->bio, virt_to_page(kaddr), bbio->fs_info->sectorsize, + offset_in_page(kaddr)); + /* + * Caller should ensure the bbio has enough size. + * And we cannot use __bio_add_page(), which doesn't do any merge. + * + * Meanwhile for scrub_submit_initial_read() we fully rely on the merge + * to create the minimal amount of bio vectors, for fs block size < page + * size cases. 
+ */ + ASSERT(ret == bbio->fs_info->sectorsize); +} + static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, int mirror, int blocksize, bool wait) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; - const unsigned long old_error_bitmap = stripe->error_bitmap; + const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); int i; ASSERT(stripe->mirror_num >= 1); ASSERT(atomic_read(&stripe->pending_io) == 0); for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) { - struct page *page; - int pgoff; - int ret; - - page = scrub_stripe_get_page(stripe, i); - pgoff = scrub_stripe_get_page_offset(stripe, i); - /* The current sector cannot be merged, submit the bio. */ - if (bbio && ((i > 0 && !test_bit(i - 1, &stripe->error_bitmap)) || + if (bbio && ((i > 0 && !test_bit(i - 1, &old_error_bitmap)) || bbio->bio.bi_iter.bi_size >= blocksize)) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); @@ -851,8 +958,7 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, (i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT; } - ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); - ASSERT(ret == fs_info->sectorsize); + scrub_bio_add_sector(bbio, stripe, i); } if (bbio) { ASSERT(bbio->bio.bi_iter.bi_size); @@ -864,12 +970,15 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, } static void scrub_stripe_report_errors(struct scrub_ctx *sctx, - struct scrub_stripe *stripe) + struct scrub_stripe *stripe, + const struct scrub_error_records *errors) { static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_device *dev = NULL; + const unsigned long extent_bitmap = scrub_bitmap_read_has_extent(stripe); + const unsigned long error_bitmap = scrub_bitmap_read_error(stripe); u64 physical = 0; int nr_data_sectors = 0; int nr_meta_sectors = 0; @@ -886,7 +995,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio() * thus no need for dev/physical, error reporting still needs dev and physical. */ - if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) { + if (!bitmap_empty(&errors->init_error_bitmap, stripe->nr_sectors)) { u64 mapped_len = fs_info->sectorsize; struct btrfs_io_context *bioc = NULL; int stripe_index = stripe->mirror_num - 1; @@ -909,10 +1018,10 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, } skip: - for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) { + for_each_set_bit(sector_nr, &extent_bitmap, stripe->nr_sectors) { bool repaired = false; - if (stripe->sectors[sector_nr].is_metadata) { + if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) { nr_meta_sectors++; } else { nr_data_sectors++; @@ -920,14 +1029,14 @@ skip: nr_nodatacsum_sectors++; } - if (test_bit(sector_nr, &stripe->init_error_bitmap) && - !test_bit(sector_nr, &stripe->error_bitmap)) { + if (test_bit(sector_nr, &errors->init_error_bitmap) && + !test_bit(sector_nr, &error_bitmap)) { nr_repaired_sectors++; repaired = true; } /* Good sector from the beginning, nothing need to be done. 
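For the subpage case the address arithmetic in scrub_stripe_get_kaddr() is worth spelling out with numbers; the values below are only an example (4K pages, 2K fs blocks):

/*
 * sector_nr = 5, sectorsize_bits = 11 (2K blocks), PAGE_SIZE = 4K:
 *
 *   offset         = 5 << 11             = 10240
 *   page index     = 10240 >> PAGE_SHIFT = 2        -> stripe->pages[2]
 *   offset_in_page = 10240 & (4096 - 1)  = 2048
 *
 * so the helper returns page_address(stripe->pages[2]) + 2048.  With 4K
 * blocks on 4K pages the mapping degenerates to one sector per page at
 * offset 0.  Because neighbouring sectors often share a page, the new
 * scrub_bio_add_sector() goes through bio_add_page(), which merges
 * contiguous ranges into a single bio_vec, rather than __bio_add_page(),
 * which never merges.
 */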
*/ - if (!test_bit(sector_nr, &stripe->init_error_bitmap)) + if (!test_bit(sector_nr, &errors->init_error_bitmap)) continue; /* @@ -960,31 +1069,46 @@ skip: stripe->logical, stripe->mirror_num); } - if (test_bit(sector_nr, &stripe->io_error_bitmap)) + if (scrub_bitmap_test_bit_io_error(stripe, sector_nr)) if (__ratelimit(&rs) && dev) scrub_print_common_warning("i/o error", dev, false, stripe->logical, physical); - if (test_bit(sector_nr, &stripe->csum_error_bitmap)) + if (scrub_bitmap_test_bit_csum_error(stripe, sector_nr)) if (__ratelimit(&rs) && dev) scrub_print_common_warning("checksum error", dev, false, stripe->logical, physical); - if (test_bit(sector_nr, &stripe->meta_error_bitmap)) + if (scrub_bitmap_test_bit_meta_error(stripe, sector_nr)) if (__ratelimit(&rs) && dev) scrub_print_common_warning("header error", dev, false, stripe->logical, physical); + if (scrub_bitmap_test_bit_meta_gen_error(stripe, sector_nr)) + if (__ratelimit(&rs) && dev) + scrub_print_common_warning("generation error", dev, false, + stripe->logical, physical); } + /* Update the device stats. */ + for (int i = 0; i < errors->nr_io_errors; i++) + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS); + for (int i = 0; i < errors->nr_csum_errors; i++) + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); + /* Generation mismatch error is based on each metadata, not each block. */ + for (int i = 0; i < errors->nr_meta_gen_errors; + i += (fs_info->nodesize >> fs_info->sectorsize_bits)) + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_GENERATION_ERRS); + spin_lock(&sctx->stat_lock); sctx->stat.data_extents_scrubbed += stripe->nr_data_extents; sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents; sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits; sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits; sctx->stat.no_csum += nr_nodatacsum_sectors; - sctx->stat.read_errors += stripe->init_nr_io_errors; - sctx->stat.csum_errors += stripe->init_nr_csum_errors; - sctx->stat.verify_errors += stripe->init_nr_meta_errors; + sctx->stat.read_errors += errors->nr_io_errors; + sctx->stat.csum_errors += errors->nr_csum_errors; + sctx->stat.verify_errors += errors->nr_meta_errors + + errors->nr_meta_gen_errors; sctx->stat.uncorrectable_errors += - bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors); + bitmap_weight(&error_bitmap, stripe->nr_sectors); sctx->stat.corrected_errors += nr_repaired_sectors; spin_unlock(&sctx->stat_lock); } @@ -1010,26 +1134,26 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work); struct scrub_ctx *sctx = stripe->sctx; struct btrfs_fs_info *fs_info = sctx->fs_info; + struct scrub_error_records errors = { 0 }; int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, stripe->bg->length); unsigned long repaired; + unsigned long error; int mirror; int i; ASSERT(stripe->mirror_num > 0); wait_scrub_stripe_io(stripe); - scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap); + scrub_verify_one_stripe(stripe, scrub_bitmap_read_has_extent(stripe)); /* Save the initial failed bitmap for later repair and report usage. 
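The device statistics update above counts read and checksum errors per sector but generation mismatches per tree block; a worked example with assumed sizes (nodesize 16K, sectorsize 4K) makes the stepping explicit:

/*
 * nodesize >> sectorsize_bits = 16K / 4K = 4 sectors per tree block.
 *
 * If two tree blocks in the stripe carry a stale generation, every one of
 * their sectors is flagged, so nr_meta_gen_errors = 2 * 4 = 8.  The loop
 * advances i by 4 each round (i = 0, 4), so BTRFS_DEV_STAT_GENERATION_ERRS
 * is incremented twice, once per tree block rather than once per sector.
 * nr_io_errors and nr_csum_errors, by contrast, bump their counters once
 * per affected sector.
 */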
*/ - stripe->init_error_bitmap = stripe->error_bitmap; - stripe->init_nr_io_errors = bitmap_weight(&stripe->io_error_bitmap, - stripe->nr_sectors); - stripe->init_nr_csum_errors = bitmap_weight(&stripe->csum_error_bitmap, - stripe->nr_sectors); - stripe->init_nr_meta_errors = bitmap_weight(&stripe->meta_error_bitmap, - stripe->nr_sectors); - - if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) + errors.init_error_bitmap = scrub_bitmap_read_error(stripe); + errors.nr_io_errors = scrub_bitmap_weight_io_error(stripe); + errors.nr_csum_errors = scrub_bitmap_weight_csum_error(stripe); + errors.nr_meta_errors = scrub_bitmap_weight_meta_error(stripe); + errors.nr_meta_gen_errors = scrub_bitmap_weight_meta_gen_error(stripe); + + if (bitmap_empty(&errors.init_error_bitmap, stripe->nr_sectors)) goto out; /* @@ -1041,13 +1165,13 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) for (mirror = calc_next_mirror(stripe->mirror_num, num_copies); mirror != stripe->mirror_num; mirror = calc_next_mirror(mirror, num_copies)) { - const unsigned long old_error_bitmap = stripe->error_bitmap; + const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); scrub_stripe_submit_repair_read(stripe, mirror, BTRFS_STRIPE_LEN, false); wait_scrub_stripe_io(stripe); scrub_verify_one_stripe(stripe, old_error_bitmap); - if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) + if (scrub_bitmap_empty_error(stripe)) goto out; } @@ -1065,21 +1189,22 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) for (i = 0, mirror = stripe->mirror_num; i < num_copies; i++, mirror = calc_next_mirror(mirror, num_copies)) { - const unsigned long old_error_bitmap = stripe->error_bitmap; + const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); scrub_stripe_submit_repair_read(stripe, mirror, fs_info->sectorsize, true); wait_scrub_stripe_io(stripe); scrub_verify_one_stripe(stripe, old_error_bitmap); - if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) + if (scrub_bitmap_empty_error(stripe)) goto out; } out: + error = scrub_bitmap_read_error(stripe); /* * Submit the repaired sectors. For zoned case, we cannot do repair * in-place, but queue the bg to be relocated. 
*/ - bitmap_andnot(&repaired, &stripe->init_error_bitmap, &stripe->error_bitmap, + bitmap_andnot(&repaired, &errors.init_error_bitmap, &error, stripe->nr_sectors); if (!sctx->readonly && !bitmap_empty(&repaired, stripe->nr_sectors)) { if (btrfs_is_zoned(fs_info)) { @@ -1090,7 +1215,7 @@ out: } } - scrub_stripe_report_errors(sctx, stripe); + scrub_stripe_report_errors(sctx, stripe, &errors); set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state); wake_up(&stripe->repair_wait); } @@ -1110,10 +1235,10 @@ static void scrub_read_endio(struct btrfs_bio *bbio) num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits; if (bbio->bio.bi_status) { - bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors); - bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors); + scrub_bitmap_set_io_error(stripe, sector_nr, num_sectors); + scrub_bitmap_set_error(stripe, sector_nr, num_sectors); } else { - bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors); + scrub_bitmap_clear_io_error(stripe, sector_nr, num_sectors); } bio_put(&bbio->bio); if (atomic_dec_and_test(&stripe->pending_io)) { @@ -1142,6 +1267,9 @@ static void scrub_write_endio(struct btrfs_bio *bbio) bitmap_set(&stripe->write_error_bitmap, sector_nr, bio_size >> fs_info->sectorsize_bits); spin_unlock_irqrestore(&stripe->write_error_lock, flags); + for (int i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++) + btrfs_dev_stat_inc_and_print(stripe->dev, + BTRFS_DEV_STAT_WRITE_ERRS); } bio_put(&bbio->bio); @@ -1199,12 +1327,8 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str int sector_nr; for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) { - struct page *page = scrub_stripe_get_page(stripe, sector_nr); - unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr); - int ret; - /* We should only writeback sectors covered by an extent. */ - ASSERT(test_bit(sector_nr, &stripe->extent_sector_bitmap)); + ASSERT(scrub_bitmap_test_bit_has_extent(stripe, sector_nr)); /* Cannot merge with previous sector, submit the current one. 
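The read-repair worker above snapshots the initial error state into an on-stack struct scrub_error_records before retrying the other mirrors, then derives the repaired sectors by comparing that snapshot with the final error bitmap. A minimal sketch of that last step with simplified parameters (the real code reads the packed bitmap through scrub_bitmap_read_error()), assuming nr_sectors fits in one unsigned long as the stripe code already requires:

static unsigned long demo_repaired(unsigned long init_errors,
				   unsigned long final_errors,
				   unsigned int nr_sectors)
{
	unsigned long repaired;

	/*
	 * A sector counts as repaired when it failed the initial read (bit
	 * set in init_errors) but verified fine after the retries (bit clear
	 * in final_errors): repaired = init_errors & ~final_errors.
	 */
	bitmap_andnot(&repaired, &init_errors, &final_errors, nr_sectors);
	return repaired;
}

Only those bits are written back (or, on zoned filesystems, the block group is queued for relocation instead), and the same per-sector accounting feeds the corrected_errors counter in scrub_stripe_report_errors().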
*/ if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) { @@ -1218,8 +1342,7 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str (sector_nr << fs_info->sectorsize_bits)) >> SECTOR_SHIFT; } - ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); - ASSERT(ret == fs_info->sectorsize); + scrub_bio_add_sector(bbio, stripe, sector_nr); } if (bbio) scrub_submit_write_bio(sctx, stripe, bbio, dev_replace); @@ -1380,11 +1503,11 @@ static int find_first_extent_item(struct btrfs_root *extent_root, if (path->nodes[0]) goto search_forward; + key.objectid = search_start; if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; - key.objectid = search_start; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); @@ -1493,9 +1616,9 @@ static void fill_one_extent_info(struct btrfs_fs_info *fs_info, struct scrub_sector_verification *sector = &stripe->sectors[nr_sector]; - set_bit(nr_sector, &stripe->extent_sector_bitmap); + scrub_bitmap_set_bit_has_extent(stripe, nr_sector); if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - sector->is_metadata = true; + scrub_bitmap_set_bit_is_metadata(stripe, nr_sector); sector->generation = extent_gen; } } @@ -1503,15 +1626,8 @@ static void fill_one_extent_info(struct btrfs_fs_info *fs_info, static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe) { - stripe->extent_sector_bitmap = 0; - stripe->init_error_bitmap = 0; - stripe->init_nr_io_errors = 0; - stripe->init_nr_csum_errors = 0; - stripe->init_nr_meta_errors = 0; - stripe->error_bitmap = 0; - stripe->io_error_bitmap = 0; - stripe->csum_error_bitmap = 0; - stripe->meta_error_bitmap = 0; + ASSERT(stripe->nr_sectors); + bitmap_zero(stripe->bitmaps, scrub_bitmap_nr_last * stripe->nr_sectors); } /* @@ -1541,8 +1657,8 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, u64 extent_gen; int ret; - if (unlikely(!extent_root)) { - btrfs_err(fs_info, "no valid extent root for scrub"); + if (unlikely(!extent_root || !csum_root)) { + btrfs_err(fs_info, "no valid extent or csum root for scrub"); return -EUCLEAN; } memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * @@ -1646,7 +1762,6 @@ static void scrub_reset_stripe(struct scrub_stripe *stripe) stripe->state = 0; for (int i = 0; i < stripe->nr_sectors; i++) { - stripe->sectors[i].is_metadata = false; stripe->sectors[i].csum = NULL; stripe->sectors[i].generation = 0; } @@ -1665,24 +1780,21 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; + const unsigned long has_extent = scrub_bitmap_read_has_extent(stripe); u64 stripe_len = BTRFS_STRIPE_LEN; int mirror = stripe->mirror_num; int i; atomic_inc(&stripe->pending_io); - for_each_set_bit(i, &stripe->extent_sector_bitmap, stripe->nr_sectors) { - struct page *page = scrub_stripe_get_page(stripe, i); - unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i); - + for_each_set_bit(i, &has_extent, stripe->nr_sectors) { /* We're beyond the chunk boundary, no need to read anymore. */ if (i >= nr_sectors) break; /* The current sector cannot be merged, submit the bio. 
*/ if (bbio && - ((i > 0 && - !test_bit(i - 1, &stripe->extent_sector_bitmap)) || + ((i > 0 && !test_bit(i - 1, &has_extent)) || bbio->bio.bi_iter.bi_size >= stripe_len)) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); @@ -1716,8 +1828,8 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) * the extent tree, then it's a preallocated * extent and not an error. */ - set_bit(i, &stripe->io_error_bitmap); - set_bit(i, &stripe->error_bitmap); + scrub_bitmap_set_bit_io_error(stripe, i); + scrub_bitmap_set_bit_error(stripe, i); } continue; } @@ -1727,7 +1839,7 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; } - __bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); + scrub_bio_add_sector(bbio, stripe, i); } if (bbio) { @@ -1765,15 +1877,8 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; /* Read the whole range inside the chunk boundary. */ - for (unsigned int cur = 0; cur < nr_sectors; cur++) { - struct page *page = scrub_stripe_get_page(stripe, cur); - unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur); - int ret; - - ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); - /* We should have allocated enough bio vectors. */ - ASSERT(ret == fs_info->sectorsize); - } + for (unsigned int cur = 0; cur < nr_sectors; cur++) + scrub_bio_add_sector(bbio, stripe, cur); atomic_inc(&stripe->pending_io); /* @@ -1794,10 +1899,11 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, static bool stripe_has_metadata_error(struct scrub_stripe *stripe) { + const unsigned long error = scrub_bitmap_read_error(stripe); int i; - for_each_set_bit(i, &stripe->error_bitmap, stripe->nr_sectors) { - if (stripe->sectors[i].is_metadata) { + for_each_set_bit(i, &error, stripe->nr_sectors) { + if (scrub_bitmap_test_bit_is_metadata(stripe, i)) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; btrfs_err(fs_info, @@ -1872,13 +1978,16 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) } for (int i = 0; i < nr_stripes; i++) { unsigned long good; + unsigned long has_extent; + unsigned long error; stripe = &sctx->stripes[i]; ASSERT(stripe->dev == fs_info->dev_replace.srcdev); - bitmap_andnot(&good, &stripe->extent_sector_bitmap, - &stripe->error_bitmap, stripe->nr_sectors); + has_extent = scrub_bitmap_read_has_extent(stripe); + error = scrub_bitmap_read_error(stripe); + bitmap_andnot(&good, &has_extent, &error, stripe->nr_sectors); scrub_write_sectors(sctx, stripe, good, true); } } @@ -2012,7 +2121,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, /* Check if all data stripes are empty. */ for (int i = 0; i < data_stripes; i++) { stripe = &sctx->raid56_data_stripes[i]; - if (!bitmap_empty(&stripe->extent_sector_bitmap, stripe->nr_sectors)) { + if (!scrub_bitmap_empty_has_extent(stripe)) { all_empty = false; break; } @@ -2044,15 +2153,18 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, */ for (int i = 0; i < data_stripes; i++) { unsigned long error; + unsigned long has_extent; stripe = &sctx->raid56_data_stripes[i]; + error = scrub_bitmap_read_error(stripe); + has_extent = scrub_bitmap_read_has_extent(stripe); + /* * We should only check the errors where there is an extent. * As we may hit an empty data stripe while it's missing. 
*/ - bitmap_and(&error, &stripe->error_bitmap, - &stripe->extent_sector_bitmap, stripe->nr_sectors); + bitmap_and(&error, &error, &has_extent, stripe->nr_sectors); if (!bitmap_empty(&error, stripe->nr_sectors)) { btrfs_err(fs_info, "unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", @@ -2061,8 +2173,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, ret = -EIO; goto out; } - bitmap_or(&extent_bitmap, &extent_bitmap, - &stripe->extent_sector_bitmap, stripe->nr_sectors); + bitmap_or(&extent_bitmap, &extent_bitmap, &has_extent, + stripe->nr_sectors); } /* Now we can check and regenerate the P/Q stripe. */ @@ -2497,8 +2609,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, path->skip_locking = 1; key.objectid = scrub_dev->devid; - key.offset = 0ull; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = 0ull; while (1) { u64 dev_extent_len; @@ -2770,17 +2882,11 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, struct page *page, u64 physical, u64 generation) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct bio_vec bvec; - struct bio bio; struct btrfs_super_block *sb = page_address(page); int ret; - bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ); - bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT; - __bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0); - ret = submit_bio_wait(&bio); - bio_uninit(&bio); - + ret = bdev_rw_virt(dev->bdev, physical >> SECTOR_SHIFT, sb, + BTRFS_SUPER_INFO_SIZE, REQ_OP_READ); if (ret < 0) return ret; ret = btrfs_check_super_csum(fs_info, sb); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f437138fefbc..2891ec4056c6 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -16,7 +16,6 @@ #include <linux/compat.h> #include <linux/crc32c.h> #include <linux/fsverity.h> - #include "send.h" #include "ctree.h" #include "backref.h" @@ -178,6 +177,7 @@ struct send_ctx { u64 cur_inode_rdev; u64 cur_inode_last_extent; u64 cur_inode_next_write_offset; + struct fs_path cur_inode_path; bool cur_inode_new; bool cur_inode_new_gen; bool cur_inode_deleted; @@ -383,11 +383,11 @@ static void inconsistent_snapshot_error(struct send_ctx *sctx, result_string = "updated"; break; case BTRFS_COMPARE_TREE_SAME: - ASSERT(0); + DEBUG_WARN("no change between trees"); result_string = "unchanged"; break; default: - ASSERT(0); + DEBUG_WARN("unexpected comparison result %d", result); result_string = "unexpected"; } @@ -425,15 +425,21 @@ static int need_send_hole(struct send_ctx *sctx) static void fs_path_reset(struct fs_path *p) { - if (p->reversed) { + if (p->reversed) p->start = p->buf + p->buf_len - 1; - p->end = p->start; - *p->start = 0; - } else { + else p->start = p->buf; - p->end = p->start; - *p->start = 0; - } + + p->end = p->start; + *p->start = 0; +} + +static void init_path(struct fs_path *p) +{ + p->reversed = 0; + p->buf = p->inline_buf; + p->buf_len = FS_PATH_INLINE_SIZE; + fs_path_reset(p); } static struct fs_path *fs_path_alloc(void) @@ -443,10 +449,7 @@ static struct fs_path *fs_path_alloc(void) p = kmalloc(sizeof(*p), GFP_KERNEL); if (!p) return NULL; - p->reversed = 0; - p->buf = p->inline_buf; - p->buf_len = FS_PATH_INLINE_SIZE; - fs_path_reset(p); + init_path(p); return p; } @@ -471,7 +474,7 @@ static void fs_path_free(struct fs_path *p) kfree(p); } -static int fs_path_len(struct fs_path *p) +static inline int fs_path_len(const struct fs_path *p) { return p->end - p->start; } @@ -487,12 +490,10 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) if (p->buf_len >= len) return 0; - if (len 
> PATH_MAX) { - WARN_ON(1); - return -ENOMEM; - } + if (WARN_ON(len > PATH_MAX)) + return -ENAMETOOLONG; - path_len = p->end - p->start; + path_len = fs_path_len(p); old_buf_len = p->buf_len; /* @@ -533,12 +534,12 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len, int ret; int new_len; - new_len = p->end - p->start + name_len; + new_len = fs_path_len(p) + name_len; if (p->start != p->end) new_len++; ret = fs_path_ensure_buf(p, new_len); if (ret < 0) - goto out; + return ret; if (p->reversed) { if (p->start != p->end) @@ -553,8 +554,7 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len, *p->end = 0; } -out: - return ret; + return 0; } static int fs_path_add(struct fs_path *p, const char *name, int name_len) @@ -564,25 +564,15 @@ static int fs_path_add(struct fs_path *p, const char *name, int name_len) ret = fs_path_prepare_for_add(p, name_len, &prepared); if (ret < 0) - goto out; + return ret; memcpy(prepared, name, name_len); -out: - return ret; + return 0; } -static int fs_path_add_path(struct fs_path *p, struct fs_path *p2) +static inline int fs_path_add_path(struct fs_path *p, const struct fs_path *p2) { - int ret; - char *prepared; - - ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared); - if (ret < 0) - goto out; - memcpy(prepared, p2->start, p2->end - p2->start); - -out: - return ret; + return fs_path_add(p, p2->start, fs_path_len(p2)); } static int fs_path_add_from_extent_buffer(struct fs_path *p, @@ -594,12 +584,11 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p, ret = fs_path_prepare_for_add(p, len, &prepared); if (ret < 0) - goto out; + return ret; read_extent_buffer(eb, prepared, off, len); -out: - return ret; + return 0; } static int fs_path_copy(struct fs_path *p, struct fs_path *from) @@ -619,13 +608,21 @@ static void fs_path_unreverse(struct fs_path *p) return; tmp = p->start; - len = p->end - p->start; + len = fs_path_len(p); p->start = p->buf; p->end = p->start + len; memmove(p->start, tmp, len + 1); p->reversed = 0; } +static inline bool is_current_inode_path(const struct send_ctx *sctx, + const struct fs_path *path) +{ + const struct fs_path *cur = &sctx->cur_inode_path; + + return (strncmp(path->start, cur->start, fs_path_len(cur)) == 0); +} + static struct btrfs_path *alloc_path_for_send(void) { struct btrfs_path *path; @@ -740,7 +737,7 @@ static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, #define TLV_PUT_PATH(sctx, attrtype, p) \ do { \ ret = tlv_put_string(sctx, attrtype, p->start, \ - p->end - p->start); \ + fs_path_len((p))); \ if (ret < 0) \ goto tlv_put_failure; \ } while(0) @@ -819,14 +816,11 @@ static int send_cmd(struct send_ctx *sctx) static int send_rename(struct send_ctx *sctx, struct fs_path *from, struct fs_path *to) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret; - btrfs_debug(fs_info, "send_rename %s -> %s", from->start, to->start); - ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from); TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to); @@ -834,7 +828,6 @@ static int send_rename(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -844,14 +837,11 @@ out: static int send_link(struct send_ctx *sctx, struct fs_path *path, struct fs_path *lnk) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret; - btrfs_debug(fs_info, "send_link %s -> %s", path->start, lnk->start); - ret = begin_cmd(sctx, BTRFS_SEND_C_LINK); if (ret < 0) - goto out; + 
return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk); @@ -859,7 +849,6 @@ static int send_link(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -868,21 +857,17 @@ out: */ static int send_unlink(struct send_ctx *sctx, struct fs_path *path) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret; - btrfs_debug(fs_info, "send_unlink %s", path->start); - ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -891,21 +876,17 @@ out: */ static int send_rmdir(struct send_ctx *sctx, struct fs_path *path) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret; - btrfs_debug(fs_info, "send_rmdir %s", path->start); - ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -1580,7 +1561,6 @@ static int find_extent_clone(struct send_ctx *sctx, struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret; int extent_type; - u64 logical; u64 disk_byte; u64 num_bytes; struct btrfs_file_extent_item *fi; @@ -1611,7 +1591,6 @@ static int find_extent_clone(struct send_ctx *sctx, compressed = btrfs_file_extent_compression(eb, fi); num_bytes = btrfs_file_extent_num_bytes(eb, fi); - logical = disk_byte + btrfs_file_extent_offset(eb, fi); /* * Setup the clone roots. @@ -1693,14 +1672,8 @@ static int find_extent_clone(struct send_ctx *sctx, } up_read(&fs_info->commit_root_sem); - btrfs_debug(fs_info, - "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu", - data_offset, ino, num_bytes, logical); - - if (!backref_ctx.found) { - btrfs_debug(fs_info, "no clones found"); + if (!backref_ctx.found) return -ENOENT; - } cur_clone_root = NULL; for (i = 0; i < sctx->clone_roots_cnt; i++) { @@ -1897,7 +1870,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, ret = get_inode_info(sctx->send_root, ino, &info); if (ret < 0 && ret != -ENOENT) - goto out; + return ret; left_ret = (info.nlink == 0) ? -ENOENT : ret; left_gen = info.gen; if (send_gen) @@ -1908,7 +1881,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, } else { ret = get_inode_info(sctx->parent_root, ino, &info); if (ret < 0 && ret != -ENOENT) - goto out; + return ret; right_ret = (info.nlink == 0) ? 
-ENOENT : ret; right_gen = info.gen; if (parent_gen) @@ -1953,7 +1926,6 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, ret = -ENOENT; } -out: return ret; } @@ -1967,17 +1939,14 @@ static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen, ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen); if (ret < 0) - goto out; + return ret; if (ret == inode_state_no_change || ret == inode_state_did_create || ret == inode_state_will_delete) - ret = 1; - else - ret = 0; + return 1; -out: - return ret; + return 0; } /* @@ -2326,9 +2295,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, *parent_gen = nce->parent_gen; ret = fs_path_add(dest, nce->name, nce->name_len); if (ret < 0) - goto out; - ret = nce->ret; - goto out; + return ret; + return nce->ret; } } @@ -2339,12 +2307,12 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, */ ret = is_inode_existent(sctx, ino, gen, NULL, NULL); if (ret < 0) - goto out; + return ret; if (!ret) { ret = gen_unique_name(sctx, ino, gen, dest); if (ret < 0) - goto out; + return ret; ret = 1; goto out_cache; } @@ -2360,21 +2328,21 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, ret = get_first_ref(sctx->parent_root, ino, parent_ino, parent_gen, dest); if (ret < 0) - goto out; + return ret; /* * Check if the ref was overwritten by an inode's ref that was processed * earlier. If yes, treat as orphan and return 1. */ ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen, - dest->start, dest->end - dest->start); + dest->start, fs_path_len(dest)); if (ret < 0) - goto out; + return ret; if (ret) { fs_path_reset(dest); ret = gen_unique_name(sctx, ino, gen, dest); if (ret < 0) - goto out; + return ret; ret = 1; } @@ -2383,10 +2351,8 @@ out_cache: * Store the result of the lookup in the name cache. 
*/ nce = kmalloc(sizeof(*nce) + fs_path_len(dest), GFP_KERNEL); - if (!nce) { - ret = -ENOMEM; - goto out; - } + if (!nce) + return -ENOMEM; nce->entry.key = ino; nce->entry.gen = gen; @@ -2404,10 +2370,9 @@ out_cache: nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL); if (nce_ret < 0) { kfree(nce); - ret = nce_ret; + return nce_ret; } -out: return ret; } @@ -2444,6 +2409,14 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, u64 parent_inode = 0; u64 parent_gen = 0; int stop = 0; + const bool is_cur_inode = (ino == sctx->cur_ino && gen == sctx->cur_inode_gen); + + if (is_cur_inode && fs_path_len(&sctx->cur_inode_path) > 0) { + if (dest != &sctx->cur_inode_path) + return fs_path_copy(dest, &sctx->cur_inode_path); + + return 0; + } name = fs_path_alloc(); if (!name) { @@ -2495,8 +2468,12 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, out: fs_path_free(name); - if (!ret) + if (!ret) { fs_path_unreverse(dest); + if (is_cur_inode && dest != &sctx->cur_inode_path) + ret = fs_path_copy(&sctx->cur_inode_path, dest); + } + return ret; } @@ -2591,25 +2568,60 @@ out: return ret; } +static struct fs_path *get_cur_inode_path(struct send_ctx *sctx) +{ + if (fs_path_len(&sctx->cur_inode_path) == 0) { + int ret; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, + &sctx->cur_inode_path); + if (ret < 0) + return ERR_PTR(ret); + } + + return &sctx->cur_inode_path; +} + +static struct fs_path *get_path_for_command(struct send_ctx *sctx, u64 ino, u64 gen) +{ + struct fs_path *path; + int ret; + + if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen) + return get_cur_inode_path(sctx); + + path = fs_path_alloc(); + if (!path) + return ERR_PTR(-ENOMEM); + + ret = get_cur_path(sctx, ino, gen, path); + if (ret < 0) { + fs_path_free(path); + return ERR_PTR(ret); + } + + return path; +} + +static void free_path_for_command(const struct send_ctx *sctx, struct fs_path *path) +{ + if (path != &sctx->cur_inode_path) + fs_path_free(path); +} + static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; - btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size); - - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size); @@ -2617,29 +2629,23 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); return ret; } static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; - btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode); - - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777); @@ -2647,32 +2653,26 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) tlv_put_failure: out: - fs_path_free(p); + 
free_path_for_command(sctx, p); return ret; } static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; if (sctx->proto < 2) return 0; - btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr); - - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr); @@ -2680,30 +2680,23 @@ static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); return ret; } static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; - btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu", - ino, uid, gid); - - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid); TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid); @@ -2712,13 +2705,12 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); return ret; } static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p = NULL; struct btrfs_inode_item *ii; @@ -2727,11 +2719,9 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) struct btrfs_key key; int slot; - btrfs_debug(fs_info, "send_utimes %llu", ino); - - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); path = alloc_path_for_send(); if (!path) { @@ -2756,9 +2746,6 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime); @@ -2770,7 +2757,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); btrfs_free_path(path); return ret; } @@ -2838,7 +2825,6 @@ static int trim_dir_utimes_cache(struct send_ctx *sctx) */ static int send_create_inode(struct send_ctx *sctx, u64 ino) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; int cmd; @@ -2847,8 +2833,6 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) u64 mode; u64 rdev; - btrfs_debug(fs_info, "send_create_inode %llu", ino); - p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3075,7 +3059,7 @@ static void __free_recorded_refs(struct list_head *head) struct recorded_ref *cur; while (!list_empty(head)) { - cur = list_entry(head->next, struct recorded_ref, list); + cur = list_first_entry(head, struct recorded_ref, list); recorded_ref_free(cur); } } @@ 
-3106,6 +3090,11 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen, goto out; ret = send_rename(sctx, path, orphan); + if (ret < 0) + goto out; + + if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen) + ret = fs_path_copy(&sctx->cur_inode_path, orphan); out: fs_path_free(orphan); @@ -4158,6 +4147,23 @@ out: return ret; } +static int rename_current_inode(struct send_ctx *sctx, + struct fs_path *current_path, + struct fs_path *new_path) +{ + int ret; + + ret = send_rename(sctx, current_path, new_path); + if (ret < 0) + return ret; + + ret = fs_path_copy(&sctx->cur_inode_path, new_path); + if (ret < 0) + return ret; + + return fs_path_copy(current_path, new_path); +} + /* * This does all the move/link/unlink/rmdir magic. */ @@ -4172,15 +4178,13 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) u64 ow_inode = 0; u64 ow_gen; u64 ow_mode; - int did_overwrite = 0; - int is_orphan = 0; u64 last_dir_ino_rm = 0; + bool did_overwrite = false; + bool is_orphan = false; bool can_rename = true; bool orphanized_dir = false; bool orphanized_ancestor = false; - btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino); - /* * This should never happen as the root dir always has the same ref * which is always '..' @@ -4216,14 +4220,14 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (ret < 0) goto out; if (ret) - did_overwrite = 1; + did_overwrite = true; } if (sctx->cur_inode_new || did_overwrite) { ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, valid_path); if (ret < 0) goto out; - is_orphan = 1; + is_orphan = true; } else { ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, valid_path); @@ -4348,6 +4352,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (ret > 0) { orphanized_ancestor = true; fs_path_reset(valid_path); + fs_path_reset(&sctx->cur_inode_path); ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, valid_path); @@ -4443,13 +4448,10 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * it depending on the inode mode. */ if (is_orphan && can_rename) { - ret = send_rename(sctx, valid_path, cur->full_path); - if (ret < 0) - goto out; - is_orphan = 0; - ret = fs_path_copy(valid_path, cur->full_path); + ret = rename_current_inode(sctx, valid_path, cur->full_path); if (ret < 0) goto out; + is_orphan = false; } else if (can_rename) { if (S_ISDIR(sctx->cur_inode_mode)) { /* @@ -4457,10 +4459,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * dirs, we always have one new and one deleted * ref. The deleted ref is ignored later. */ - ret = send_rename(sctx, valid_path, - cur->full_path); - if (!ret) - ret = fs_path_copy(valid_path, + ret = rename_current_inode(sctx, valid_path, cur->full_path); if (ret < 0) goto out; @@ -4507,7 +4506,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) sctx->cur_inode_gen, valid_path); if (ret < 0) goto out; - is_orphan = 1; + is_orphan = true; } list_for_each_entry(cur, &sctx->deleted_refs, list) { @@ -4520,8 +4519,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) /* * We have a moved dir. 
Add the old parent to check_dirs */ - cur = list_entry(sctx->deleted_refs.next, struct recorded_ref, - list); + cur = list_first_entry(&sctx->deleted_refs, struct recorded_ref, list); ret = dup_ref(cur, &check_dirs); if (ret < 0) goto out; @@ -4553,6 +4551,8 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) ret = send_unlink(sctx, cur->full_path); if (ret < 0) goto out; + if (is_current_inode_path(sctx, cur->full_path)) + fs_path_reset(&sctx->cur_inode_path); } ret = dup_ref(cur, &check_dirs); if (ret < 0) @@ -4701,7 +4701,7 @@ out: static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { - int ret = 0; + int ret; struct send_ctx *sctx = ctx; struct rb_node *node = NULL; struct recorded_ref data; @@ -4710,7 +4710,7 @@ static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) ret = get_inode_gen(sctx->send_root, dir, &dir_gen); if (ret < 0) - goto out; + return ret; data.dir = dir; data.dir_gen = dir_gen; @@ -4724,13 +4724,13 @@ static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) &sctx->new_refs, name, dir, dir_gen, sctx); } -out: + return ret; } static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { - int ret = 0; + int ret; struct send_ctx *sctx = ctx; struct rb_node *node = NULL; struct recorded_ref data; @@ -4739,7 +4739,7 @@ static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx ret = get_inode_gen(sctx->parent_root, dir, &dir_gen); if (ret < 0) - goto out; + return ret; data.dir = dir; data.dir_gen = dir_gen; @@ -4753,7 +4753,7 @@ static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx &sctx->deleted_refs, name, dir, dir_gen, sctx); } -out: + return ret; } @@ -4764,11 +4764,9 @@ static int record_new_ref(struct send_ctx *sctx) ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, 0, record_new_ref_if_needed, sctx); if (ret < 0) - goto out; - ret = 0; + return ret; -out: - return ret; + return 0; } static int record_deleted_ref(struct send_ctx *sctx) @@ -4779,29 +4777,25 @@ static int record_deleted_ref(struct send_ctx *sctx) sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); if (ret < 0) - goto out; - ret = 0; + return ret; -out: - return ret; + return 0; } static int record_changed_ref(struct send_ctx *sctx) { - int ret = 0; + int ret; ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, 0, record_new_ref_if_needed, sctx); if (ret < 0) - goto out; + return ret; ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); if (ret < 0) - goto out; - ret = 0; + return ret; -out: - return ret; + return 0; } /* @@ -4869,15 +4863,19 @@ out: } static int send_set_xattr(struct send_ctx *sctx, - struct fs_path *path, const char *name, int name_len, const char *data, int data_len) { - int ret = 0; + struct fs_path *path; + int ret; + + path = get_cur_inode_path(sctx); + if (IS_ERR(path)) + return PTR_ERR(path); ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); @@ -4886,7 +4884,6 @@ static int send_set_xattr(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -4894,11 +4891,11 @@ static int send_remove_xattr(struct send_ctx *sctx, struct fs_path *path, const char *name, int name_len) { - int ret = 0; + int ret; ret = begin_cmd(sctx, 
BTRFS_SEND_C_REMOVE_XATTR); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); @@ -4906,7 +4903,6 @@ static int send_remove_xattr(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -4914,19 +4910,13 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, void *ctx) { - int ret; struct send_ctx *sctx = ctx; - struct fs_path *p; struct posix_acl_xattr_header dummy_acl; /* Capabilities are emitted by finish_inode_if_needed */ if (!strncmp(name, XATTR_NAME_CAPS, name_len)) return 0; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - /* * This hack is needed because empty acls are stored as zero byte * data in xattrs. Problem with that is, that receiving these zero byte @@ -4943,48 +4933,27 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, } } - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; - - ret = send_set_xattr(sctx, p, name, name_len, data, data_len); - -out: - fs_path_free(p); - return ret; + return send_set_xattr(sctx, name, name_len, data, data_len); } static int __process_deleted_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, void *ctx) { - int ret; struct send_ctx *sctx = ctx; struct fs_path *p; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); - ret = send_remove_xattr(sctx, p, name, name_len); - -out: - fs_path_free(p); - return ret; + return send_remove_xattr(sctx, p, name, name_len); } static int process_new_xattr(struct send_ctx *sctx) { - int ret = 0; - - ret = iterate_dir_item(sctx->send_root, sctx->left_path, - __process_new_xattr, sctx); - - return ret; + return iterate_dir_item(sctx->send_root, sctx->left_path, + __process_new_xattr, sctx); } static int process_deleted_xattr(struct send_ctx *sctx) @@ -5100,17 +5069,15 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key, static int process_changed_xattr(struct send_ctx *sctx) { - int ret = 0; + int ret; ret = iterate_dir_item(sctx->send_root, sctx->left_path, __process_changed_new_xattr, sctx); if (ret < 0) - goto out; - ret = iterate_dir_item(sctx->parent_root, sctx->right_path, - __process_changed_deleted_xattr, sctx); + return ret; -out: - return ret; + return iterate_dir_item(sctx->parent_root, sctx->right_path, + __process_changed_deleted_xattr, sctx); } static int process_all_new_xattrs(struct send_ctx *sctx) @@ -5157,7 +5124,7 @@ static int send_verity(struct send_ctx *sctx, struct fs_path *path, ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM, @@ -5172,21 +5139,20 @@ static int send_verity(struct send_ctx *sctx, struct fs_path *path, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } static int process_verity(struct send_ctx *sctx) { int ret = 0; - struct inode *inode; + struct btrfs_inode *inode; struct fs_path *p; inode = btrfs_iget(sctx->cur_ino, sctx->send_root); if (IS_ERR(inode)) return PTR_ERR(inode); - ret = btrfs_get_verity_descriptor(inode, NULL, 0); + ret = btrfs_get_verity_descriptor(&inode->vfs_inode, NULL, 0); if (ret < 0) goto iput; @@ -5203,27 
+5169,19 @@ static int process_verity(struct send_ctx *sctx) } } - ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret); + ret = btrfs_get_verity_descriptor(&inode->vfs_inode, sctx->verity_descriptor, ret); if (ret < 0) goto iput; - p = fs_path_alloc(); - if (!p) { - ret = -ENOMEM; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) { + ret = PTR_ERR(p); goto iput; } - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto free_path; ret = send_verity(sctx, p, sctx->verity_descriptor); - if (ret < 0) - goto free_path; - -free_path: - fs_path_free(p); iput: - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -5263,10 +5221,9 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; - struct folio *folio; - pgoff_t index = offset >> PAGE_SHIFT; - pgoff_t last_index; - unsigned pg_offset = offset_in_page(offset); + u64 cur = offset; + const u64 end = offset + len; + const pgoff_t last_index = ((end - 1) >> PAGE_SHIFT); struct address_space *mapping = sctx->cur_inode->i_mapping; int ret; @@ -5274,13 +5231,12 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) if (ret) return ret; - last_index = (offset + len - 1) >> PAGE_SHIFT; - - while (index <= last_index) { - unsigned cur_len = min_t(unsigned, len, - PAGE_SIZE - pg_offset); + while (cur < end) { + pgoff_t index = (cur >> PAGE_SHIFT); + unsigned int cur_len; + unsigned int pg_offset; + struct folio *folio; -again: folio = filemap_lock_folio(mapping, index); if (IS_ERR(folio)) { page_cache_sync_readahead(mapping, @@ -5293,8 +5249,8 @@ again: break; } } - - WARN_ON(folio_order(folio)); + pg_offset = offset_in_folio(folio, cur); + cur_len = min_t(unsigned int, end - cur, folio_size(folio) - pg_offset); if (folio_test_readahead(folio)) page_cache_async_readahead(mapping, &sctx->ra, NULL, folio, @@ -5316,7 +5272,7 @@ again: if (folio->mapping != mapping) { folio_unlock(folio); folio_put(folio); - goto again; + continue; } } @@ -5324,9 +5280,7 @@ again: pg_offset, cur_len); folio_unlock(folio); folio_put(folio); - index++; - pg_offset = 0; - len -= cur_len; + cur += cur_len; sctx->send_size += cur_len; } @@ -5339,35 +5293,26 @@ again: */ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - - btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len); + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) - goto out; - - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); ret = put_file_data(sctx, offset, len); if (ret < 0) - goto out; + return ret; ret = send_cmd(sctx); tlv_put_failure: -out: - fs_path_free(p); return ret; } @@ -5380,12 +5325,12 @@ static int send_clone(struct send_ctx *sctx, { int ret = 0; struct fs_path *p; + struct fs_path *cur_inode_path; u64 gen; - btrfs_debug(sctx->send_root->fs_info, - "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu", - offset, len, btrfs_root_id(clone_root->root), - clone_root->ino, clone_root->offset); + cur_inode_path = get_cur_inode_path(sctx); + if (IS_ERR(cur_inode_path)) + return PTR_ERR(cur_inode_path); 
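Editor's illustration, not part of the patch: the shape a simple send command takes after this series, reconstructed from the send_truncate() and send_chmod() hunks above. get_path_for_command() hands back the cached sctx->cur_inode_path when the command targets the inode currently being processed and resolves a fresh path otherwise; free_path_for_command() only frees paths that are not that shared buffer. The function name send_example_truncate() is hypothetical.

static int send_example_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
{
	struct fs_path *p;
	int ret;

	/* Cached path for the current inode, freshly resolved for any other inode. */
	p = get_path_for_command(sctx, ino, gen);
	if (IS_ERR(p))
		return PTR_ERR(p);

	ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
	if (ret < 0)
		goto out;
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
	ret = send_cmd(sctx);

tlv_put_failure:
out:
	/* A no-op when p is the shared sctx->cur_inode_path. */
	free_path_for_command(sctx, p);
	return ret;
}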
p = fs_path_alloc(); if (!p) @@ -5395,13 +5340,9 @@ static int send_clone(struct send_ctx *sctx, if (ret < 0) goto out; - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; - TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len); - TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, cur_inode_path); if (clone_root->root == sctx->send_root) { ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen); @@ -5452,17 +5393,13 @@ static int send_update_extent(struct send_ctx *sctx, int ret = 0; struct fs_path *p; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT); if (ret < 0) - goto out; - - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); @@ -5471,8 +5408,6 @@ static int send_update_extent(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: - fs_path_free(p); return ret; } @@ -5501,12 +5436,10 @@ static int send_hole(struct send_ctx *sctx, u64 end) if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, end - offset); - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto tlv_put_failure; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); + while (offset < end) { u64 len = min(end - offset, read_size); @@ -5527,7 +5460,6 @@ static int send_hole(struct send_ctx *sctx, u64 end) } sctx->cur_inode_next_write_offset = offset; tlv_put_failure: - fs_path_free(p); return ret; } @@ -5535,9 +5467,7 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, struct btrfs_path *path, u64 offset, u64 len) { - struct btrfs_root *root = sctx->send_root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct inode *inode; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; struct fs_path *fspath; struct extent_buffer *leaf = path->nodes[0]; struct btrfs_key key; @@ -5546,23 +5476,13 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, size_t inline_size; int ret; - inode = btrfs_iget(sctx->cur_ino, root); - if (IS_ERR(inode)) - return PTR_ERR(inode); - - fspath = fs_path_alloc(); - if (!fspath) { - ret = -ENOMEM; - goto out; - } + fspath = get_cur_inode_path(sctx); + if (IS_ERR(fspath)) + return PTR_ERR(fspath); ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); if (ret < 0) - goto out; - - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); - if (ret < 0) - goto out; + return ret; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -5578,12 +5498,12 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, ret = btrfs_encoded_io_compression_from_extent(fs_info, btrfs_file_extent_compression(leaf, ei)); if (ret < 0) - goto out; + return ret; TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); ret = put_data_header(sctx, inline_size); if (ret < 0) - goto out; + return ret; read_extent_buffer(leaf, sctx->send_buf + sctx->send_size, btrfs_file_extent_inline_start(ei), inline_size); sctx->send_size += inline_size; @@ -5591,9 +5511,6 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: - 
fs_path_free(fspath); - iput(inode); return ret; } @@ -5602,7 +5519,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; - struct inode *inode; + struct btrfs_inode *inode; struct fs_path *fspath; struct extent_buffer *leaf = path->nodes[0]; struct btrfs_key key; @@ -5617,9 +5534,9 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, if (IS_ERR(inode)) return PTR_ERR(inode); - fspath = fs_path_alloc(); - if (!fspath) { - ret = -ENOMEM; + fspath = get_cur_inode_path(sctx); + if (IS_ERR(fspath)) { + ret = PTR_ERR(fspath); goto out; } @@ -5627,10 +5544,6 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, if (ret < 0) goto out; - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); - if (ret < 0) - goto out; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); @@ -5672,7 +5585,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * Note that send_buf is a mapping of send_buf_pages, so this is really * reading into send_buf. */ - ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, disk_num_bytes, sctx->send_buf_pages + (data_offset >> PAGE_SHIFT), @@ -5698,8 +5611,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, tlv_put_failure: out: - fs_path_free(fspath); - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -5741,15 +5653,14 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, } if (sctx->cur_inode == NULL) { + struct btrfs_inode *btrfs_inode; struct btrfs_root *root = sctx->send_root; - sctx->cur_inode = btrfs_iget(sctx->cur_ino, root); - if (IS_ERR(sctx->cur_inode)) { - int err = PTR_ERR(sctx->cur_inode); + btrfs_inode = btrfs_iget(sctx->cur_ino, root); + if (IS_ERR(btrfs_inode)) + return PTR_ERR(btrfs_inode); - sctx->cur_inode = NULL; - return err; - } + sctx->cur_inode = &btrfs_inode->vfs_inode; memset(&sctx->ra, 0, sizeof(struct file_ra_state)); file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping); @@ -5828,7 +5739,6 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, */ static int send_capabilities(struct send_ctx *sctx) { - struct fs_path *fspath = NULL; struct btrfs_path *path; struct btrfs_dir_item *di; struct extent_buffer *leaf; @@ -5854,25 +5764,19 @@ static int send_capabilities(struct send_ctx *sctx) leaf = path->nodes[0]; buf_len = btrfs_dir_data_len(leaf, di); - fspath = fs_path_alloc(); buf = kmalloc(buf_len, GFP_KERNEL); - if (!fspath || !buf) { + if (!buf) { ret = -ENOMEM; goto out; } - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); - if (ret < 0) - goto out; - data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di); read_extent_buffer(leaf, buf, data_ptr, buf_len); - ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS, + ret = send_set_xattr(sctx, XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), buf, buf_len); out: kfree(buf); - fs_path_free(fspath); btrfs_free_path(path); return ret; } @@ -6898,6 +6802,7 @@ static int changed_inode(struct send_ctx *sctx, sctx->cur_inode_last_extent = (u64)-1; sctx->cur_inode_next_write_offset = 0; sctx->ignore_cur_inode = false; + fs_path_reset(&sctx->cur_inode_path); /* * Set send_progress 
to current inode. This will tell all get_cur_xxx @@ -8107,10 +8012,9 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root) btrfs_root_id(root), root->dedupe_in_progress); } -long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg) +long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_send_args *arg) { int ret = 0; - struct btrfs_root *send_root = inode->root; struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_root *clone_root; struct send_ctx *sctx = NULL; @@ -8173,6 +8077,7 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a goto out; } + init_path(&sctx->cur_inode_path); INIT_LIST_HEAD(&sctx->new_refs); INIT_LIST_HEAD(&sctx->deleted_refs); @@ -8449,6 +8354,9 @@ out: btrfs_lru_cache_clear(&sctx->dir_created_cache); btrfs_lru_cache_clear(&sctx->dir_utimes_cache); + if (sctx->cur_inode_path.buf != sctx->cur_inode_path.inline_buf) + kfree(sctx->cur_inode_path.buf); + kfree(sctx); } diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 9309886c5ea1..652bb28f63d4 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -11,7 +11,7 @@ #include <linux/sizes.h> #include <linux/align.h> -struct btrfs_inode; +struct btrfs_root; struct btrfs_ioctl_send_args; #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" @@ -182,6 +182,6 @@ enum { __BTRFS_SEND_A_MAX = 35, }; -long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg); +long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_send_args *arg); #endif diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index a341d087567a..d9087aa81b21 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include "linux/spinlock.h" +#include <linux/spinlock.h> #include <linux/minmax.h> #include "misc.h" #include "ctree.h" @@ -50,11 +50,11 @@ * num_bytes we want to reserve. 
* * ->reserve - * space_info->bytes_may_reserve += num_bytes + * space_info->bytes_may_use += num_bytes * * ->extent allocation * Call btrfs_add_reserved_bytes() which does - * space_info->bytes_may_reserve -= num_bytes + * space_info->bytes_may_use -= num_bytes * space_info->bytes_reserved += extent_bytes * * ->insert reference @@ -234,19 +234,11 @@ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, WRITE_ONCE(space_info->chunk_size, chunk_size); } -static int create_space_info(struct btrfs_fs_info *info, u64 flags) +static void init_space_info(struct btrfs_fs_info *info, + struct btrfs_space_info *space_info, u64 flags) { - - struct btrfs_space_info *space_info; - int i; - int ret; - - space_info = kzalloc(sizeof(*space_info), GFP_NOFS); - if (!space_info) - return -ENOMEM; - space_info->fs_info = info; - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++) INIT_LIST_HEAD(&space_info->block_groups[i]); init_rwsem(&space_info->groups_sem); spin_lock_init(&space_info->lock); @@ -257,9 +249,64 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) INIT_LIST_HEAD(&space_info->priority_tickets); space_info->clamp = 1; btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags)); + space_info->subgroup_id = BTRFS_SUB_GROUP_PRIMARY; if (btrfs_is_zoned(info)) space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH; +} + +static int create_space_info_sub_group(struct btrfs_space_info *parent, u64 flags, + enum btrfs_space_info_sub_group id, int index) +{ + struct btrfs_fs_info *fs_info = parent->fs_info; + struct btrfs_space_info *sub_group; + int ret; + + ASSERT(parent->subgroup_id == BTRFS_SUB_GROUP_PRIMARY); + ASSERT(id != BTRFS_SUB_GROUP_PRIMARY); + + sub_group = kzalloc(sizeof(*sub_group), GFP_NOFS); + if (!sub_group) + return -ENOMEM; + + init_space_info(fs_info, sub_group, flags); + parent->sub_group[index] = sub_group; + sub_group->parent = parent; + sub_group->subgroup_id = id; + + ret = btrfs_sysfs_add_space_info_type(fs_info, sub_group); + if (ret) { + kfree(sub_group); + parent->sub_group[index] = NULL; + } + return ret; +} + +static int create_space_info(struct btrfs_fs_info *info, u64 flags) +{ + + struct btrfs_space_info *space_info; + int ret = 0; + + space_info = kzalloc(sizeof(*space_info), GFP_NOFS); + if (!space_info) + return -ENOMEM; + + init_space_info(info, space_info, flags); + + if (btrfs_is_zoned(info)) { + if (flags & BTRFS_BLOCK_GROUP_DATA) + ret = create_space_info_sub_group(space_info, flags, + BTRFS_SUB_GROUP_DATA_RELOC, + 0); + else if (flags & BTRFS_BLOCK_GROUP_METADATA) + ret = create_space_info_sub_group(space_info, flags, + BTRFS_SUB_GROUP_TREELOG, + 0); + + if (ret) + return ret; + } ret = btrfs_sysfs_add_space_info_type(info, space_info); if (ret) @@ -312,31 +359,29 @@ out: void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, struct btrfs_block_group *block_group) { - struct btrfs_space_info *found; + struct btrfs_space_info *space_info = block_group->space_info; int factor, index; factor = btrfs_bg_type_to_factor(block_group->flags); - found = btrfs_find_space_info(info, block_group->flags); - ASSERT(found); - spin_lock(&found->lock); - found->total_bytes += block_group->length; - found->disk_total += block_group->length * factor; - found->bytes_used += block_group->used; - found->disk_used += block_group->used * factor; - found->bytes_readonly += block_group->bytes_super; - btrfs_space_info_update_bytes_zone_unusable(found, 
block_group->zone_unusable); + spin_lock(&space_info->lock); + space_info->total_bytes += block_group->length; + space_info->disk_total += block_group->length * factor; + space_info->bytes_used += block_group->used; + space_info->disk_used += block_group->used * factor; + space_info->bytes_readonly += block_group->bytes_super; + btrfs_space_info_update_bytes_zone_unusable(space_info, block_group->zone_unusable); if (block_group->length > 0) - found->full = 0; - btrfs_try_granting_tickets(info, found); - spin_unlock(&found->lock); + space_info->full = 0; + btrfs_try_granting_tickets(info, space_info); + spin_unlock(&space_info->lock); - block_group->space_info = found; + block_group->space_info = space_info; index = btrfs_bg_flags_to_raid_index(block_group->flags); - down_write(&found->groups_sem); - list_add_tail(&block_group->list, &found->block_groups[index]); - up_write(&found->groups_sem); + down_write(&space_info->groups_sem); + list_add_tail(&block_group->list, &space_info->block_groups[index]); + up_write(&space_info->groups_sem); } struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, @@ -556,8 +601,9 @@ static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info, lockdep_assert_held(&info->lock); /* The free space could be negative in case of overcommit */ - btrfs_info(fs_info, "space_info %s has %lld free, is %sfull", - flag_str, + btrfs_info(fs_info, + "space_info %s (sub-group id %d) has %lld free, is %sfull", + flag_str, info->subgroup_id, (s64)(info->total_bytes - btrfs_space_info_used(info, true)), info->full ? "" : "not "); btrfs_info(fs_info, @@ -812,7 +858,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, ret = PTR_ERR(trans); break; } - ret = btrfs_chunk_alloc(trans, + ret = btrfs_chunk_alloc(trans, space_info, btrfs_get_alloc_profile(fs_info, space_info->flags), (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); @@ -1083,23 +1129,15 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, return (tickets_id != space_info->tickets_id); } -/* - * This is for normal flushers, we can wait all goddamned day if we want to. We - * will loop and continuously try to flush as long as we are making progress. - * We count progress as clearing off tickets each time we have to loop. - */ -static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info) { - struct btrfs_fs_info *fs_info; - struct btrfs_space_info *space_info; + struct btrfs_fs_info *fs_info = space_info->fs_info; u64 to_reclaim; enum btrfs_flush_state flush_state; int commit_cycles = 0; u64 last_tickets_id; enum btrfs_flush_state final_state; - fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); - space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); if (btrfs_is_zoned(fs_info)) final_state = RESET_ZONES; else @@ -1174,6 +1212,25 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } /* + * This is for normal flushers, it can wait as much time as needed. We will + * loop and continuously try to flush as long as we are making progress. We + * count progress as clearing off tickets each time we have to loop. 
+ */ +static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + + fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + do_async_reclaim_metadata_space(space_info); + for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) { + if (space_info->sub_group[i]) + do_async_reclaim_metadata_space(space_info->sub_group[i]); + } +} + +/* * This handles pre-flushing of metadata space before we get to the point that * we need to start blocking threads on tickets. The logic here is different * from the other flush paths because it doesn't rely on tickets to tell us how @@ -1318,16 +1375,12 @@ static const enum btrfs_flush_state data_flush_states[] = { ALLOC_CHUNK_FORCE, }; -static void btrfs_async_reclaim_data_space(struct work_struct *work) +static void do_async_reclaim_data_space(struct btrfs_space_info *space_info) { - struct btrfs_fs_info *fs_info; - struct btrfs_space_info *space_info; + struct btrfs_fs_info *fs_info = space_info->fs_info; u64 last_tickets_id; enum btrfs_flush_state flush_state = 0; - fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work); - space_info = fs_info->data_sinfo; - spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; @@ -1395,6 +1448,19 @@ aborted_fs: spin_unlock(&space_info->lock); } +static void btrfs_async_reclaim_data_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + + fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work); + space_info = fs_info->data_sinfo; + do_async_reclaim_data_space(space_info); + for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) + if (space_info->sub_group[i]) + do_async_reclaim_data_space(space_info->sub_group[i]); +} + void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) { INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space); @@ -1836,10 +1902,10 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * This will reserve bytes from the data space info. If there is not enough * space then we will attempt to flush space as specified by flush. 
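Editor's note, not part of the patch: tying the space_info sub-group pieces together. On a zoned filesystem each primary space_info gains at most one sub-group (BTRFS_SPACE_INFO_SUB_GROUP_MAX, defined as 1 in the space-info.h hunk below): data relocation for data, tree-log for metadata, created by create_space_info_sub_group() above. The async reclaim workers then flush the primary group and every populated sub_group[] slot, as in the loops above. A minimal sketch of that walk; the helper name is hypothetical:

static void for_each_space_info_and_sub_groups(struct btrfs_space_info *primary,
					       void (*fn)(struct btrfs_space_info *si))
{
	fn(primary);
	for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) {
		/* Each sub-group points back to the primary via ->parent. */
		if (primary->sub_group[i])
			fn(primary->sub_group[i]);
	}
}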
*/ -int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, +int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { - struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; + struct btrfs_fs_info *fs_info = space_info->fs_info; int ret; ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA || @@ -1847,12 +1913,12 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, flush == BTRFS_RESERVE_NO_FLUSH); ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); - ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush); + ret = __reserve_bytes(fs_info, space_info, bytes, flush); if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, "space_info:enospc", - data_sinfo->flags, bytes, 1); + space_info->flags, bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0); + btrfs_dump_space_info(fs_info, space_info, bytes, 0); } return ret; } diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index a96efdb5e681..92b7f5e2b850 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -98,8 +98,18 @@ enum btrfs_flush_state { RESET_ZONES = 12, }; +enum btrfs_space_info_sub_group { + BTRFS_SUB_GROUP_PRIMARY, + BTRFS_SUB_GROUP_DATA_RELOC, + BTRFS_SUB_GROUP_TREELOG, +}; + +#define BTRFS_SPACE_INFO_SUB_GROUP_MAX 1 struct btrfs_space_info { struct btrfs_fs_info *fs_info; + struct btrfs_space_info *parent; + struct btrfs_space_info *sub_group[BTRFS_SPACE_INFO_SUB_GROUP_MAX]; + int subgroup_id; spinlock_t lock; u64 total_bytes; /* total bytes in the space, @@ -288,7 +298,7 @@ static inline void btrfs_space_info_free_bytes_may_use( btrfs_try_granting_tickets(space_info->fs_info, space_info); spin_unlock(&space_info->lock); } -int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, +int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush); void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info); void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 722acf768396..d4f019233493 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -2,12 +2,11 @@ #include <linux/slab.h> #include "messages.h" -#include "ctree.h" #include "subpage.h" #include "btrfs_inode.h" /* - * Subpage (sectorsize < PAGE_SIZE) support overview: + * Subpage (block size < folio size) support overview: * * Limitations: * @@ -64,35 +63,15 @@ * This means a slightly higher tree locking latency. */ -#if PAGE_SIZE > SZ_4K -bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping) -{ - if (fs_info->sectorsize >= PAGE_SIZE) - return false; - - /* - * Only data pages (either through DIO or compression) can have no - * mapping. And if page->mapping->host is data inode, it's subpage. - * As we have ruled our sectorsize >= PAGE_SIZE case already. - */ - if (!mapping || !mapping->host || is_data_inode(BTRFS_I(mapping->host))) - return true; - - /* - * Now the only remaining case is metadata, which we only go subpage - * routine if nodesize < PAGE_SIZE. - */ - if (fs_info->nodesize < PAGE_SIZE) - return true; - return false; -} -#endif - int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, enum btrfs_subpage_type type) { struct btrfs_subpage *subpage; + /* For metadata we don't support large folio yet. 
*/ + if (type == BTRFS_SUBPAGE_METADATA) + ASSERT(!folio_test_large(folio)); + /* * We have cases like a dummy extent buffer page, which is not mapped * and doesn't need to be locked. @@ -101,10 +80,14 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, ASSERT(folio_test_locked(folio)); /* Either not subpage, or the folio already has private attached. */ - if (!btrfs_is_subpage(fs_info, folio->mapping) || folio_test_private(folio)) + if (folio_test_private(folio)) + return 0; + if (type == BTRFS_SUBPAGE_METADATA && !btrfs_meta_is_subpage(fs_info)) + return 0; + if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio)) return 0; - subpage = btrfs_alloc_subpage(fs_info, type); + subpage = btrfs_alloc_subpage(fs_info, folio_size(folio), type); if (IS_ERR(subpage)) return PTR_ERR(subpage); @@ -112,12 +95,17 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, return 0; } -void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio) +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, + enum btrfs_subpage_type type) { struct btrfs_subpage *subpage; /* Either not subpage, or the folio already has private attached. */ - if (!btrfs_is_subpage(fs_info, folio->mapping) || !folio_test_private(folio)) + if (!folio_test_private(folio)) + return; + if (type == BTRFS_SUBPAGE_METADATA && !btrfs_meta_is_subpage(fs_info)) + return; + if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio)) return; subpage = folio_detach_private(folio); @@ -126,15 +114,16 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *fol } struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, - enum btrfs_subpage_type type) + size_t fsize, enum btrfs_subpage_type type) { struct btrfs_subpage *ret; unsigned int real_size; - ASSERT(fs_info->sectorsize < PAGE_SIZE); + ASSERT(fs_info->sectorsize < fsize); real_size = struct_size(ret, bitmaps, - BITS_TO_LONGS(btrfs_bitmap_nr_max * fs_info->sectors_per_page)); + BITS_TO_LONGS(btrfs_bitmap_nr_max * + (fsize >> fs_info->sectorsize_bits))); ret = kzalloc(real_size, GFP_NOFS); if (!ret) return ERR_PTR(-ENOMEM); @@ -165,7 +154,7 @@ void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio * { struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_meta_is_subpage(fs_info)) return; ASSERT(folio_test_private(folio) && folio->mapping); @@ -179,7 +168,7 @@ void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio * { struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_meta_is_subpage(fs_info)) return; ASSERT(folio_test_private(folio) && folio->mapping); @@ -193,9 +182,6 @@ void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio * static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - /* For subpage support, the folio must be single page. 
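Editor's note, not part of the patch: btrfs_blocks_per_folio() is used throughout the subpage hunks below but is defined elsewhere and not shown in this diff. Judging from the bitmap sizing in btrfs_alloc_subpage() above (fsize >> fs_info->sectorsize_bits), it is assumed to be equivalent to:

static inline unsigned int btrfs_blocks_per_folio(const struct btrfs_fs_info *fs_info,
						  const struct folio *folio)
{
	/* Number of filesystem blocks (sectors) covered by this folio. */
	return folio_size(folio) >> fs_info->sectorsize_bits;
}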
*/ - ASSERT(folio_order(folio) == 0); - /* Basic checks */ ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && @@ -206,16 +192,18 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, */ if (folio->mapping) ASSERT(folio_pos(folio) <= start && - start + len <= folio_pos(folio) + PAGE_SIZE); + start + len <= folio_pos(folio) + folio_size(folio)); } #define subpage_calc_start_bit(fs_info, folio, name, start, len) \ ({ \ - unsigned int __start_bit; \ + unsigned int __start_bit; \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ \ btrfs_subpage_assert(fs_info, folio, start, len); \ - __start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ - __start_bit += fs_info->sectors_per_page * btrfs_bitmap_nr_##name; \ + __start_bit = offset_in_folio(folio, start) >> fs_info->sectorsize_bits; \ + __start_bit += blocks_per_folio * btrfs_bitmap_nr_##name; \ __start_bit; \ }) @@ -233,7 +221,7 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) if (folio_pos(folio) >= orig_start + orig_len) *len = 0; else - *len = min_t(u64, folio_pos(folio) + PAGE_SIZE, + *len = min_t(u64, folio_pos(folio) + folio_size(folio), orig_start + orig_len) - *start; } @@ -296,7 +284,7 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, ASSERT(folio_test_locked(folio)); - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio)) { folio_unlock(folio); return; } @@ -323,13 +311,14 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long bitmap) { struct btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); + const int start_bit = blocks_per_folio * btrfs_bitmap_nr_locked; unsigned long flags; bool last = false; int cleared = 0; int bit; - if (!btrfs_is_subpage(fs_info, folio->mapping)) { + if (!btrfs_is_subpage(fs_info, folio)) { folio_unlock(folio); return; } @@ -341,7 +330,7 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, } spin_lock_irqsave(&subpage->lock, flags); - for_each_set_bit(bit, &bitmap, fs_info->sectors_per_page) { + for_each_set_bit(bit, &bitmap, blocks_per_folio) { if (test_and_clear_bit(bit + start_bit, subpage->bitmaps)) cleared++; } @@ -352,15 +341,27 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, folio_unlock(folio); } -#define subpage_test_bitmap_all_set(fs_info, subpage, name) \ +#define subpage_test_bitmap_all_set(fs_info, folio, name) \ +({ \ + struct btrfs_subpage *subpage = folio_get_private(folio); \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ + \ bitmap_test_range_all_set(subpage->bitmaps, \ - fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ - fs_info->sectors_per_page) + blocks_per_folio * btrfs_bitmap_nr_##name, \ + blocks_per_folio); \ +}) -#define subpage_test_bitmap_all_zero(fs_info, subpage, name) \ +#define subpage_test_bitmap_all_zero(fs_info, folio, name) \ +({ \ + struct btrfs_subpage *subpage = folio_get_private(folio); \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ + \ bitmap_test_range_all_zero(subpage->bitmaps, \ - fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ - fs_info->sectors_per_page) + 
blocks_per_folio * btrfs_bitmap_nr_##name, \ + blocks_per_folio); \ +}) void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) @@ -372,7 +373,7 @@ void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate)) + if (subpage_test_bitmap_all_set(fs_info, folio, uptodate)) folio_mark_uptodate(folio); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -426,7 +427,7 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_zero(fs_info, subpage, dirty)) + if (subpage_test_bitmap_all_zero(fs_info, folio, dirty)) last = true; spin_unlock_irqrestore(&subpage->lock, flags); return last; @@ -467,7 +468,7 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) { + if (subpage_test_bitmap_all_zero(fs_info, folio, writeback)) { ASSERT(folio_test_writeback(folio)); folio_end_writeback(folio); } @@ -498,7 +499,7 @@ void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered)) + if (subpage_test_bitmap_all_zero(fs_info, folio, ordered)) folio_clear_ordered(folio); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -513,7 +514,7 @@ void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_set(fs_info, subpage, checked)) + if (subpage_test_bitmap_all_set(fs_info, folio, checked)) folio_set_checked(folio); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -569,7 +570,7 @@ void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_set_func(folio); \ return; \ } \ @@ -579,7 +580,7 @@ void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_clear_func(folio); \ return; \ } \ @@ -589,7 +590,7 @@ bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) \ + !btrfs_is_subpage(fs_info, folio)) \ return folio_test_func(folio); \ return btrfs_subpage_test_##name(fs_info, folio, start, len); \ } \ @@ -597,7 +598,7 @@ void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_set_func(folio); \ return; \ } \ @@ -608,7 +609,7 @@ void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ struct 
folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_clear_func(folio); \ return; \ } \ @@ -619,10 +620,32 @@ bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) \ + !btrfs_is_subpage(fs_info, folio)) \ return folio_test_func(folio); \ btrfs_subpage_clamp_range(folio, &start, &len); \ return btrfs_subpage_test_##name(fs_info, folio, start, len); \ +} \ +void btrfs_meta_folio_set_##name(struct folio *folio, const struct extent_buffer *eb) \ +{ \ + if (!btrfs_meta_is_subpage(eb->fs_info)) { \ + folio_set_func(folio); \ + return; \ + } \ + btrfs_subpage_set_##name(eb->fs_info, folio, eb->start, eb->len); \ +} \ +void btrfs_meta_folio_clear_##name(struct folio *folio, const struct extent_buffer *eb) \ +{ \ + if (!btrfs_meta_is_subpage(eb->fs_info)) { \ + folio_clear_func(folio); \ + return; \ + } \ + btrfs_subpage_clear_##name(eb->fs_info, folio, eb->start, eb->len); \ +} \ +bool btrfs_meta_folio_test_##name(struct folio *folio, const struct extent_buffer *eb) \ +{ \ + if (!btrfs_meta_is_subpage(eb->fs_info)) \ + return folio_test_func(folio); \ + return btrfs_subpage_test_##name(eb->fs_info, folio, eb->start, eb->len); \ } IMPLEMENT_BTRFS_PAGE_OPS(uptodate, folio_mark_uptodate, folio_clear_uptodate, folio_test_uptodate); @@ -635,26 +658,29 @@ IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered, IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, folio_test_checked); -#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ +#define GET_SUBPAGE_BITMAP(fs_info, folio, name, dst) \ { \ - const int sectors_per_page = fs_info->sectors_per_page; \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ + const struct btrfs_subpage *subpage = folio_get_private(folio); \ \ - ASSERT(sectors_per_page < BITS_PER_LONG); \ + ASSERT(blocks_per_folio <= BITS_PER_LONG); \ *dst = bitmap_read(subpage->bitmaps, \ - sectors_per_page * btrfs_bitmap_nr_##name, \ - sectors_per_page); \ + blocks_per_folio * btrfs_bitmap_nr_##name, \ + blocks_per_folio); \ } #define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len) \ { \ - const struct btrfs_subpage *subpage = folio_get_private(folio); \ unsigned long bitmap; \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ \ - GET_SUBPAGE_BITMAP(subpage, fs_info, name, &bitmap); \ + GET_SUBPAGE_BITMAP(fs_info, folio, name, &bitmap); \ btrfs_warn(fs_info, \ "dumpping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \ start, len, folio_pos(folio), \ - fs_info->sectors_per_page, &bitmap); \ + blocks_per_folio, &bitmap); \ } /* @@ -672,7 +698,7 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return; - if (!btrfs_is_subpage(fs_info, folio->mapping)) { + if (!btrfs_is_subpage(fs_info, folio)) { ASSERT(!folio_test_dirty(folio)); return; } @@ -707,7 +733,7 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, int ret; ASSERT(folio_test_locked(folio)); - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio)) return; subpage = folio_get_private(folio); @@ -721,15 +747,37 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, } 
bitmap_set(subpage->bitmaps, start_bit, nbits); ret = atomic_add_return(nbits, &subpage->nr_locked); - ASSERT(ret <= fs_info->sectors_per_page); + ASSERT(ret <= btrfs_blocks_per_folio(fs_info, folio)); spin_unlock_irqrestore(&subpage->lock, flags); } +/* + * Clear the dirty flag for the folio. + * + * If the affected folio is no longer dirty, return true. Otherwise return false. + */ +bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct extent_buffer *eb) +{ + bool last; + + if (!btrfs_meta_is_subpage(eb->fs_info)) { + folio_clear_dirty_for_io(folio); + return true; + } + + last = btrfs_subpage_clear_and_test_dirty(eb->fs_info, folio, eb->start, eb->len); + if (last) { + folio_clear_dirty_for_io(folio); + return true; + } + return false; +} + void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage; - const u32 sectors_per_page = fs_info->sectors_per_page; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); unsigned long uptodate_bitmap; unsigned long dirty_bitmap; unsigned long writeback_bitmap; @@ -739,28 +787,28 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); - ASSERT(sectors_per_page > 1); + ASSERT(blocks_per_folio > 1); subpage = folio_get_private(folio); spin_lock_irqsave(&subpage->lock, flags); - GET_SUBPAGE_BITMAP(subpage, fs_info, uptodate, &uptodate_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, &dirty_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &locked_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, uptodate, &uptodate_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, dirty, &dirty_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, writeback, &writeback_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, ordered, &ordered_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, checked, &checked_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, locked, &locked_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); dump_page(folio_page(folio, 0), "btrfs subpage dump"); btrfs_warn(fs_info, "start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", start, len, folio_pos(folio), - sectors_per_page, &uptodate_bitmap, - sectors_per_page, &dirty_bitmap, - sectors_per_page, &locked_bitmap, - sectors_per_page, &writeback_bitmap, - sectors_per_page, &ordered_bitmap, - sectors_per_page, &checked_bitmap); + blocks_per_folio, &uptodate_bitmap, + blocks_per_folio, &dirty_bitmap, + blocks_per_folio, &locked_bitmap, + blocks_per_folio, &writeback_bitmap, + blocks_per_folio, &ordered_bitmap, + blocks_per_folio, &checked_bitmap); } void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, @@ -771,10 +819,10 @@ void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); - ASSERT(fs_info->sectors_per_page > 1); + ASSERT(btrfs_blocks_per_folio(fs_info, folio) > 1); subpage = folio_get_private(folio); spin_lock_irqsave(&subpage->lock, flags); - GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, ret_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, dirty, ret_bitmap); spin_unlock_irqrestore(&subpage->lock, 
flags); } diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 44fff1f4eac4..3042c5ea840a 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -6,10 +6,11 @@ #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/sizes.h> +#include "btrfs_inode.h" +#include "fs.h" struct address_space; struct folio; -struct btrfs_fs_info; /* * Extra info for subpapge bitmap. @@ -69,23 +70,49 @@ enum btrfs_subpage_type { BTRFS_SUBPAGE_DATA, }; -#if PAGE_SIZE > SZ_4K -bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping); +#if PAGE_SIZE > BTRFS_MIN_BLOCKSIZE +/* + * Subpage support for metadata is more complex, as we can have dummy extent + * buffers, where folios have no mapping to determine the owning inode. + * + * Thankfully we only need to check if node size is smaller than page size. + * Even with larger folio support, we will only allocate a folio as large as + * node size. + * Thus if nodesize < PAGE_SIZE, we know metadata needs need to subpage routine. + */ +static inline bool btrfs_meta_is_subpage(const struct btrfs_fs_info *fs_info) +{ + return fs_info->nodesize < PAGE_SIZE; +} +static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, + struct folio *folio) +{ + if (folio->mapping && folio->mapping->host) + ASSERT(is_data_inode(BTRFS_I(folio->mapping->host))); + return fs_info->sectorsize < folio_size(folio); +} #else +static inline bool btrfs_meta_is_subpage(const struct btrfs_fs_info *fs_info) +{ + return false; +} static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, - struct address_space *mapping) + struct folio *folio) { + if (folio->mapping && folio->mapping->host) + ASSERT(is_data_inode(BTRFS_I(folio->mapping->host))); return false; } #endif int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, enum btrfs_subpage_type type); -void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio); +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, + enum btrfs_subpage_type type); /* Allocate additional data where page represents more than one sector */ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, - enum btrfs_subpage_type type); + size_t fsize, enum btrfs_subpage_type type); void btrfs_free_subpage(struct btrfs_subpage *subpage); void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); @@ -110,6 +137,13 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, * btrfs_folio_clamp_*() are similar to btrfs_folio_*(), except the range doesn't * need to be inside the page. Those functions will truncate the range * automatically. + * + * Both btrfs_folio_*() and btrfs_folio_clamp_*() are for data folios. + * + * For metadata, one should use btrfs_meta_folio_*() helpers instead, and there + * is no clamp version for metadata helpers, as we either go subpage + * (nodesize < PAGE_SIZE) or go regular folio helpers (nodesize >= PAGE_SIZE, + * and our folio is never larger than nodesize). 
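Editor's note, not part of the patch: a concrete reading of the two predicates above. With 4K pages, 4K sectorsize and 16K nodesize (the usual x86_64 defaults), btrfs_meta_is_subpage() is false (the nodesize is not smaller than the page size) and btrfs_is_subpage() is false for a single-page folio (one block fills it). With 64K pages and the same 4K sectorsize and 16K nodesize, both return true, so the per-block bitmaps in struct btrfs_subpage are used for data and metadata alike.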
*/ #define DECLARE_BTRFS_SUBPAGE_OPS(name) \ void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \ @@ -129,7 +163,10 @@ void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len); \ bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ - struct folio *folio, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); \ +void btrfs_meta_folio_set_##name(struct folio *folio, const struct extent_buffer *eb); \ +void btrfs_meta_folio_clear_##name(struct folio *folio, const struct extent_buffer *eb); \ +bool btrfs_meta_folio_test_##name(struct folio *folio, const struct extent_buffer *eb); DECLARE_BTRFS_SUBPAGE_OPS(uptodate); DECLARE_BTRFS_SUBPAGE_OPS(dirty); @@ -155,6 +192,7 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); +bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct extent_buffer *eb); void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long *ret_bitmap); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index dc4fee519ca6..a0c65adce1ab 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -84,7 +84,7 @@ struct btrfs_fs_context { u32 thread_pool_size; unsigned long long mount_opt; unsigned long compress_type:4; - unsigned int compress_level; + int compress_level; refcount_t refs; }; @@ -125,7 +125,6 @@ enum { /* Rescue options */ Opt_rescue, Opt_usebackuproot, - Opt_nologreplay, /* Debugging options */ Opt_enospc_debug, @@ -246,8 +245,6 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = { /* Rescue options. */ fsparam_enum("rescue", Opt_rescue, btrfs_parameter_rescue), - /* Deprecated, with alias rescue=nologreplay */ - __fsparam(NULL, "nologreplay", Opt_nologreplay, fs_param_deprecated, NULL), /* Deprecated, with alias rescue=usebackuproot */ __fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL), /* For compatibility only, alias for "rescue=nologreplay". 
*/ @@ -449,11 +446,6 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) else btrfs_clear_opt(ctx->mount_opt, NOTREELOG); break; - case Opt_nologreplay: - btrfs_warn(NULL, - "'nologreplay' is deprecated, use 'rescue=nologreplay' instead"); - btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); - break; case Opt_norecovery: btrfs_info(NULL, "'norecovery' is for compatibility only, recommended to use 'rescue=nologreplay'"); @@ -569,6 +561,10 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) break; case Opt_commit_interval: ctx->commit_interval = result.uint_32; + if (ctx->commit_interval > BTRFS_WARNING_COMMIT_INTERVAL) { + btrfs_warn(NULL, "excessive commit interval %u, use with care", + ctx->commit_interval); + } if (ctx->commit_interval == 0) ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; break; @@ -947,7 +943,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec static int btrfs_fill_super(struct super_block *sb, struct btrfs_fs_devices *fs_devices) { - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_fs_info *fs_info = btrfs_sb(sb); int err; @@ -982,7 +978,7 @@ static int btrfs_fill_super(struct super_block *sb, goto fail_close; } - sb->s_root = d_make_root(inode); + sb->s_root = d_make_root(&inode->vfs_inode); if (!sb->s_root) { err = -ENOMEM; goto fail_close; @@ -1139,8 +1135,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) subvol_name = btrfs_get_subvol_name_from_objectid(info, btrfs_root_id(BTRFS_I(d_inode(dentry))->root)); if (!IS_ERR(subvol_name)) { - seq_puts(seq, ",subvol="); - seq_escape(seq, subvol_name, " \t\n\\"); + seq_show_option(seq, "subvol", subvol_name); kfree(subvol_name); } return 0; @@ -1149,11 +1144,11 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) /* * subvolumes are identified by ino 256 */ -static inline int is_subvolume_inode(struct inode *inode) +static inline bool is_subvolume_inode(struct inode *inode) { if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) - return 1; - return 0; + return true; + return false; } static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, @@ -2293,7 +2288,7 @@ static int check_dev_super(struct btrfs_device *dev) return 0; /* Only need to check the primary super block. 
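The Opt_commit_interval hunk above keeps accepting any value but now warns once the interval exceeds a threshold, and still maps 0 back to the default. A plain-C sketch of that validation follows; the two constants are example values only, not taken from the btrfs headers:

#include <stdio.h>

#define EXAMPLE_DEFAULT_COMMIT_INTERVAL  30u   /* seconds, illustrative */
#define EXAMPLE_WARNING_COMMIT_INTERVAL 300u   /* seconds, illustrative */

static unsigned int parse_commit_interval(unsigned int requested)
{
        /* Accept the value, but warn when it is suspiciously large. */
        if (requested > EXAMPLE_WARNING_COMMIT_INTERVAL)
                fprintf(stderr, "excessive commit interval %u, use with care\n",
                        requested);
        /* 0 means "use the default interval". */
        if (requested == 0)
                return EXAMPLE_DEFAULT_COMMIT_INTERVAL;
        return requested;
}

int main(void)
{
        printf("%u\n", parse_commit_interval(0));    /* falls back to the default */
        printf("%u\n", parse_commit_interval(600));  /* accepted, but warns */
        return 0;
}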
*/ - sb = btrfs_read_dev_one_super(dev->bdev, 0, true); + sb = btrfs_read_disk_super(dev->bdev, 0, true); if (IS_ERR(sb)) return PTR_ERR(sb); @@ -2526,8 +2521,8 @@ static const struct init_sequence mod_init_seq[] = { .init_func = btrfs_free_space_init, .exit_func = btrfs_free_space_exit, }, { - .init_func = extent_state_init_cachep, - .exit_func = extent_state_free_cachep, + .init_func = btrfs_extent_state_init_cachep, + .exit_func = btrfs_extent_state_free_cachep, }, { .init_func = extent_buffer_init_cachep, .exit_func = extent_buffer_free_cachep, @@ -2535,8 +2530,8 @@ static const struct init_sequence mod_init_seq[] = { .init_func = btrfs_bioset_init, .exit_func = btrfs_bioset_exit, }, { - .init_func = extent_map_init, - .exit_func = extent_map_exit, + .init_func = btrfs_extent_map_init, + .exit_func = btrfs_extent_map_exit, #ifdef CONFIG_BTRFS_EXPERIMENTAL }, { .init_func = btrfs_read_policy_init, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 53b846d99ece..5d93d9dd2c12 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -411,7 +411,8 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, { ssize_t ret = 0; - /* An artificial limit to only support 4K and PAGE_SIZE */ + if (BTRFS_MIN_BLOCKSIZE != SZ_4K && BTRFS_MIN_BLOCKSIZE != PAGE_SIZE) + ret += sysfs_emit_at(buf, ret, "%u ", BTRFS_MIN_BLOCKSIZE); if (PAGE_SIZE > SZ_4K) ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K); ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE); @@ -1330,29 +1331,30 @@ MODULE_PARM_DESC(read_policy, int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) { - char param[32] = { 0 }; + char param[32]; char __maybe_unused *value_str; if (!str || strlen(str) == 0) return 0; - strncpy(param, str, sizeof(param) - 1); + strscpy(param, str); #ifdef CONFIG_BTRFS_EXPERIMENTAL /* Separate value from input in policy:value format. */ value_str = strchr(param, ':'); if (value_str) { - int ret; + char *retptr; *value_str = 0; value_str++; if (!value_ret) return -EINVAL; - ret = kstrtos64(value_str, 10, value_ret); - if (ret) + + *value_ret = memparse(value_str, &retptr); + /* There could be any trailing typos after the value. 
*/ + retptr = skip_spaces(retptr); + if (*retptr != 0 || *value_ret <= 0) return -EINVAL; - if (*value_ret < 0) - return -ERANGE; } #endif @@ -1928,16 +1930,35 @@ void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info) kobject_put(&space_info->kobj); } -static const char *alloc_name(u64 flags) +static const char *alloc_name(struct btrfs_space_info *space_info) { + u64 flags = space_info->flags; + switch (flags) { case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: return "mixed"; case BTRFS_BLOCK_GROUP_METADATA: - return "metadata"; + switch (space_info->subgroup_id) { + case BTRFS_SUB_GROUP_PRIMARY: + return "metadata"; + case BTRFS_SUB_GROUP_TREELOG: + return "metadata-treelog"; + default: + WARN_ON_ONCE(1); + return "metadata (unknown sub-group)"; + } case BTRFS_BLOCK_GROUP_DATA: - return "data"; + switch (space_info->subgroup_id) { + case BTRFS_SUB_GROUP_PRIMARY: + return "data"; + case BTRFS_SUB_GROUP_DATA_RELOC: + return "data-reloc"; + default: + WARN_ON_ONCE(1); + return "data (unknown sub-group)"; + } case BTRFS_BLOCK_GROUP_SYSTEM: + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY); return "system"; default: WARN_ON(1); @@ -1956,7 +1977,7 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, fs_info->space_info_kobj, "%s", - alloc_name(space_info->flags)); + alloc_name(space_info)); if (ret) { kobject_put(&space_info->kobj); return ret; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 3fc5c6f90dc4..0f94ae923210 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -7,6 +7,7 @@ #include <linux/compiler_types.h> #include <linux/kobject.h> +struct block_device; struct btrfs_fs_info; struct btrfs_device; struct btrfs_fs_devices; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 5eff8d7d2360..b576897d71cc 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -102,7 +102,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info) if (!dev) return ERR_PTR(-ENOMEM); - extent_io_tree_init(fs_info, &dev->alloc_state, 0); + btrfs_extent_io_tree_init(fs_info, &dev->alloc_state, 0); INIT_LIST_HEAD(&dev->dev_list); list_add(&dev->dev_list, &fs_info->fs_devices->devices); @@ -111,7 +111,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info) static void btrfs_free_dummy_device(struct btrfs_device *dev) { - extent_io_tree_release(&dev->alloc_state); + btrfs_extent_io_tree_release(&dev->alloc_state); kfree(dev); } @@ -157,9 +157,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) { - struct radix_tree_iter iter; - void **slot; struct btrfs_device *dev, *tmp; + struct extent_buffer *eb; + unsigned long index; if (!fs_info) return; @@ -169,25 +169,13 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) test_mnt->mnt_sb->s_fs_info = NULL; - spin_lock(&fs_info->buffer_lock); - radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { - struct extent_buffer *eb; - - eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock); - if (!eb) - continue; - /* Shouldn't happen but that kind of thinking creates CVE's */ - if (radix_tree_exception(eb)) { - if (radix_tree_deref_retry(eb)) - slot = radix_tree_iter_retry(&iter); - continue; - } - slot = radix_tree_iter_resume(slot, &iter); - spin_unlock(&fs_info->buffer_lock); - free_extent_buffer_stale(eb); - 
spin_lock(&fs_info->buffer_lock); + xa_lock_irq(&fs_info->buffer_tree); + xa_for_each(&fs_info->buffer_tree, index, eb) { + xa_unlock_irq(&fs_info->buffer_tree); + free_extent_buffer(eb); + xa_lock_irq(&fs_info->buffer_tree); } - spin_unlock(&fs_info->buffer_lock); + xa_unlock_irq(&fs_info->buffer_tree); btrfs_mapping_tree_free(fs_info); list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices, diff --git a/fs/btrfs/tests/delayed-refs-tests.c b/fs/btrfs/tests/delayed-refs-tests.c index 6558508c2ddf..265370e79a54 100644 --- a/fs/btrfs/tests/delayed-refs-tests.c +++ b/fs/btrfs/tests/delayed-refs-tests.c @@ -1009,6 +1009,7 @@ int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize) if (!ret) ret = select_delayed_refs_test(&trans); + kfree(transaction); out_free_fs_info: btrfs_free_dummy_fs_info(fs_info); return ret; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 0a2dbfaaf49e..00da54f0164c 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -14,9 +14,9 @@ #include "../disk-io.h" #include "../btrfs_inode.h" -#define PROCESS_UNLOCK (1 << 0) -#define PROCESS_RELEASE (1 << 1) -#define PROCESS_TEST_LOCKED (1 << 2) +#define PROCESS_UNLOCK (1U << 0) +#define PROCESS_RELEASE (1U << 1) +#define PROCESS_TEST_LOCKED (1U << 2) static noinline int process_page_range(struct inode *inode, u64 start, u64 end, unsigned long flags) @@ -74,7 +74,6 @@ static void extent_flag_to_str(const struct extent_state *state, char *dest) dest[0] = 0; PRINT_ONE_FLAG(state, dest, cur, DIRTY); - PRINT_ONE_FLAG(state, dest, cur, UPTODATE); PRINT_ONE_FLAG(state, dest, cur, LOCKED); PRINT_ONE_FLAG(state, dest, cur, NEW); PRINT_ONE_FLAG(state, dest, cur, DELALLOC); @@ -150,7 +149,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) * Passing NULL as we don't have fs_info but tracepoints are not used * at this point */ - extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST); + btrfs_extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST); /* * First go through and create and mark all of our pages dirty, we pin @@ -177,7 +176,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) * |--- delalloc ---| * |--- search ---| */ - set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL); + btrfs_set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL); start = 0; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, @@ -191,7 +190,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) sectorsize - 1, start, end); goto out_bits; } - unlock_extent(tmp, start, end, NULL); + btrfs_unlock_extent(tmp, start, end, NULL); unlock_page(locked_page); put_page(locked_page); @@ -208,7 +207,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) test_err("couldn't find the locked page"); goto out_bits; } - set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL); + btrfs_set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, @@ -227,7 +226,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) test_err("there were unlocked pages in the range"); goto out_bits; } - unlock_extent(tmp, start, end, NULL); + btrfs_unlock_extent(tmp, start, end, NULL); /* locked_page was unlocked above */ put_page(locked_page); @@ -263,7 +262,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) * * We are re-using our test_start from 
above since it works out well. */ - set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL); + btrfs_set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, @@ -282,7 +281,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) test_err("pages in range were not all locked"); goto out_bits; } - unlock_extent(tmp, start, end, NULL); + btrfs_unlock_extent(tmp, start, end, NULL); /* * Now to test where we run into a page that is no longer dirty in the @@ -327,7 +326,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) out_bits: if (ret) dump_extent_io_tree(tmp); - clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1); + btrfs_clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1); out: if (locked_page) put_page(locked_page); @@ -525,7 +524,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) goto out; } - eb = __alloc_dummy_extent_buffer(fs_info, 0, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, 0); if (!eb) { test_std_err(TEST_ALLOC_ROOT); ret = -ENOMEM; @@ -542,7 +541,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) * Test again for case where the tree block is sectorsize aligned but * not nodesize aligned. */ - eb = __alloc_dummy_extent_buffer(fs_info, sectorsize, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, sectorsize); if (!eb) { test_std_err(TEST_ALLOC_ROOT); ret = -ENOMEM; @@ -565,10 +564,10 @@ static int test_find_first_clear_extent_bit(void) test_msg("running find_first_clear_extent_bit test"); - extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST); + btrfs_extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST); /* Test correct handling of empty tree */ - find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED); + btrfs_find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED); if (start != 0 || end != -1) { test_err( "error getting a range from completely empty tree: start %llu end %llu", @@ -579,11 +578,11 @@ static int test_find_first_clear_extent_bit(void) * Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between * 4M-32M */ - set_extent_bit(&tree, SZ_1M, SZ_4M - 1, - CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); + btrfs_set_extent_bit(&tree, SZ_1M, SZ_4M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); - find_first_clear_extent_bit(&tree, SZ_512K, &start, &end, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_find_first_clear_extent_bit(&tree, SZ_512K, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); if (start != 0 || end != SZ_1M - 1) { test_err("error finding beginning range: start %llu end %llu", @@ -592,14 +591,14 @@ static int test_find_first_clear_extent_bit(void) } /* Now add 32M-64M so that we have a hole between 4M-32M */ - set_extent_bit(&tree, SZ_32M, SZ_64M - 1, - CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); + btrfs_set_extent_bit(&tree, SZ_32M, SZ_64M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); /* * Request first hole starting at 12M, we should get 4M-32M */ - find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); if (start != SZ_4M || end != SZ_32M - 1) { test_err("error finding trimmed range: start %llu end %llu", @@ -611,8 +610,8 @@ static int test_find_first_clear_extent_bit(void) * Search in the middle of allocated range, should get the next one * available, which happens to be unallocated -> 4M-32M 
*/ - find_first_clear_extent_bit(&tree, SZ_2M, &start, &end, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_find_first_clear_extent_bit(&tree, SZ_2M, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); if (start != SZ_4M || end != SZ_32M - 1) { test_err("error finding next unalloc range: start %llu end %llu", @@ -624,9 +623,9 @@ static int test_find_first_clear_extent_bit(void) * Set 64M-72M with CHUNK_ALLOC flag, then search for CHUNK_TRIMMED flag * being unset in this range, we should get the entry in range 64M-72M */ - set_extent_bit(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED, NULL); - find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end, - CHUNK_TRIMMED); + btrfs_set_extent_bit(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED, NULL); + btrfs_find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end, + CHUNK_TRIMMED); if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) { test_err("error finding exact range: start %llu end %llu", @@ -634,8 +633,8 @@ static int test_find_first_clear_extent_bit(void) goto out; } - find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end, - CHUNK_TRIMMED); + btrfs_find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end, + CHUNK_TRIMMED); /* * Search in the middle of set range whose immediate neighbour doesn't @@ -651,7 +650,7 @@ static int test_find_first_clear_extent_bit(void) * Search beyond any known range, shall return after last known range * and end should be -1 */ - find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED); + btrfs_find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED); if (start != SZ_64M + SZ_8M || end != -1) { test_err( "error handling beyond end of range search: start %llu end %llu", @@ -663,7 +662,7 @@ static int test_find_first_clear_extent_bit(void) out: if (ret) dump_extent_io_tree(&tree); - clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED); return ret; } @@ -730,7 +729,7 @@ static int test_eb_mem_ops(u32 sectorsize, u32 nodesize) goto out; } - eb = __alloc_dummy_extent_buffer(fs_info, SZ_1M, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, SZ_1M); if (!eb) { test_std_err(TEST_ALLOC_EXTENT_BUFFER); ret = -ENOMEM; diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 56e61ac1cc64..3a86534c116f 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -22,7 +22,7 @@ static int free_extent_map_tree(struct btrfs_inode *inode) while (!RB_EMPTY_ROOT(&em_tree->root)) { node = rb_first(&em_tree->root); em = rb_entry(node, struct extent_map, rb_node); - remove_extent_mapping(inode, em); + btrfs_remove_extent_mapping(inode, em); #ifdef CONFIG_BTRFS_DEBUG if (refcount_read(&em->refs) != 1) { @@ -36,7 +36,7 @@ static int free_extent_map_tree(struct btrfs_inode *inode) refcount_set(&em->refs, 1); } #endif - free_extent_map(em); + btrfs_free_extent_map(em); } write_unlock(&em_tree->lock); @@ -68,7 +68,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) int ret; int ret2; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -87,10 +87,10 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("cannot add extent range [0, 16K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); /* Add [16K, 20K) following [0, 16K) */ - em = alloc_extent_map(); + em = 
btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -109,9 +109,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("cannot add extent range [16K, 20K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -137,7 +137,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) ret = -ENOENT; goto out; } - if (em->start != 0 || extent_map_end(em) != SZ_16K || + if (em->start != 0 || btrfs_extent_map_end(em) != SZ_16K || em->disk_bytenr != 0 || em->disk_num_bytes != SZ_16K) { test_err( "case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu", @@ -145,7 +145,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) em->disk_bytenr, em->disk_num_bytes); ret = -EINVAL; } - free_extent_map(em); + btrfs_free_extent_map(em); out: ret2 = free_extent_map_tree(inode); if (ret == 0) @@ -167,7 +167,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) int ret; int ret2; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -186,10 +186,10 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("cannot add extent range [0, 1K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); /* Add [4K, 8K) following [0, 1K) */ - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -208,9 +208,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("cannot add extent range [4K, 8K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -235,14 +235,14 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) ret = -ENOENT; goto out; } - if (em->start != 0 || extent_map_end(em) != SZ_1K || + if (em->start != 0 || btrfs_extent_map_end(em) != SZ_1K || em->disk_bytenr != EXTENT_MAP_INLINE) { test_err( "case2 [0 1K]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu", ret, em->start, em->len, em->disk_bytenr); ret = -EINVAL; } - free_extent_map(em); + btrfs_free_extent_map(em); out: ret2 = free_extent_map_tree(inode); if (ret == 0) @@ -260,7 +260,7 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, int ret; int ret2; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -279,9 +279,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, test_err("cannot add extent range [4K, 8K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -312,15 +312,15 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, * Since bytes within em are contiguous, em->block_start is identical to * em->start. 
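The extent-map tests above lean on the renamed accessors btrfs_extent_map_end() and btrfs_extent_map_block_start(), which are small arithmetic helpers over the extent map fields. A reduced stand-alone model of that arithmetic is sketched below; it uses an invented struct and deliberately ignores the hole, inline and compressed special cases the real helpers handle:

#include <stdint.h>
#include <stdio.h>

struct em_model {
        uint64_t start;        /* logical file offset of the mapping */
        uint64_t len;          /* length of the mapping */
        uint64_t disk_bytenr;  /* start of the extent on disk */
        uint64_t offset;       /* offset into the on-disk extent */
};

/* End of the mapping in the file, exclusive. */
static uint64_t em_end(const struct em_model *em)
{
        return em->start + em->len;
}

/* Physical start of the mapped bytes for a plain, uncompressed extent. */
static uint64_t em_block_start(const struct em_model *em)
{
        return em->disk_bytenr + em->offset;
}

int main(void)
{
        /* A 16K mapping at file offset 32K, 4K into a disk extent at 1M. */
        struct em_model em = {
                .start = 32768, .len = 16384,
                .disk_bytenr = 1048576, .offset = 4096,
        };

        printf("end=%llu block_start=%llu\n",
               (unsigned long long)em_end(&em),
               (unsigned long long)em_block_start(&em));
        return 0;
}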
*/ - if (start < em->start || start + len > extent_map_end(em) || - em->start != extent_map_block_start(em)) { + if (start < em->start || start + len > btrfs_extent_map_end(em) || + em->start != btrfs_extent_map_block_start(em)) { test_err( "case3 [%llu %llu): ret %d em (start %llu len %llu disk_bytenr %llu block_len %llu)", start, start + len, ret, em->start, em->len, em->disk_bytenr, em->disk_num_bytes); ret = -EINVAL; } - free_extent_map(em); + btrfs_free_extent_map(em); out: ret2 = free_extent_map_tree(inode); if (ret == 0) @@ -369,7 +369,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, int ret; int ret2; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -388,9 +388,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, test_err("cannot add extent range [0, 8K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -410,9 +410,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, test_err("cannot add extent range [8K, 32K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -438,14 +438,14 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, ret = -ENOENT; goto out; } - if (start < em->start || start + len > extent_map_end(em)) { + if (start < em->start || start + len > btrfs_extent_map_end(em)) { test_err( "case4 [%llu %llu): ret %d, added wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu)", start, start + len, ret, em->start, em->len, em->disk_bytenr, em->disk_num_bytes); ret = -EINVAL; } - free_extent_map(em); + btrfs_free_extent_map(em); out: ret2 = free_extent_map_tree(inode); if (ret == 0) @@ -498,7 +498,7 @@ static int add_compressed_extent(struct btrfs_inode *inode, struct extent_map *em; int ret; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -513,7 +513,7 @@ static int add_compressed_extent(struct btrfs_inode *inode, write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); - free_extent_map(em); + btrfs_free_extent_map(em); if (ret < 0) { test_err("cannot add extent map [%llu, %llu)", start, start + len); return ret; @@ -719,7 +719,7 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) if (ret) goto out; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -751,7 +751,7 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) } ret = 0; out: - free_extent_map(em); + btrfs_free_extent_map(em); ret2 = free_extent_map_tree(inode); if (ret == 0) ret = ret2; @@ -773,7 +773,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_msg("Running btrfs_drop_extent_cache with pinned"); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -793,9 +793,9 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("couldn't add extent map"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { 
test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -815,7 +815,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("couldn't add extent map"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); /* * Drop [0, 36K) This should skip the [0, 4K) extent and then split the @@ -826,7 +826,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) /* Make sure our extent maps look sane. */ ret = -EINVAL; - em = lookup_extent_mapping(em_tree, 0, SZ_16K); + em = btrfs_lookup_extent_mapping(em_tree, 0, SZ_16K); if (!em) { test_err("didn't find an em at 0 as expected"); goto out; @@ -842,10 +842,10 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, SZ_16K, SZ_16K); + em = btrfs_lookup_extent_mapping(em_tree, SZ_16K, SZ_16K); read_unlock(&em_tree->lock); if (em) { test_err("found an em when we weren't expecting one"); @@ -853,7 +853,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) } read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, SZ_32K, SZ_16K); + em = btrfs_lookup_extent_mapping(em_tree, SZ_32K, SZ_16K); read_unlock(&em_tree->lock); if (!em) { test_err("didn't find an em at 32K as expected"); @@ -870,16 +870,16 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) goto out; } - if (extent_map_block_start(em) != SZ_32K + SZ_4K) { + if (btrfs_extent_map_block_start(em) != SZ_32K + SZ_4K) { test_err("em->block_start is %llu, expected 36K", - extent_map_block_start(em)); + btrfs_extent_map_block_start(em)); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, 48 * SZ_1K, (u64)-1); + em = btrfs_lookup_extent_mapping(em_tree, 48 * SZ_1K, (u64)-1); read_unlock(&em_tree->lock); if (em) { test_err("found an unexpected em above 48K"); @@ -888,9 +888,9 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) ret = 0; out: - free_extent_map(em); + btrfs_free_extent_map(em); /* Unpin our extent to prevent warning when removing it below. 
*/ - ret2 = unpin_extent_cache(inode, 0, SZ_16K, 0); + ret2 = btrfs_unpin_extent_cache(inode, 0, SZ_16K, 0); if (ret == 0) ret = ret2; ret2 = free_extent_map_tree(inode); @@ -913,7 +913,7 @@ static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) int ret; int ret2; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -928,13 +928,13 @@ static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); - free_extent_map(em); + btrfs_free_extent_map(em); if (ret < 0) { test_err("couldn't add extent map for range [120K, 128K)"); goto out; } - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -967,7 +967,7 @@ static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(inode, &em, SZ_1K * 140, SZ_4K); write_unlock(&em_tree->lock); - free_extent_map(em); + btrfs_free_extent_map(em); if (ret < 0) { test_err("couldn't add extent map for range [108K, 144K)"); goto out; @@ -1045,6 +1045,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, ret = btrfs_add_chunk_map(fs_info, map); if (ret) { test_err("error adding chunk map to mapping tree"); + btrfs_free_chunk_map(map); goto out_free; } diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 3ea3bc2225fe..a29d2c02c2c8 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -268,7 +268,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("expected a hole, got %llu", em->disk_bytenr); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); /* @@ -314,7 +314,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) * this? 
*/ offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -336,7 +336,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* Regular extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -363,7 +363,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* The next 3 are split extents */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -389,10 +389,10 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - disk_bytenr = extent_map_block_start(em); + disk_bytenr = btrfs_extent_map_block_start(em); orig_start = em->start; offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -414,7 +414,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -441,13 +441,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } disk_bytenr += (em->start - orig_start); - if (extent_map_block_start(em) != disk_bytenr) { + if (btrfs_extent_map_block_start(em) != disk_bytenr) { test_err("wrong block start, want %llu, have %llu", - disk_bytenr, extent_map_block_start(em)); + disk_bytenr, btrfs_extent_map_block_start(em)); goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* Prealloc extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -475,7 +475,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* The next 3 are a half written prealloc extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -502,10 +502,10 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - disk_bytenr = extent_map_block_start(em); + disk_bytenr = btrfs_extent_map_block_start(em); orig_start = em->start; offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -531,13 +531,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em->start - orig_start, em->offset); goto out; } - if (extent_map_block_start(em) != disk_bytenr + em->offset) { + if (btrfs_extent_map_block_start(em) != disk_bytenr + em->offset) { test_err("unexpected block start, wanted %llu, have %llu", - disk_bytenr + em->offset, extent_map_block_start(em)); + disk_bytenr + em->offset, btrfs_extent_map_block_start(em)); goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -564,13 +564,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em->start, em->offset, orig_start); goto out; } - if 
(extent_map_block_start(em) != disk_bytenr + em->offset) { + if (btrfs_extent_map_block_start(em) != disk_bytenr + em->offset) { test_err("unexpected block start, wanted %llu, have %llu", - disk_bytenr + em->offset, extent_map_block_start(em)); + disk_bytenr + em->offset, btrfs_extent_map_block_start(em)); goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* Now for the compressed extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -597,13 +597,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { + if (btrfs_extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); + BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em)); goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* Split compressed extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -630,15 +630,15 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { + if (btrfs_extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); + BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em)); goto out; } - disk_bytenr = extent_map_block_start(em); + disk_bytenr = btrfs_extent_map_block_start(em); orig_start = em->start; offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -664,16 +664,16 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (extent_map_block_start(em) != disk_bytenr) { + if (btrfs_extent_map_block_start(em) != disk_bytenr) { test_err("block start does not match, want %llu got %llu", - disk_bytenr, extent_map_block_start(em)); + disk_bytenr, btrfs_extent_map_block_start(em)); goto out; } if (em->start != offset || em->len != 2 * sectorsize) { @@ -692,13 +692,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em->start, em->offset, orig_start); goto out; } - if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { + if (btrfs_extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); + BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em)); goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* A hole between regular extents but no hole extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset + 6, sectorsize); @@ -725,7 +725,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, SZ_4M); if (IS_ERR(em)) { @@ -757,7 +757,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 
nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -785,7 +785,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) ret = 0; out: if (!IS_ERR(em)) - free_extent_map(em); + btrfs_free_extent_map(em); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); @@ -858,15 +858,16 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) em->flags); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, sectorsize, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (extent_map_block_start(em) != sectorsize) { - test_err("expected a real extent, got %llu", extent_map_block_start(em)); + if (btrfs_extent_map_block_start(em) != sectorsize) { + test_err("expected a real extent, got %llu", + btrfs_extent_map_block_start(em)); goto out; } if (em->start != sectorsize || em->len != sectorsize) { @@ -883,7 +884,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) ret = 0; out: if (!IS_ERR(em)) - free_extent_map(em); + btrfs_free_extent_map(em); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); @@ -949,11 +950,10 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE/2][sectorsize HOLE][the rest] */ - ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, - BTRFS_MAX_EXTENT_SIZE >> 1, - (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, NULL); + ret = btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE >> 1, + (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1017,11 +1017,10 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */ - ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, - BTRFS_MAX_EXTENT_SIZE + sectorsize, - BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, NULL); + ret = btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1052,9 +1051,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* Empty */ - ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, NULL); + ret = btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1068,9 +1066,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = 0; out: if (ret) - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, NULL); + btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index aca83a98b75a..b96195d6480f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -160,7 +160,13 @@ void 
btrfs_put_transaction(struct btrfs_transaction *transaction) cache = list_first_entry(&transaction->deleted_bgs, struct btrfs_block_group, bg_list); + /* + * Not strictly necessary to lock, as no other task will be using a + * block_group on the deleted_bgs list during a transaction abort. + */ + spin_lock(&transaction->fs_info->unused_bgs_lock); list_del_init(&cache->bg_list); + spin_unlock(&transaction->fs_info->unused_bgs_lock); btrfs_unfreeze_block_group(cache); btrfs_put_block_group(cache); } @@ -191,7 +197,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) list_del_init(&root->dirty_list); free_extent_buffer(root->commit_root); root->commit_root = btrfs_root_node(root); - extent_io_tree_release(&root->dirty_log_pages); + btrfs_extent_io_tree_release(&root->dirty_log_pages); btrfs_qgroup_clean_swapped_blocks(root); } @@ -377,10 +383,10 @@ loop: INIT_LIST_HEAD(&cur_trans->deleted_bgs); spin_lock_init(&cur_trans->dropped_roots_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); - extent_io_tree_init(fs_info, &cur_trans->dirty_pages, - IO_TREE_TRANS_DIRTY_PAGES); - extent_io_tree_init(fs_info, &cur_trans->pinned_extents, - IO_TREE_FS_PINNED_EXTENTS); + btrfs_extent_io_tree_init(fs_info, &cur_trans->dirty_pages, + IO_TREE_TRANS_DIRTY_PAGES); + btrfs_extent_io_tree_init(fs_info, &cur_trans->pinned_extents, + IO_TREE_FS_PINNED_EXTENTS); btrfs_set_fs_generation(fs_info, fs_info->generation + 1); cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; @@ -532,15 +538,15 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info) } } -static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type) +static bool may_wait_transaction(struct btrfs_fs_info *fs_info, int type) { if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) - return 0; + return false; if (type == TRANS_START) - return 1; + return true; - return 0; + return false; } static inline bool need_reserve_reloc_root(struct btrfs_root *root) @@ -755,9 +761,10 @@ got_it: * value here. */ if (do_chunk_alloc && num_bytes) { - u64 flags = h->block_rsv->space_info->flags; + struct btrfs_space_info *space_info = h->block_rsv->space_info; + u64 flags = space_info->flags; - btrfs_chunk_alloc(h, btrfs_get_alloc_profile(fs_info, flags), + btrfs_chunk_alloc(h, space_info, btrfs_get_alloc_profile(fs_info, flags), CHUNK_ALLOC_NO_FORCE); } @@ -1122,13 +1129,13 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, u64 start = 0; u64 end; - while (find_first_extent_bit(dirty_pages, start, &start, &end, - mark, &cached_state)) { + while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end, + mark, &cached_state)) { bool wait_writeback = false; - ret = convert_extent_bit(dirty_pages, start, end, - EXTENT_NEED_WAIT, - mark, &cached_state); + ret = btrfs_convert_extent_bit(dirty_pages, start, end, + EXTENT_NEED_WAIT, + mark, &cached_state); /* * convert_extent_bit can return -ENOMEM, which is most of the * time a temporary error. 
So when it happens, ignore the error @@ -1149,8 +1156,8 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, if (!ret) ret = filemap_fdatawrite_range(mapping, start, end); if (!ret && wait_writeback) - ret = filemap_fdatawait_range(mapping, start, end); - free_extent_state(cached_state); + btrfs_btree_wait_writeback_range(fs_info, start, end); + btrfs_free_extent_state(cached_state); if (ret) break; cached_state = NULL; @@ -1169,14 +1176,13 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages) { - struct address_space *mapping = fs_info->btree_inode->i_mapping; struct extent_state *cached_state = NULL; u64 start = 0; u64 end; int ret = 0; - while (find_first_extent_bit(dirty_pages, start, &start, &end, - EXTENT_NEED_WAIT, &cached_state)) { + while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end, + EXTENT_NEED_WAIT, &cached_state)) { /* * Ignore -ENOMEM errors returned by clear_extent_bit(). * When committing the transaction, we'll remove any entries @@ -1185,13 +1191,13 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, * concurrently - we do it only at transaction commit time when * it's safe to do it (through extent_io_tree_release()). */ - ret = clear_extent_bit(dirty_pages, start, end, - EXTENT_NEED_WAIT, &cached_state); + ret = btrfs_clear_extent_bit(dirty_pages, start, end, + EXTENT_NEED_WAIT, &cached_state); if (ret == -ENOMEM) ret = 0; if (!ret) - ret = filemap_fdatawait_range(mapping, start, end); - free_extent_state(cached_state); + btrfs_btree_wait_writeback_range(fs_info, start, end); + btrfs_free_extent_state(cached_state); if (ret) break; cached_state = NULL; @@ -1259,7 +1265,7 @@ static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans) blk_finish_plug(&plug); ret2 = btrfs_wait_extents(fs_info, dirty_pages); - extent_io_tree_release(&trans->transaction->dirty_pages); + btrfs_extent_io_tree_release(&trans->transaction->dirty_pages); if (ret) return ret; @@ -1321,7 +1327,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; struct list_head *dirty_bgs = &trans->transaction->dirty_bgs; struct list_head *io_bgs = &trans->transaction->io_bgs; - struct list_head *next; struct extent_buffer *eb; int ret; @@ -1357,13 +1362,13 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) again: while (!list_empty(&fs_info->dirty_cowonly_roots)) { struct btrfs_root *root; - next = fs_info->dirty_cowonly_roots.next; - list_del_init(next); - root = list_entry(next, struct btrfs_root, dirty_list); + + root = list_first_entry(&fs_info->dirty_cowonly_roots, + struct btrfs_root, dirty_list); clear_bit(BTRFS_ROOT_DIRTY, &root->state); + list_move_tail(&root->dirty_list, + &trans->transaction->switch_commits); - list_add_tail(&root->dirty_list, - &trans->transaction->switch_commits); ret = update_cowonly_root(trans, root); if (ret) return ret; @@ -1635,7 +1640,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; struct btrfs_block_rsv *rsv; - struct inode *parent_inode = &pending->dir->vfs_inode; + struct btrfs_inode *parent_inode = pending->dir; struct btrfs_path *path; struct btrfs_dir_item *dir_item; struct extent_buffer *tmp; @@ -1661,7 +1666,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle 
*trans, * filesystem. */ nofs_flags = memalloc_nofs_save(); - pending->error = fscrypt_setup_filename(parent_inode, + pending->error = fscrypt_setup_filename(&parent_inode->vfs_inode, &pending->dentry->d_name, 0, &fname); memalloc_nofs_restore(nofs_flags); @@ -1690,8 +1695,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } key.objectid = objectid; - key.offset = (u64)-1; key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; rsv = trans->block_rsv; trans->block_rsv = &pending->block_rsv; @@ -1699,16 +1704,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, trans->bytes_reserved, 1); - parent_root = BTRFS_I(parent_inode)->root; + parent_root = parent_inode->root; ret = record_root_in_trans(trans, parent_root, 0); if (ret) goto fail; - cur_time = current_time(parent_inode); + cur_time = current_time(&parent_inode->vfs_inode); /* * insert the directory item */ - ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index); + ret = btrfs_set_inode_index(parent_inode, &index); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1716,7 +1721,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* check if there is a file/dir which has the same name. */ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, - btrfs_ino(BTRFS_I(parent_inode)), + btrfs_ino(parent_inode), &fname.disk_name, 0); if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; @@ -1817,7 +1822,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, */ ret = btrfs_add_root_ref(trans, objectid, btrfs_root_id(parent_root), - btrfs_ino(BTRFS_I(parent_inode)), index, + btrfs_ino(parent_inode), index, &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1855,18 +1860,18 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; ret = btrfs_insert_dir_item(trans, &fname.disk_name, - BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, + parent_inode, &key, BTRFS_FT_DIR, index); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } - btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + + btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + fname.disk_name.len * 2); - inode_set_mtime_to_ts(parent_inode, - inode_set_ctime_current(parent_inode)); - ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode)); + inode_set_mtime_to_ts(&parent_inode->vfs_inode, + inode_set_ctime_current(&parent_inode->vfs_inode)); + ret = btrfs_update_inode_fallback(trans, parent_inode); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -2096,7 +2101,14 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); + /* + * Not strictly necessary to lock, as no other task will be using a + * block_group on the new_bgs list during a transaction abort. 
+ */ + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + spin_unlock(&fs_info->unused_bgs_lock); } } @@ -2258,14 +2270,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wake_up(&fs_info->transaction_blocked_wait); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); - if (cur_trans->list.prev != &fs_info->trans_list) { + if (!list_is_first(&cur_trans->list, &fs_info->trans_list)) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; if (trans->in_fsync) want_state = TRANS_STATE_SUPER_COMMITTED; - prev_trans = list_entry(cur_trans->list.prev, - struct btrfs_transaction, list); + prev_trans = list_prev_entry(cur_trans, list); if (prev_trans->state < want_state) { refcount_inc(&prev_trans->use_count); spin_unlock(&fs_info->trans_lock); @@ -2542,7 +2553,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wake_up(&cur_trans->commit_wait); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); - btrfs_finish_extent_commit(trans); + ret = btrfs_finish_extent_commit(trans); + if (ret) + goto scrub_continue; if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) btrfs_clear_space_info_full(fs_info); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 43979891f7c8..8f4703b488b7 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1571,7 +1571,7 @@ static int check_extent_item(struct extent_buffer *leaf, inline_type); return -EUCLEAN; } - if (inline_type < last_type) { + if (unlikely(inline_type < last_type)) { extent_err(leaf, slot, "inline ref out-of-order: has type %u, prev type %u", inline_type, last_type); @@ -1580,7 +1580,7 @@ static int check_extent_item(struct extent_buffer *leaf, /* Type changed, allow the sequence starts from U64_MAX again. 
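The commit_cowonly_roots() and btrfs_commit_transaction() hunks above replace open-coded prev/next pointer arithmetic with list_first_entry(), list_move_tail(), list_is_first() and list_prev_entry(). The underlying pattern is draining one intrusive list into another, always taking the head element. A minimal user-space imitation of that pattern, using a toy list rather than <linux/list.h>, is:

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add_tail(struct list_head *n, struct list_head *head)
{
        n->prev = head->prev;
        n->next = head;
        head->prev->next = n;
        head->prev = n;
}

static void list_del(struct list_head *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
}

/* Unlink from the current list and append to @head, like list_move_tail(). */
static void list_move_tail(struct list_head *n, struct list_head *head)
{
        list_del(n);
        list_add_tail(n, head);
}

static int list_empty(const struct list_head *head) { return head->next == head; }

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))
#define list_first_entry(head, type, member) \
        container_of((head)->next, type, member)

struct root {                          /* stand-in for a dirty btrfs root */
        int id;
        struct list_head dirty_list;
};

int main(void)
{
        struct list_head dirty, commits;
        struct root roots[3] = { { .id = 1 }, { .id = 2 }, { .id = 3 } };

        INIT_LIST_HEAD(&dirty);
        INIT_LIST_HEAD(&commits);
        for (int i = 0; i < 3; i++)
                list_add_tail(&roots[i].dirty_list, &dirty);

        /* Drain the dirty list head-first, as commit_cowonly_roots() does. */
        while (!list_empty(&dirty)) {
                struct root *r = list_first_entry(&dirty, struct root, dirty_list);

                list_move_tail(&r->dirty_list, &commits);
                printf("processed root %d\n", r->id);
        }
        return 0;
}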
*/ if (inline_type > last_type) last_seq = U64_MAX; - if (seq > last_seq) { + if (unlikely(seq > last_seq)) { extent_err(leaf, slot, "inline ref out-of-order: has type %u offset %llu seq 0x%llx, prev type %u seq 0x%llx", inline_type, inline_offset, seq, @@ -1929,7 +1929,7 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, break; } - if (ret) + if (unlikely(ret)) return BTRFS_TREE_BLOCK_INVALID_ITEM; return BTRFS_TREE_BLOCK_CLEAN; } @@ -2229,13 +2229,12 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int ret; found_level = btrfs_header_level(eb); - if (found_level != check->level) { - WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), - KERN_ERR "BTRFS: tree level check failed\n"); + if (unlikely(found_level != check->level)) { + DEBUG_WARN(); btrfs_err(fs_info, "tree level mismatch detected, bytenr=%llu level expected=%u has=%u", eb->start, check->level, found_level); - return -EIO; + return -EUCLEAN; } if (!check->has_first_key) @@ -2251,11 +2250,11 @@ int btrfs_verify_level_key(struct extent_buffer *eb, return 0; /* We have @first_key, so this @eb must have at least one item */ - if (btrfs_header_nritems(eb) == 0) { + if (unlikely(btrfs_header_nritems(eb) == 0)) { btrfs_err(fs_info, "invalid tree nritems, bytenr=%llu nritems=0 expect >0", eb->start); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); return -EUCLEAN; } @@ -2263,11 +2262,10 @@ int btrfs_verify_level_key(struct extent_buffer *eb, btrfs_node_key_to_cpu(eb, &found_key, 0); else btrfs_item_key_to_cpu(eb, &found_key, 0); - ret = btrfs_comp_cpu_keys(&check->first_key, &found_key); - if (ret) { - WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), - KERN_ERR "BTRFS: tree first key check failed\n"); + ret = btrfs_comp_cpu_keys(&check->first_key, &found_key); + if (unlikely(ret)) { + DEBUG_WARN(); btrfs_err(fs_info, "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", eb->start, check->transid, check->first_key.objectid, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 955d1677e865..97e933113b82 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -138,10 +138,10 @@ static void wait_log_commit(struct btrfs_root *root, int transid); * and once to do all the other items. */ -static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root) +static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root) { unsigned int nofs_flag; - struct inode *inode; + struct btrfs_inode *inode; /* * We're holding a transaction handle whether we are logging or @@ -376,12 +376,12 @@ static int process_one_buffer(struct btrfs_root *log, } /* - * Item overwrite used by replay and tree logging. eb, slot and key all refer - * to the src data we are copying out. + * Item overwrite used by log replay. The given eb, slot and key all refer to + * the source data we are copying out. * - * root is the tree we are copying into, and path is a scratch - * path for use in this function (it should be released on entry and - * will be released on exit). + * The given root is for the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and will be + * released on exit). * * If the key is already in the destination tree the existing item is * overwritten. If the existing item isn't big enough, it is extended. 
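The comment above describes overwrite_item() during log replay: look the key up in the destination tree, skip the copy when an identical item of the same size is already there, resize the existing item when the sizes differ, and insert a new item otherwise. A compact user-space model of that decision flow is sketched below; a flat table stands in for the b-tree, the names are invented, and the inode-specific handling (nbytes, i_size, generation) is left out:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct item {
        int key;
        size_t size;
        unsigned char *data;
};

#define MAX_ITEMS 16
static struct item tree[MAX_ITEMS];
static size_t nr_items;

static struct item *find_item(int key)
{
        for (size_t i = 0; i < nr_items; i++)
                if (tree[i].key == key)
                        return &tree[i];
        return NULL;
}

/* Copy @src into the table at @key, mimicking overwrite_item()'s main cases. */
static int overwrite_item(int key, const void *src, size_t size)
{
        struct item *it = find_item(key);

        if (it && it->size == size && memcmp(it->data, src, size) == 0)
                return 0;               /* identical item already there, nothing to do */

        if (!it) {                      /* not present yet: insert a new slot */
                if (nr_items == MAX_ITEMS)
                        return -1;
                it = &tree[nr_items++];
                it->key = key;
                it->data = NULL;
                it->size = 0;
        }

        if (it->size != size) {         /* existing item too small or too large: resize it */
                unsigned char *tmp = realloc(it->data, size);

                if (!tmp)
                        return -1;
                it->data = tmp;
                it->size = size;
        }

        memcpy(it->data, src, size);    /* finally copy the source item over */
        return 0;
}

int main(void)
{
        overwrite_item(1, "hello", 6);
        overwrite_item(1, "hello", 6);          /* no-op: same size, same bytes */
        overwrite_item(1, "hello world", 12);   /* resized and overwritten */
        printf("key 1 -> \"%s\" (%zu bytes)\n", find_item(1)->data, find_item(1)->size);
        return 0;
}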
@@ -401,6 +401,8 @@ static int overwrite_item(struct btrfs_trans_handle *trans, int save_old_i_size = 0; unsigned long src_ptr; unsigned long dst_ptr; + struct extent_buffer *dst_eb; + int dst_slot; bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; /* @@ -420,11 +422,13 @@ static int overwrite_item(struct btrfs_trans_handle *trans, if (ret < 0) return ret; + dst_eb = path->nodes[0]; + dst_slot = path->slots[0]; + if (ret == 0) { char *src_copy; - char *dst_copy; - u32 dst_size = btrfs_item_size(path->nodes[0], - path->slots[0]); + const u32 dst_size = btrfs_item_size(dst_eb, dst_slot); + if (dst_size != item_size) goto insert; @@ -432,23 +436,16 @@ static int overwrite_item(struct btrfs_trans_handle *trans, btrfs_release_path(path); return 0; } - dst_copy = kmalloc(item_size, GFP_NOFS); src_copy = kmalloc(item_size, GFP_NOFS); - if (!dst_copy || !src_copy) { + if (!src_copy) { btrfs_release_path(path); - kfree(dst_copy); - kfree(src_copy); return -ENOMEM; } read_extent_buffer(eb, src_copy, src_ptr, item_size); + dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); + ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size); - dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); - read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, - item_size); - ret = memcmp(dst_copy, src_copy, item_size); - - kfree(dst_copy); kfree(src_copy); /* * they have the same contents, just return, this saves @@ -470,9 +467,9 @@ static int overwrite_item(struct btrfs_trans_handle *trans, u64 nbytes; u32 mode; - item = btrfs_item_ptr(path->nodes[0], path->slots[0], + item = btrfs_item_ptr(dst_eb, dst_slot, struct btrfs_inode_item); - nbytes = btrfs_inode_nbytes(path->nodes[0], item); + nbytes = btrfs_inode_nbytes(dst_eb, item); item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); btrfs_set_inode_nbytes(eb, item, nbytes); @@ -514,11 +511,13 @@ insert: key, item_size); path->skip_release_on_error = 0; + dst_eb = path->nodes[0]; + dst_slot = path->slots[0]; + /* make sure any existing item is the correct size */ if (ret == -EEXIST || ret == -EOVERFLOW) { - u32 found_size; - found_size = btrfs_item_size(path->nodes[0], - path->slots[0]); + const u32 found_size = btrfs_item_size(dst_eb, dst_slot); + if (found_size > item_size) btrfs_truncate_item(trans, path, item_size, 1); else if (found_size < item_size) @@ -526,8 +525,7 @@ insert: } else if (ret) { return ret; } - dst_ptr = btrfs_item_ptr_offset(path->nodes[0], - path->slots[0]); + dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); /* don't overwrite an existing inode if the generation number * was logged as zero. 
This is done when the tree logging code @@ -546,7 +544,6 @@ insert: dst_item = (struct btrfs_inode_item *)dst_ptr; if (btrfs_inode_generation(eb, src_item) == 0) { - struct extent_buffer *dst_eb = path->nodes[0]; const u64 ino_size = btrfs_inode_size(eb, src_item); /* @@ -564,30 +561,28 @@ insert: } if (S_ISDIR(btrfs_inode_mode(eb, src_item)) && - S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { + S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) { save_old_i_size = 1; - saved_i_size = btrfs_inode_size(path->nodes[0], - dst_item); + saved_i_size = btrfs_inode_size(dst_eb, dst_item); } } - copy_extent_buffer(path->nodes[0], eb, dst_ptr, - src_ptr, item_size); + copy_extent_buffer(dst_eb, eb, dst_ptr, src_ptr, item_size); if (save_old_i_size) { struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; - btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); + btrfs_set_inode_size(dst_eb, dst_item, saved_i_size); } /* make sure the generation is filled in */ if (key->type == BTRFS_INODE_ITEM_KEY) { struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; - if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { - btrfs_set_inode_generation(path->nodes[0], dst_item, - trans->transid); - } + if (btrfs_inode_generation(dst_eb, dst_item) == 0) + btrfs_set_inode_generation(dst_eb, dst_item, trans->transid); } no_copy: btrfs_release_path(path); @@ -613,14 +608,14 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, * simple helper to read an inode off the disk from a given root * This can only be called for subvolume roots and not for the log */ -static noinline struct inode *read_one_inode(struct btrfs_root *root, - u64 objectid) +static noinline struct btrfs_inode *read_one_inode(struct btrfs_root *root, + u64 objectid) { - struct inode *inode; + struct btrfs_inode *inode; inode = btrfs_iget_logging(objectid, root); if (IS_ERR(inode)) - inode = NULL; + return NULL; return inode; } @@ -649,7 +644,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, u64 start = key->offset; u64 nbytes = 0; struct btrfs_file_extent_item *item; - struct inode *inode = NULL; + struct btrfs_inode *inode = NULL; unsigned long size; int ret = 0; @@ -688,31 +683,23 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * file. This must be done before the btrfs_drop_extents run * so we don't try to drop this extent. 
*/ - ret = btrfs_lookup_file_extent(trans, root, path, - btrfs_ino(BTRFS_I(inode)), start, 0); + ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0); if (ret == 0 && (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC)) { - struct btrfs_file_extent_item cmp1; - struct btrfs_file_extent_item cmp2; - struct btrfs_file_extent_item *existing; - struct extent_buffer *leaf; - - leaf = path->nodes[0]; - existing = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); + struct btrfs_file_extent_item existing; + unsigned long ptr; - read_extent_buffer(eb, &cmp1, (unsigned long)item, - sizeof(cmp1)); - read_extent_buffer(leaf, &cmp2, (unsigned long)existing, - sizeof(cmp2)); + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + read_extent_buffer(path->nodes[0], &existing, ptr, sizeof(existing)); /* * we already have a pointer to this exact extent, * we don't have to do anything */ - if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { + if (memcmp_extent_buffer(eb, &existing, (unsigned long)item, + sizeof(existing)) == 0) { btrfs_release_path(path); goto out; } @@ -723,7 +710,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, drop_args.start = start; drop_args.end = extent_end; drop_args.drop_cache = true; - ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) goto out; @@ -747,8 +734,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, (unsigned long)item, sizeof(*item)); ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); - ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); offset = key->offset - btrfs_file_extent_offset(eb, item); /* @@ -873,9 +860,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sums; struct btrfs_root *csum_root; - sums = list_entry(ordered_sums.next, - struct btrfs_ordered_sum, - list); + sums = list_first_entry(&ordered_sums, + struct btrfs_ordered_sum, + list); csum_root = btrfs_csum_root(fs_info, sums->logical); if (!ret) @@ -901,16 +888,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, goto out; } - ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, - extent_end - start); + ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start); if (ret) goto out; update_inode: - btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found); + ret = btrfs_update_inode(trans, inode); out: - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -947,7 +933,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di) { struct btrfs_root *root = dir->root; - struct inode *inode; + struct btrfs_inode *inode; struct fscrypt_str name; struct extent_buffer *leaf; struct btrfs_key location; @@ -972,10 +958,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name); + ret = unlink_inode_for_log_replay(trans, dir, inode, &name); out: kfree(name.name); - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -1148,7 +1134,7 @@ again: u32 item_size; u32 cur_offset = 0; unsigned long base; - struct inode 
*victim_parent; + struct btrfs_inode *victim_parent; leaf = path->nodes[0]; @@ -1188,10 +1174,10 @@ again: btrfs_release_path(path); ret = unlink_inode_for_log_replay(trans, - BTRFS_I(victim_parent), + victim_parent, inode, &victim_name); } - iput(victim_parent); + iput(&victim_parent->vfs_inode); kfree(victim_name.name); if (ret) return ret; @@ -1325,7 +1311,7 @@ again: ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name); if (!ret) { - struct inode *dir; + struct btrfs_inode *dir; btrfs_release_path(path); dir = read_one_inode(root, parent_id); @@ -1334,10 +1320,9 @@ again: kfree(name.name); goto out; } - ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), - inode, &name); + ret = unlink_inode_for_log_replay(trans, dir, inode, &name); kfree(name.name); - iput(dir); + iput(&dir->vfs_inode); if (ret) goto out; goto again; @@ -1369,8 +1354,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - struct inode *dir = NULL; - struct inode *inode = NULL; + struct btrfs_inode *dir = NULL; + struct btrfs_inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; struct fscrypt_str name = { 0 }; @@ -1435,8 +1420,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), - btrfs_ino(BTRFS_I(inode)), ref_index, &name); + ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), + ref_index, &name); if (ret < 0) { goto out; } else if (ret == 0) { @@ -1447,8 +1432,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * overwrite any existing back reference, and we don't * want to create dangling pointers in the directory. */ - ret = __add_inode_ref(trans, root, path, log, - BTRFS_I(dir), BTRFS_I(inode), + ret = __add_inode_ref(trans, root, path, log, dir, inode, inode_objectid, parent_objectid, ref_index, &name); if (ret) { @@ -1458,12 +1442,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, } /* insert our name */ - ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - &name, 0, ref_index); + ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index); if (ret) goto out; - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, inode); if (ret) goto out; } @@ -1473,7 +1456,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, kfree(name.name); name.name = NULL; if (log_ref_ver) { - iput(dir); + iput(&dir->vfs_inode); dir = NULL; } } @@ -1486,8 +1469,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * dir index entries exist for a name but there is no inode reference * item with the same name. */ - ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot, - key); + ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key); if (ret) goto out; @@ -1496,8 +1478,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); kfree(name.name); - iput(dir); - iput(inode); + if (dir) + iput(&dir->vfs_inode); + if (inode) + iput(&inode->vfs_inode); return ret; } @@ -1611,25 +1595,25 @@ process_slot: * will free the inode. 
*/ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, - struct inode *inode) + struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_path *path; int ret; u64 nlink = 0; - u64 ino = btrfs_ino(BTRFS_I(inode)); + const u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); if (!path) return -ENOMEM; - ret = count_inode_refs(BTRFS_I(inode), path); + ret = count_inode_refs(inode, path); if (ret < 0) goto out; nlink = ret; - ret = count_inode_extrefs(BTRFS_I(inode), path); + ret = count_inode_extrefs(inode, path); if (ret < 0) goto out; @@ -1637,17 +1621,17 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, ret = 0; - if (nlink != inode->i_nlink) { - set_nlink(inode, nlink); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + if (nlink != inode->vfs_inode.i_nlink) { + set_nlink(&inode->vfs_inode, nlink); + ret = btrfs_update_inode(trans, inode); if (ret) goto out; } - if (S_ISDIR(inode->i_mode)) - BTRFS_I(inode)->index_cnt = (u64)-1; + if (S_ISDIR(inode->vfs_inode.i_mode)) + inode->index_cnt = (u64)-1; - if (inode->i_nlink == 0) { - if (S_ISDIR(inode->i_mode)) { + if (inode->vfs_inode.i_nlink == 0) { + if (S_ISDIR(inode->vfs_inode.i_mode)) { ret = replay_dir_deletes(trans, root, NULL, path, ino, 1); if (ret) @@ -1669,12 +1653,13 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, { int ret; struct btrfs_key key; - struct inode *inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = (u64)-1; while (1) { + struct btrfs_inode *inode; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) break; @@ -1703,7 +1688,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, } ret = fixup_inode_link_count(trans, inode); - iput(inode); + iput(&inode->vfs_inode); if (ret) break; @@ -1731,12 +1716,14 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, { struct btrfs_key key; int ret = 0; - struct inode *inode; + struct btrfs_inode *inode; + struct inode *vfs_inode; inode = read_one_inode(root, objectid); if (!inode) return -EIO; + vfs_inode = &inode->vfs_inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = objectid; @@ -1745,15 +1732,15 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (ret == 0) { - if (!inode->i_nlink) - set_nlink(inode, 1); + if (!vfs_inode->i_nlink) + set_nlink(vfs_inode, 1); else - inc_nlink(inode); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + inc_nlink(vfs_inode); + ret = btrfs_update_inode(trans, inode); } else if (ret == -EEXIST) { ret = 0; } - iput(inode); + iput(vfs_inode); return ret; } @@ -1769,8 +1756,8 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, const struct fscrypt_str *name, struct btrfs_key *location) { - struct inode *inode; - struct inode *dir; + struct btrfs_inode *inode; + struct btrfs_inode *dir; int ret; inode = read_one_inode(root, location->objectid); @@ -1779,17 +1766,16 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, dir = read_one_inode(root, dirid); if (!dir) { - iput(inode); + iput(&inode->vfs_inode); return -EIO; } - ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, - 1, index); + ret = btrfs_add_link(trans, dir, inode, name, 1, index); /* FIXME, put inode into FIXUP list */ - iput(inode); - iput(dir); + 
iput(&inode->vfs_inode); + iput(&dir->vfs_inode); return ret; } @@ -1851,7 +1837,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, bool index_dst_matches = false; struct btrfs_key log_key; struct btrfs_key search_key; - struct inode *dir; + struct btrfs_inode *dir; u8 log_flags; bool exists; int ret; @@ -1881,9 +1867,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = PTR_ERR(dir_dst_di); goto out; } else if (dir_dst_di) { - ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, - dir_dst_di, &log_key, - log_flags, exists); + ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di, + &log_key, log_flags, exists); if (ret < 0) goto out; dir_dst_matches = (ret == 1); @@ -1898,9 +1883,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = PTR_ERR(index_dst_di); goto out; } else if (index_dst_di) { - ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, - index_dst_di, &log_key, - log_flags, exists); + ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di, + &log_key, log_flags, exists); if (ret < 0) goto out; index_dst_matches = (ret == 1); @@ -1955,11 +1939,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, out: if (!ret && update_size) { - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2); - ret = btrfs_update_inode(trans, BTRFS_I(dir)); + btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2); + ret = btrfs_update_inode(trans, dir); } kfree(name.name); - iput(dir); + iput(&dir->vfs_inode); if (!ret && name_added) ret = 1; return ret; @@ -2116,16 +2100,16 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, struct btrfs_path *log_path, - struct inode *dir, + struct btrfs_inode *dir, struct btrfs_key *dir_key) { - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; int ret; struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; struct fscrypt_str name = { 0 }; - struct inode *inode = NULL; + struct btrfs_inode *inode = NULL; struct btrfs_key location; /* @@ -2172,9 +2156,8 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, if (ret) goto out; - inc_nlink(inode); - ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), - &name); + inc_nlink(&inode->vfs_inode); + ret = unlink_inode_for_log_replay(trans, dir, inode, &name); /* * Unlike dir item keys, dir index keys can only have one name (entry) in * them, as there are no key collisions since each key has a unique offset @@ -2184,7 +2167,8 @@ out: btrfs_release_path(path); btrfs_release_path(log_path); kfree(name.name); - iput(inode); + if (inode) + iput(&inode->vfs_inode); return ret; } @@ -2308,7 +2292,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_key dir_key; struct btrfs_key found_key; struct btrfs_path *log_path; - struct inode *dir; + struct btrfs_inode *dir; dir_key.objectid = dirid; dir_key.type = BTRFS_DIR_INDEX_KEY; @@ -2385,7 +2369,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); btrfs_free_path(log_path); - iput(dir); + iput(&dir->vfs_inode); return ret; } @@ -2479,7 +2463,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, */ if (S_ISREG(mode)) { struct btrfs_drop_extents_args drop_args = { 0 }; - struct inode *inode; + struct btrfs_inode *inode; u64 from; inode = read_one_inode(root, 
key.objectid); @@ -2487,22 +2471,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, ret = -EIO; break; } - from = ALIGN(i_size_read(inode), + from = ALIGN(i_size_read(&inode->vfs_inode), root->fs_info->sectorsize); drop_args.start = from; drop_args.end = (u64)-1; drop_args.drop_cache = true; - ret = btrfs_drop_extents(wc->trans, root, - BTRFS_I(inode), + ret = btrfs_drop_extents(wc->trans, root, inode, &drop_args); if (!ret) { - inode_sub_bytes(inode, + inode_sub_bytes(&inode->vfs_inode, drop_args.bytes_found); /* Update the inode's nbytes. */ - ret = btrfs_update_inode(wc->trans, - BTRFS_I(inode)); + ret = btrfs_update_inode(wc->trans, inode); } - iput(inode); + iput(&inode->vfs_inode); if (ret) break; } @@ -3269,8 +3251,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans, } } - extent_io_tree_release(&log->dirty_log_pages); - extent_io_tree_release(&log->log_csum_range); + btrfs_extent_io_tree_release(&log->dirty_log_pages); + btrfs_extent_io_tree_release(&log->log_csum_range); btrfs_put_root(log); } @@ -3560,8 +3542,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, struct btrfs_dir_log_item *item; key.objectid = dirid; - key.offset = first_offset; key.type = BTRFS_DIR_LOG_INDEX_KEY; + key.offset = first_offset; ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); /* * -EEXIST is fine and can happen sporadically when we are logging a @@ -4318,8 +4300,8 @@ static int log_csums(struct btrfs_trans_handle *trans, * file which happens to refer to the same extent as well. Such races * can leave checksum items in the log with overlapping ranges. */ - ret = lock_extent(&log_root->log_csum_range, sums->logical, lock_end, - &cached_state); + ret = btrfs_lock_extent(&log_root->log_csum_range, sums->logical, lock_end, + &cached_state); if (ret) return ret; /* @@ -4335,8 +4317,8 @@ static int log_csums(struct btrfs_trans_handle *trans, if (!ret) ret = btrfs_csum_file_blocks(trans, log_root, sums); - unlock_extent(&log_root->log_csum_range, sums->logical, lock_end, - &cached_state); + btrfs_unlock_extent(&log_root->log_csum_range, sums->logical, lock_end, + &cached_state); return ret; } @@ -4666,7 +4648,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, return 0; /* If we're compressed we have to save the entire range of csums. */ - if (extent_map_is_compressed(em)) { + if (btrfs_extent_map_is_compressed(em)) { csum_offset = 0; csum_len = em->disk_num_bytes; } else { @@ -4675,7 +4657,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, } /* block start is already adjusted for the file extent offset. 
*/ - block_start = extent_map_block_start(em); + block_start = btrfs_extent_map_block_start(em); csum_root = btrfs_csum_root(trans->fs_info, block_start); ret = btrfs_lookup_csums_list(csum_root, block_start + csum_offset, block_start + csum_offset + csum_len - 1, @@ -4685,9 +4667,9 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, ret = 0; while (!list_empty(&ordered_sums)) { - struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, - struct btrfs_ordered_sum, - list); + struct btrfs_ordered_sum *sums = list_first_entry(&ordered_sums, + struct btrfs_ordered_sum, + list); if (!ret) ret = log_csums(trans, inode, log_root, sums); list_del(&sums->list); @@ -4710,7 +4692,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_key key; enum btrfs_compression_type compress_type; u64 extent_offset = em->offset; - u64 block_start = extent_map_block_start(em); + u64 block_start = btrfs_extent_map_block_start(em); u64 block_len; int ret; @@ -4721,7 +4703,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); block_len = em->disk_num_bytes; - compress_type = extent_map_compression(em); + compress_type = btrfs_extent_map_compression(em); if (compress_type != BTRFS_COMPRESS_NONE) { btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start); btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); @@ -4965,7 +4947,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, list_sort(NULL, &extents, extent_cmp); process: while (!list_empty(&extents)) { - em = list_entry(extents.next, struct extent_map, list); + em = list_first_entry(&extents, struct extent_map, list); list_del_init(&em->list); @@ -4974,8 +4956,8 @@ process: * private list. 
*/ if (ret) { - clear_em_logging(inode, em); - free_extent_map(em); + btrfs_clear_em_logging(inode, em); + btrfs_free_extent_map(em); continue; } @@ -4983,8 +4965,8 @@ process: ret = log_one_extent(trans, inode, em, path, ctx); write_lock(&tree->lock); - clear_em_logging(inode, em); - free_extent_map(em); + btrfs_clear_em_logging(inode, em); + btrfs_free_extent_map(em); } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); @@ -5481,7 +5463,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, ihold(&curr_inode->vfs_inode); while (true) { - struct inode *vfs_inode; struct btrfs_key key; struct btrfs_key found_key; u64 next_index; @@ -5497,7 +5478,7 @@ again: struct extent_buffer *leaf = path->nodes[0]; struct btrfs_dir_item *di; struct btrfs_key di_key; - struct inode *di_inode; + struct btrfs_inode *di_inode; int log_mode = LOG_INODE_EXISTS; int type; @@ -5524,17 +5505,16 @@ again: goto out; } - if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + if (!need_log_inode(trans, di_inode)) { + btrfs_add_delayed_iput(di_inode); break; } ctx->log_new_dentries = false; if (type == BTRFS_FT_DIR) log_mode = LOG_INODE_ALL; - ret = btrfs_log_inode(trans, BTRFS_I(di_inode), - log_mode, ctx); - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + ret = btrfs_log_inode(trans, di_inode, log_mode, ctx); + btrfs_add_delayed_iput(di_inode); if (ret) goto out; if (ctx->log_new_dentries) { @@ -5576,14 +5556,13 @@ again: kfree(dir_elem); btrfs_add_delayed_iput(curr_inode); - curr_inode = NULL; - vfs_inode = btrfs_iget_logging(ino, root); - if (IS_ERR(vfs_inode)) { - ret = PTR_ERR(vfs_inode); + curr_inode = btrfs_iget_logging(ino, root); + if (IS_ERR(curr_inode)) { + ret = PTR_ERR(curr_inode); + curr_inode = NULL; break; } - curr_inode = BTRFS_I(vfs_inode); } out: btrfs_free_path(path); @@ -5661,7 +5640,7 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_ino_list *ino_elem; - struct inode *inode; + struct btrfs_inode *inode; /* * It's rare to have a lot of conflicting inodes, in practice it is not @@ -5752,12 +5731,12 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, * inode in LOG_INODE_EXISTS mode and rename operations update the log, * so that the log ends up with the new name and without the old name. */ - if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (!need_log_inode(trans, inode)) { + btrfs_add_delayed_iput(inode); return 0; } - btrfs_add_delayed_iput(BTRFS_I(inode)); + btrfs_add_delayed_iput(inode); ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); if (!ino_elem) @@ -5793,7 +5772,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, */ while (!list_empty(&ctx->conflict_inodes)) { struct btrfs_ino_list *curr; - struct inode *inode; + struct btrfs_inode *inode; u64 ino; u64 parent; @@ -5829,9 +5808,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * dir index key range logged for the directory. So we * must make sure the deletion is recorded. */ - ret = btrfs_log_inode(trans, BTRFS_I(inode), - LOG_INODE_ALL, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx); + btrfs_add_delayed_iput(inode); if (ret) break; continue; @@ -5847,8 +5825,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * it again because if some other task logged the inode after * that, we can avoid doing it again. 
*/ - if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (!need_log_inode(trans, inode)) { + btrfs_add_delayed_iput(inode); continue; } @@ -5859,8 +5837,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * well because during a rename we pin the log and update the * log with the new name before we unpin it. */ - ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx); + btrfs_add_delayed_iput(inode); if (ret) break; } @@ -6351,7 +6329,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, list_for_each_entry(item, delayed_ins_list, log_list) { struct btrfs_dir_item *dir_item; - struct inode *di_inode; + struct btrfs_inode *di_inode; struct btrfs_key key; int log_mode = LOG_INODE_EXISTS; @@ -6367,8 +6345,8 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, break; } - if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + if (!need_log_inode(trans, di_inode)) { + btrfs_add_delayed_iput(di_inode); continue; } @@ -6376,12 +6354,12 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, log_mode = LOG_INODE_ALL; ctx->log_new_dentries = false; - ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); + ret = btrfs_log_inode(trans, di_inode, log_mode, ctx); if (!ret && ctx->log_new_dentries) - ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx); + ret = log_new_dir_dentries(trans, di_inode, ctx); - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + btrfs_add_delayed_iput(di_inode); if (ret) break; @@ -6605,6 +6583,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, btrfs_log_get_delayed_items(inode, &delayed_ins_list, &delayed_del_list); + /* + * If we are fsyncing a file with 0 hard links, then commit the delayed + * inode because the last inode ref (or extref) item may still be in the + * subvolume tree and if we log it the file will still exist after a log + * replay. So commit the delayed inode to delete that last ref and we + * skip logging it. 
+ */ + if (inode->vfs_inode.i_nlink == 0) { + ret = btrfs_commit_inode_delayed_inode(inode); + if (ret) + goto out_unlock; + } + ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key, path, dst_path, logged_isize, inode_only, ctx, @@ -6789,7 +6780,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr_offset(leaf, slot); while (cur_offset < item_size) { struct btrfs_key inode_key; - struct inode *dir_inode; + struct btrfs_inode *dir_inode; inode_key.type = BTRFS_INODE_ITEM_KEY; inode_key.offset = 0; @@ -6838,18 +6829,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, goto out; } - if (!need_log_inode(trans, BTRFS_I(dir_inode))) { - btrfs_add_delayed_iput(BTRFS_I(dir_inode)); + if (!need_log_inode(trans, dir_inode)) { + btrfs_add_delayed_iput(dir_inode); continue; } ctx->log_new_dentries = false; - ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), - LOG_INODE_ALL, ctx); + ret = btrfs_log_inode(trans, dir_inode, LOG_INODE_ALL, ctx); if (!ret && ctx->log_new_dentries) - ret = log_new_dir_dentries(trans, - BTRFS_I(dir_inode), ctx); - btrfs_add_delayed_iput(BTRFS_I(dir_inode)); + ret = log_new_dir_dentries(trans, dir_inode, ctx); + btrfs_add_delayed_iput(dir_inode); if (ret) goto out; } @@ -6874,7 +6863,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; int slot; struct btrfs_key search_key; - struct inode *inode; + struct btrfs_inode *inode; u64 ino; int ret = 0; @@ -6889,11 +6878,10 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) return PTR_ERR(inode); - if (BTRFS_I(inode)->generation >= trans->transid && - need_log_inode(trans, BTRFS_I(inode))) - ret = btrfs_log_inode(trans, BTRFS_I(inode), - LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (inode->generation >= trans->transid && + need_log_inode(trans, inode)) + ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx); + btrfs_add_delayed_iput(inode); if (ret) return ret; @@ -7061,42 +7049,29 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; - bool log_dentries = false; + bool log_dentries; - if (btrfs_test_opt(fs_info, NOTREELOG)) { - ret = BTRFS_LOG_FORCE_COMMIT; - goto end_no_trans; - } + if (btrfs_test_opt(fs_info, NOTREELOG)) + return BTRFS_LOG_FORCE_COMMIT; - if (btrfs_root_refs(&root->root_item) == 0) { - ret = BTRFS_LOG_FORCE_COMMIT; - goto end_no_trans; - } + if (btrfs_root_refs(&root->root_item) == 0) + return BTRFS_LOG_FORCE_COMMIT; /* * If we're logging an inode from a subvolume created in the current * transaction we must force a commit since the root is not persisted. */ - if (btrfs_root_generation(&root->root_item) == trans->transid) { - ret = BTRFS_LOG_FORCE_COMMIT; - goto end_no_trans; - } + if (btrfs_root_generation(&root->root_item) == trans->transid) + return BTRFS_LOG_FORCE_COMMIT; - /* - * Skip already logged inodes or inodes corresponding to tmpfiles - * (since logging them is pointless, a link count of 0 means they - * will never be accessible). - */ - if ((btrfs_inode_in_log(inode, trans->transid) && - list_empty(&ctx->ordered_extents)) || - inode->vfs_inode.i_nlink == 0) { - ret = BTRFS_NO_LOG_SYNC; - goto end_no_trans; - } + /* Skip already logged inodes and without new extents. 
*/ + if (btrfs_inode_in_log(inode, trans->transid) && + list_empty(&ctx->ordered_extents)) + return BTRFS_NO_LOG_SYNC; ret = start_log_trans(trans, root, ctx); if (ret) - goto end_no_trans; + return ret; ret = btrfs_log_inode(trans, inode, inode_only, ctx); if (ret) @@ -7115,8 +7090,11 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } - if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries) - log_dentries = true; + /* + * Track if we need to log dentries because ctx->log_new_dentries can + * be modified in the call chains below. + */ + log_dentries = ctx->log_new_dentries; /* * On unlink we must make sure all our current and old parent directory @@ -7171,8 +7149,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (log_dentries) ret = log_new_dir_dentries(trans, inode, ctx); - else - ret = 0; end_trans: if (ret < 0) { btrfs_set_log_full_commit(trans); @@ -7182,7 +7158,7 @@ end_trans: if (ret) btrfs_remove_log_ctx(root, ctx); btrfs_end_log_trans(root); -end_no_trans: + return ret; } @@ -7247,8 +7223,8 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) again: key.objectid = BTRFS_TREE_LOG_OBJECTID; - key.offset = (u64)-1; key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; while (1) { ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index e97ad824ae16..b7a96a005487 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -485,7 +485,7 @@ static int rollback_verity(struct btrfs_inode *inode) goto out; } inode->ro_flags &= ~BTRFS_INODE_RO_VERITY; - btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + btrfs_sync_inode_flags_to_i_flags(inode); ret = btrfs_update_inode(trans, inode); if (ret) { btrfs_abort_transaction(trans, ret); @@ -552,7 +552,7 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc, goto out; } inode->ro_flags |= BTRFS_INODE_RO_VERITY; - btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + btrfs_sync_inode_flags_to_i_flags(inode); ret = btrfs_update_inode(trans, inode); if (ret) goto end_trans; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0a0776489055..89835071cfea 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -404,7 +404,7 @@ static void btrfs_free_device(struct btrfs_device *device) { WARN_ON(!list_empty(&device->post_commit_list)); rcu_string_free(device->name); - extent_io_tree_release(&device->alloc_state); + btrfs_extent_io_tree_release(&device->alloc_state); btrfs_destroy_dev_zone_info(device); kfree(device); } @@ -415,8 +415,8 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) WARN_ON(fs_devices->opened); while (!list_empty(&fs_devices->devices)) { - device = list_entry(fs_devices->devices.next, - struct btrfs_device, dev_list); + device = list_first_entry(&fs_devices->devices, + struct btrfs_device, dev_list); list_del(&device->dev_list); btrfs_free_device(device); } @@ -428,8 +428,8 @@ void __exit btrfs_cleanup_fs_uuids(void) struct btrfs_fs_devices *fs_devices; while (!list_empty(&fs_uuids)) { - fs_devices = list_entry(fs_uuids.next, - struct btrfs_fs_devices, fs_list); + fs_devices = list_first_entry(&fs_uuids, struct btrfs_fs_devices, + fs_list); list_del(&fs_devices->fs_list); free_fs_devices(fs_devices); } @@ -493,7 +493,7 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, } } invalidate_bdev(bdev); - *disk_super = btrfs_read_dev_super(bdev); + *disk_super = btrfs_read_disk_super(bdev, 0, false); if 
(IS_ERR(*disk_super)) { ret = PTR_ERR(*disk_super); fput(*bdev_file); @@ -733,82 +733,6 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb) return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; } -/* - * We can have very weird soft links passed in. - * One example is "/proc/self/fd/<fd>", which can be a soft link to - * a block device. - * - * But it's never a good idea to use those weird names. - * Here we check if the path (not following symlinks) is a good one inside - * "/dev/". - */ -static bool is_good_dev_path(const char *dev_path) -{ - struct path path = { .mnt = NULL, .dentry = NULL }; - char *path_buf = NULL; - char *resolved_path; - bool is_good = false; - int ret; - - if (!dev_path) - goto out; - - path_buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!path_buf) - goto out; - - /* - * Do not follow soft link, just check if the original path is inside - * "/dev/". - */ - ret = kern_path(dev_path, 0, &path); - if (ret) - goto out; - resolved_path = d_path(&path, path_buf, PATH_MAX); - if (IS_ERR(resolved_path)) - goto out; - if (strncmp(resolved_path, "/dev/", strlen("/dev/"))) - goto out; - is_good = true; -out: - kfree(path_buf); - path_put(&path); - return is_good; -} - -static int get_canonical_dev_path(const char *dev_path, char *canonical) -{ - struct path path = { .mnt = NULL, .dentry = NULL }; - char *path_buf = NULL; - char *resolved_path; - int ret; - - if (!dev_path) { - ret = -EINVAL; - goto out; - } - - path_buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!path_buf) { - ret = -ENOMEM; - goto out; - } - - ret = kern_path(dev_path, LOOKUP_FOLLOW, &path); - if (ret) - goto out; - resolved_path = d_path(&path, path_buf, PATH_MAX); - if (IS_ERR(resolved_path)) { - ret = PTR_ERR(resolved_path); - goto out; - } - ret = strscpy(canonical, resolved_path, PATH_MAX); -out: - kfree(path_buf); - path_put(&path); - return ret; -} - static bool is_same_device(struct btrfs_device *device, const char *new_path) { struct path old = { .mnt = NULL, .dentry = NULL }; @@ -1225,7 +1149,7 @@ static void btrfs_close_one_device(struct btrfs_device *device) device->fs_info = NULL; atomic_set(&device->dev_stats_ccnt, 0); - extent_io_tree_release(&device->alloc_state); + btrfs_extent_io_tree_release(&device->alloc_state); /* * Reset the flush error record. 
We might have a transient flush error @@ -1401,48 +1325,58 @@ void btrfs_release_disk_super(struct btrfs_super_block *super) put_page(page); } -static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, - u64 bytenr, u64 bytenr_orig) +struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, + int copy_num, bool drop_cache) { - struct btrfs_super_block *disk_super; + struct btrfs_super_block *super; struct page *page; - void *p; - pgoff_t index; + u64 bytenr, bytenr_orig; + struct address_space *mapping = bdev->bd_mapping; + int ret; - /* make sure our super fits in the device */ - if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev)) - return ERR_PTR(-EINVAL); + bytenr_orig = btrfs_sb_offset(copy_num); + ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr); + if (ret < 0) { + if (ret == -ENOENT) + ret = -EINVAL; + return ERR_PTR(ret); + } - /* make sure our super fits in the page */ - if (sizeof(*disk_super) > PAGE_SIZE) + if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev)) return ERR_PTR(-EINVAL); - /* make sure our super doesn't straddle pages on disk */ - index = bytenr >> PAGE_SHIFT; - if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index) - return ERR_PTR(-EINVAL); + if (drop_cache) { + /* This should only be called with the primary sb. */ + ASSERT(copy_num == 0); - /* pull in the page with our super */ - page = read_cache_page_gfp(bdev->bd_mapping, index, GFP_KERNEL); + /* + * Drop the page of the primary superblock, so later read will + * always read from the device. + */ + invalidate_inode_pages2_range(mapping, bytenr >> PAGE_SHIFT, + (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT); + } + page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); if (IS_ERR(page)) return ERR_CAST(page); - p = page_address(page); - - /* align our pointer to the offset of the super block */ - disk_super = p + offset_in_page(bytenr); - - if (btrfs_super_bytenr(disk_super) != bytenr_orig || - btrfs_super_magic(disk_super) != BTRFS_MAGIC) { - btrfs_release_disk_super(p); + super = page_address(page); + if (btrfs_super_magic(super) != BTRFS_MAGIC || + btrfs_super_bytenr(super) != bytenr_orig) { + btrfs_release_disk_super(super); return ERR_PTR(-EINVAL); } - if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1]) - disk_super->label[BTRFS_LABEL_SIZE - 1] = 0; + /* + * Make sure the last byte of the label is properly NUL terminated. We use + * '%s' to print the label, if not properly NUL terminated we can access + * beyond the label.
+ */ + if (super->label[0] && super->label[BTRFS_LABEL_SIZE - 1]) + super->label[BTRFS_LABEL_SIZE - 1] = 0; - return disk_super; + return super; } int btrfs_forget_devices(dev_t devt) @@ -1513,23 +1447,10 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, bool new_device_added = false; struct btrfs_device *device = NULL; struct file *bdev_file; - char *canonical_path = NULL; - u64 bytenr; dev_t devt; - int ret; lockdep_assert_held(&uuid_mutex); - if (!is_good_dev_path(path)) { - canonical_path = kmalloc(PATH_MAX, GFP_KERNEL); - if (canonical_path) { - ret = get_canonical_dev_path(path, canonical_path); - if (ret < 0) { - kfree(canonical_path); - canonical_path = NULL; - } - } - } /* * Avoid an exclusive open here, as the systemd-udev may initiate the * device scan which may race with the user's mount or mkfs command, @@ -1544,20 +1465,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, if (IS_ERR(bdev_file)) return ERR_CAST(bdev_file); - /* - * We would like to check all the super blocks, but doing so would - * allow a mount to succeed after a mkfs from a different filesystem. - * Currently, recovery from a bad primary btrfs superblock is done - * using the userspace command 'btrfs check --super'. - */ - ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr); - if (ret) { - device = ERR_PTR(ret); - goto error_bdev_put; - } - - disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr, - btrfs_sb_offset(0)); + disk_super = btrfs_read_disk_super(file_bdev(bdev_file), 0, false); if (IS_ERR(disk_super)) { device = ERR_CAST(disk_super); goto error_bdev_put; @@ -1574,8 +1482,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, goto free_disk_super; } - device = device_list_add(canonical_path ? : path, disk_super, - &new_device_added); + device = device_list_add(path, disk_super, &new_device_added); if (!IS_ERR(device) && new_device_added) btrfs_free_stale_devices(device->devt, device); @@ -1584,7 +1491,6 @@ free_disk_super: error_bdev_put: fput(bdev_file); - kfree(canonical_path); return device; } @@ -1600,9 +1506,9 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, lockdep_assert_held(&device->fs_info->chunk_mutex); - if (find_first_extent_bit(&device->alloc_state, *start, - &physical_start, &physical_end, - CHUNK_ALLOCATED, NULL)) { + if (btrfs_find_first_extent_bit(&device->alloc_state, *start, + &physical_start, &physical_end, + CHUNK_ALLOCATED, NULL)) { if (in_range(physical_start, *start, len) || in_range(*start, physical_start, @@ -1617,6 +1523,9 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, static u64 dev_extent_search_start(struct btrfs_device *device) { switch (device->fs_devices->chunk_alloc_policy) { + default: + btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy); + fallthrough; case BTRFS_CHUNK_ALLOC_REGULAR: return BTRFS_DEVICE_RANGE_RESERVED; case BTRFS_CHUNK_ALLOC_ZONED: @@ -1626,8 +1535,6 @@ static u64 dev_extent_search_start(struct btrfs_device *device) * for superblock logging. 
*/ return 0; - default: - BUG(); } } @@ -1640,7 +1547,8 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device, int ret; bool changed = false; - ASSERT(IS_ALIGNED(*hole_start, zone_size)); + ASSERT(IS_ALIGNED(*hole_start, zone_size), + "hole_start=%llu zone_size=%llu", *hole_start, zone_size); while (*hole_size > 0) { pos = btrfs_find_allocatable_zones(device, *hole_start, @@ -1706,6 +1614,9 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, } switch (device->fs_devices->chunk_alloc_policy) { + default: + btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy); + fallthrough; case BTRFS_CHUNK_ALLOC_REGULAR: /* No extra check */ break; @@ -1720,8 +1631,6 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, continue; } break; - default: - BUG(); } break; @@ -1798,8 +1707,8 @@ again: path->skip_locking = 1; key.objectid = device->devid; - key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = search_start; ret = btrfs_search_backwards(root, &key, path); if (ret < 0) @@ -1891,7 +1800,9 @@ next: else ret = 0; - ASSERT(max_hole_start + max_hole_size <= search_end); + ASSERT(max_hole_start + max_hole_size <= search_end, + "max_hole_start=%llu max_hole_size=%llu search_end=%llu", + max_hole_start, max_hole_size, search_end); out: btrfs_free_path(path); *start = max_hole_start; @@ -1918,8 +1829,8 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, return -ENOMEM; key.objectid = device->devid; - key.offset = start; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = start; again: ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { @@ -2204,7 +2115,7 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) down_read(&fs_info->dev_replace.rwsem); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { - ASSERT(num_devices > 1); + ASSERT(num_devices > 1, "num_devices=%llu", num_devices); num_devices--; } up_read(&fs_info->dev_replace.rwsem); @@ -2220,7 +2131,7 @@ static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info, const u64 bytenr = btrfs_sb_offset(copy_num); int ret; - disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr); + disk_super = btrfs_read_disk_super(bdev, copy_num, false); if (IS_ERR(disk_super)) return; @@ -2408,7 +2319,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, */ if (cur_devices->num_devices == 0) { list_del_init(&cur_devices->seed_list); - ASSERT(cur_devices->opened == 1); + ASSERT(cur_devices->opened == 1, "opened=%d", cur_devices->opened); cur_devices->opened--; free_fs_devices(cur_devices); } @@ -2721,8 +2632,8 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) return -ENOMEM; key.objectid = BTRFS_DEV_ITEMS_OBJECTID; - key.offset = 0; key.type = BTRFS_DEV_ITEM_KEY; + key.offset = 0; while (1) { btrfs_reserve_chunk_metadata(trans, false); @@ -3119,8 +3030,8 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) return -ENOMEM; key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.offset = chunk_offset; key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = chunk_offset; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) @@ -3338,7 +3249,8 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) * user having built with ASSERT enabled, so if ASSERT doesn't * do anything we still error out. 
*/ - ASSERT(0); + DEBUG_WARN("error %ld reading chunk map at offset %llu", + PTR_ERR(map), chunk_offset); return PTR_ERR(map); } @@ -3419,8 +3331,16 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) if (ret == -ENOSPC) { const u64 sys_flags = btrfs_system_alloc_profile(fs_info); struct btrfs_block_group *sys_bg; + struct btrfs_space_info *space_info; - sys_bg = btrfs_create_chunk(trans, sys_flags); + space_info = btrfs_find_space_info(fs_info, sys_flags); + if (!space_info) { + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } + + sys_bg = btrfs_create_chunk(trans, space_info, sys_flags); if (IS_ERR(sys_bg)) { ret = PTR_ERR(sys_bg); btrfs_abort_transaction(trans, ret); @@ -3577,8 +3497,8 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) again: key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = (u64)-1; while (1) { mutex_lock(&fs_info->reclaim_bgs_lock); @@ -3880,26 +3800,25 @@ static void reset_balance_state(struct btrfs_fs_info *fs_info) * Balance filters. Return 1 if chunk should be filtered out * (should not be balanced). */ -static int chunk_profiles_filter(u64 chunk_type, - struct btrfs_balance_args *bargs) +static bool chunk_profiles_filter(u64 chunk_type, struct btrfs_balance_args *bargs) { chunk_type = chunk_to_extended(chunk_type) & BTRFS_EXTENDED_PROFILE_MASK; if (bargs->profiles & chunk_type) - return 0; + return false; - return 1; + return true; } -static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, - struct btrfs_balance_args *bargs) +static bool chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, + struct btrfs_balance_args *bargs) { struct btrfs_block_group *cache; u64 chunk_used; u64 user_thresh_min; u64 user_thresh_max; - int ret = 1; + bool ret = true; cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = cache->used; @@ -3917,18 +3836,18 @@ static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_off user_thresh_max = mult_perc(cache->length, bargs->usage_max); if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) - ret = 0; + ret = false; btrfs_put_block_group(cache); return ret; } -static int chunk_usage_filter(struct btrfs_fs_info *fs_info, - u64 chunk_offset, struct btrfs_balance_args *bargs) +static bool chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, + struct btrfs_balance_args *bargs) { struct btrfs_block_group *cache; u64 chunk_used, user_thresh; - int ret = 1; + bool ret = true; cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = cache->used; @@ -3941,15 +3860,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, user_thresh = mult_perc(cache->length, bargs->usage); if (chunk_used < user_thresh) - ret = 0; + ret = false; btrfs_put_block_group(cache); return ret; } -static int chunk_devid_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - struct btrfs_balance_args *bargs) +static bool chunk_devid_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) { struct btrfs_stripe *stripe; int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); @@ -3958,10 +3876,10 @@ static int chunk_devid_filter(struct extent_buffer *leaf, for (i = 0; i < num_stripes; i++) { stripe = btrfs_stripe_nr(chunk, i); if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) - return 0; + return false; } - return 1; + return true; } static u64
calc_data_stripes(u64 type, int num_stripes) @@ -3974,9 +3892,8 @@ static u64 calc_data_stripes(u64 type, int num_stripes) } /* [pstart, pend) */ -static int chunk_drange_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - struct btrfs_balance_args *bargs) +static bool chunk_drange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) { struct btrfs_stripe *stripe; int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); @@ -3987,7 +3904,7 @@ static int chunk_drange_filter(struct extent_buffer *leaf, int i; if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) - return 0; + return false; type = btrfs_chunk_type(leaf, chunk); factor = calc_data_stripes(type, num_stripes); @@ -4003,56 +3920,53 @@ static int chunk_drange_filter(struct extent_buffer *leaf, if (stripe_offset < bargs->pend && stripe_offset + stripe_length > bargs->pstart) - return 0; + return false; } - return 1; + return true; } /* [vstart, vend) */ -static int chunk_vrange_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - u64 chunk_offset, - struct btrfs_balance_args *bargs) +static bool chunk_vrange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, + u64 chunk_offset, struct btrfs_balance_args *bargs) { if (chunk_offset < bargs->vend && chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) /* at least part of the chunk is inside this vrange */ - return 0; + return false; - return 1; + return true; } -static int chunk_stripes_range_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - struct btrfs_balance_args *bargs) +static bool chunk_stripes_range_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) { int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); if (bargs->stripes_min <= num_stripes && num_stripes <= bargs->stripes_max) - return 0; + return false; - return 1; + return true; } -static int chunk_soft_convert_filter(u64 chunk_type, - struct btrfs_balance_args *bargs) +static bool chunk_soft_convert_filter(u64 chunk_type, struct btrfs_balance_args *bargs) { if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) - return 0; + return false; chunk_type = chunk_to_extended(chunk_type) & BTRFS_EXTENDED_PROFILE_MASK; if (bargs->target == chunk_type) - return 1; + return true; - return 0; + return false; } -static int should_balance_chunk(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, u64 chunk_offset) +static bool should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk *chunk, + u64 chunk_offset) { struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_balance_control *bctl = fs_info->balance_ctl; @@ -4062,7 +3976,7 @@ static int should_balance_chunk(struct extent_buffer *leaf, /* type filter */ if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { - return 0; + return false; } if (chunk_type & BTRFS_BLOCK_GROUP_DATA) @@ -4075,46 +3989,46 @@ static int should_balance_chunk(struct extent_buffer *leaf, /* profiles filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && chunk_profiles_filter(chunk_type, bargs)) { - return 0; + return false; } /* usage filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && chunk_usage_filter(fs_info, chunk_offset, bargs)) { - return 0; + return false; } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { - return 0; + return false; } /* devid filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 
chunk_devid_filter(leaf, chunk, bargs)) { - return 0; + return false; } /* drange filter, makes sense only with devid filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && chunk_drange_filter(leaf, chunk, bargs)) { - return 0; + return false; } /* vrange filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { - return 0; + return false; } /* stripes filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && chunk_stripes_range_filter(leaf, chunk, bargs)) { - return 0; + return false; } /* soft profile changing mode */ if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && chunk_soft_convert_filter(chunk_type, bargs)) { - return 0; + return false; } /* @@ -4122,7 +4036,7 @@ static int should_balance_chunk(struct extent_buffer *leaf, */ if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { if (bargs->limit == 0) - return 0; + return false; else bargs->limit--; } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { @@ -4132,12 +4046,12 @@ static int should_balance_chunk(struct extent_buffer *leaf, * about the count of all chunks that satisfy the filters. */ if (bargs->limit_max == 0) - return 0; + return false; else bargs->limit_max--; } - return 1; + return true; } static int __btrfs_balance(struct btrfs_fs_info *fs_info) @@ -4184,8 +4098,8 @@ again: bctl->sys.limit = limit_sys; } key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = (u64)-1; while (1) { if ((!counting && atomic_read(&fs_info->balance_pause_req)) || @@ -4752,7 +4666,8 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) } spin_lock(&fs_info->super_lock); - ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED, + "exclusive_operation=%d", fs_info->exclusive_operation); fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; spin_unlock(&fs_info->super_lock); /* @@ -5001,8 +4916,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) again: key.objectid = device->devid; - key.offset = (u64)-1; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = (u64)-1; do { mutex_lock(&fs_info->reclaim_bgs_lock); @@ -5088,8 +5003,8 @@ again: mutex_lock(&fs_info->chunk_mutex); /* Clear all state bits beyond the shrunk device size */ - clear_extent_bits(&device->alloc_state, new_size, (u64)-1, - CHUNK_STATE_MASK); + btrfs_clear_extent_bits(&device->alloc_state, new_size, (u64)-1, + CHUNK_STATE_MASK); btrfs_device_set_disk_total_bytes(device, new_size); if (list_empty(&device->post_commit_list)) @@ -5216,6 +5131,8 @@ struct alloc_chunk_ctl { u64 stripe_size; u64 chunk_size; int ndevs; + /* Space_info the block group is going to belong. 
*/ + struct btrfs_space_info *space_info; }; static void init_alloc_chunk_ctl_policy_regular( @@ -5289,14 +5206,15 @@ static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, ctl->ndevs = 0; switch (fs_devices->chunk_alloc_policy) { + default: + btrfs_warn_unknown_chunk_allocation(fs_devices->chunk_alloc_policy); + fallthrough; case BTRFS_CHUNK_ALLOC_REGULAR: init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); break; case BTRFS_CHUNK_ALLOC_ZONED: init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); break; - default: - BUG(); } } @@ -5435,7 +5353,9 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, * It should hold because: * dev_extent_min == dev_extent_want == zone_size * dev_stripes */ - ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min); + ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min, + "ndevs=%d max_avail=%llu dev_extent_min=%llu", ctl->ndevs, + devices_info[ctl->ndevs - 1].max_avail, ctl->dev_extent_min); ctl->stripe_size = zone_size; ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; @@ -5448,7 +5368,9 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, ctl->dev_stripes); ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; - ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size); + ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size, + "stripe_size=%llu data_stripes=%d max_chunk_size=%llu", + ctl->stripe_size, data_stripes, ctl->max_chunk_size); } ctl->chunk_size = ctl->stripe_size * data_stripes; @@ -5481,12 +5403,13 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, ctl->ndevs = min(ctl->ndevs, ctl->devs_max); switch (fs_devices->chunk_alloc_policy) { + default: + btrfs_warn_unknown_chunk_allocation(fs_devices->chunk_alloc_policy); + fallthrough; case BTRFS_CHUNK_ALLOC_REGULAR: return decide_stripe_size_regular(ctl, devices_info); case BTRFS_CHUNK_ALLOC_ZONED: return decide_stripe_size_zoned(ctl, devices_info); - default: - BUG(); } } @@ -5496,9 +5419,9 @@ static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int struct btrfs_io_stripe *stripe = &map->stripes[i]; struct btrfs_device *device = stripe->dev; - set_extent_bit(&device->alloc_state, stripe->physical, - stripe->physical + map->stripe_size - 1, - bits | EXTENT_NOWAIT, NULL); + btrfs_set_extent_bit(&device->alloc_state, stripe->physical, + stripe->physical + map->stripe_size - 1, + bits | EXTENT_NOWAIT, NULL); } } @@ -5508,10 +5431,9 @@ static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned in struct btrfs_io_stripe *stripe = &map->stripes[i]; struct btrfs_device *device = stripe->dev; - __clear_extent_bit(&device->alloc_state, stripe->physical, - stripe->physical + map->stripe_size - 1, - bits | EXTENT_NOWAIT, - NULL, NULL); + btrfs_clear_extent_bits(&device->alloc_state, stripe->physical, + stripe->physical + map->stripe_size - 1, + bits | EXTENT_NOWAIT); } } @@ -5618,7 +5540,8 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, return ERR_PTR(ret); } - block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size); + block_group = btrfs_make_block_group(trans, ctl->space_info, type, start, + ctl->chunk_size); if (IS_ERR(block_group)) { btrfs_remove_chunk_map(info, map); return block_group; @@ -5644,7 +5567,8 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, } struct btrfs_block_group *btrfs_create_chunk(struct 
btrfs_trans_handle *trans, - u64 type) + struct btrfs_space_info *space_info, + u64 type) { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_fs_devices *fs_devices = info->fs_devices; @@ -5656,7 +5580,7 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, lockdep_assert_held(&info->chunk_mutex); if (!alloc_profile_is_valid(type, 0)) { - ASSERT(0); + DEBUG_WARN("invalid alloc profile for type %llu", type); return ERR_PTR(-EINVAL); } @@ -5668,12 +5592,13 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { btrfs_err(info, "invalid chunk type 0x%llx requested", type); - ASSERT(0); + DEBUG_WARN(); return ERR_PTR(-EINVAL); } ctl.start = find_next_chunk(info); ctl.type = type; + ctl.space_info = space_info; init_alloc_chunk_ctl(fs_devices, &ctl); devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), @@ -5817,7 +5742,9 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; u64 alloc_profile; struct btrfs_block_group *meta_bg; + struct btrfs_space_info *meta_space_info; struct btrfs_block_group *sys_bg; + struct btrfs_space_info *sys_space_info; /* * When adding a new device for sprouting, the seed device is read-only @@ -5841,12 +5768,22 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) */ alloc_profile = btrfs_metadata_alloc_profile(fs_info); - meta_bg = btrfs_create_chunk(trans, alloc_profile); + meta_space_info = btrfs_find_space_info(fs_info, alloc_profile); + if (!meta_space_info) { + DEBUG_WARN(); + return -EINVAL; + } + meta_bg = btrfs_create_chunk(trans, meta_space_info, alloc_profile); if (IS_ERR(meta_bg)) return PTR_ERR(meta_bg); alloc_profile = btrfs_system_alloc_profile(fs_info); - sys_bg = btrfs_create_chunk(trans, alloc_profile); + sys_space_info = btrfs_find_space_info(fs_info, alloc_profile); + if (!sys_space_info) { + DEBUG_WARN(); + return -EINVAL; + } + sys_bg = btrfs_create_chunk(trans, sys_space_info, alloc_profile); if (IS_ERR(sys_bg)) return PTR_ERR(sys_bg); @@ -6046,7 +5983,7 @@ static int btrfs_read_rr(const struct btrfs_chunk_map *map, int first, int num_s static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, int first, - int dev_replace_is_ongoing) + bool dev_replace_is_ongoing) { const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy); int i; @@ -6055,8 +5992,8 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, int tolerance; struct btrfs_device *srcdev; - ASSERT((map->type & - (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); + ASSERT((map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)), + "type=%llu", map->type); if (map->type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = map->sub_stripes; @@ -6357,7 +6294,7 @@ static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc, } /* We can only have at most 2 extra nr_stripes (for DUP). */ - ASSERT(nr_extra_stripes <= 2); + ASSERT(nr_extra_stripes <= 2, "nr_extra_stripes=%d", nr_extra_stripes); /* * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for * replace. @@ -6368,7 +6305,8 @@ static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc, struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1]; /* Only DUP can have two extra stripes. 
*/ - ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP); + ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP, + "map_type=%llu", bioc->map_type); /* * Swap the last stripe stripes and reduce @nr_extra_stripes. @@ -6395,7 +6333,8 @@ static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset, */ io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK; io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; - ASSERT(io_geom->stripe_offset < U32_MAX); + ASSERT(io_geom->stripe_offset < U32_MAX, + "stripe_offset=%llu", io_geom->stripe_offset); if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { unsigned long full_stripe_len = @@ -6413,8 +6352,12 @@ static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset, io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset( rounddown(io_geom->stripe_nr, nr_data_stripes(map))); - ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset); - ASSERT(io_geom->raid56_full_stripe_start <= offset); + ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset, + "raid56_full_stripe_start=%llu full_stripe_len=%lu offset=%llu", + io_geom->raid56_full_stripe_start, full_stripe_len, offset); + ASSERT(io_geom->raid56_full_stripe_start <= offset, + "raid56_full_stripe_start=%llu offset=%llu", + io_geom->raid56_full_stripe_start, offset); /* * For writes to RAID56, allow to write a full stripe set, but * no straddling of stripe sets. @@ -6580,7 +6523,7 @@ static void map_blocks_raid56_read(struct btrfs_chunk_map *map, { int data_stripes = nr_data_stripes(map); - ASSERT(io_geom->mirror_num <= 1); + ASSERT(io_geom->mirror_num <= 1, "mirror_num=%d", io_geom->mirror_num); /* Just grab the data stripe directly. */ io_geom->stripe_index = io_geom->stripe_nr % data_stripes; io_geom->stripe_nr /= data_stripes; @@ -6648,7 +6591,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int num_copies; struct btrfs_io_context *bioc = NULL; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - int dev_replace_is_ongoing = 0; + bool dev_replace_is_ongoing = false; u16 num_alloc_stripes; u64 max_len; @@ -6953,7 +6896,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, atomic_set(&dev->dev_stats_ccnt, 0); btrfs_device_data_ordered_init(dev); - extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE); + btrfs_extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE); if (devid) tmp = *devid; @@ -7155,6 +7098,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, btrfs_err(fs_info, "failed to add chunk map, start=%llu len=%llu: %d", map->start, map->chunk_len, ret); + btrfs_free_chunk_map(map); } return ret; @@ -7200,8 +7144,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, fs_devices = find_fsid(fsid, NULL); if (!fs_devices) { - if (!btrfs_test_opt(fs_info, DEGRADED)) + if (!btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_err(fs_info, + "failed to find fsid %pU when attempting to open seed devices", + fsid); return ERR_PTR(-ENOENT); + } fs_devices = alloc_fs_devices(fsid); if (IS_ERR(fs_devices)) @@ -7534,8 +7482,8 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 
*/ key.objectid = BTRFS_DEV_ITEMS_OBJECTID; - key.offset = 0; key.type = 0; + key.offset = 0; btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct extent_buffer *node = path->nodes[1]; @@ -7920,7 +7868,7 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans) { struct btrfs_device *curr, *next; - ASSERT(trans->state == TRANS_STATE_COMMIT_DOING); + ASSERT(trans->state == TRANS_STATE_COMMIT_DOING, "state=%d" , trans->state); if (list_empty(&trans->dev_update_list)) return; @@ -8289,7 +8237,7 @@ static void map_raid56_repair_block(struct btrfs_io_context *bioc, logical < stripe_start + BTRFS_STRIPE_LEN) break; } - ASSERT(i < data_stripes); + ASSERT(i < data_stripes, "i=%d data_stripes=%d", i, data_stripes); smap->dev = bioc->stripes[i].dev; smap->physical = bioc->stripes[i].physical + ((logical - bioc->full_stripe_logical) & @@ -8318,7 +8266,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, int mirror_ret = mirror_num; int ret; - ASSERT(mirror_num > 0); + ASSERT(mirror_num > 0, "mirror_num=%d", mirror_num); ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, &bioc, smap, &mirror_ret); @@ -8326,7 +8274,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, return ret; /* The map range should not cross stripe boundary. */ - ASSERT(map_length >= length); + ASSERT(map_length >= length, "map_length=%llu length=%u", map_length, length); /* Already mapped to single stripe. */ if (!bioc) @@ -8338,7 +8286,8 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, goto out; } - ASSERT(mirror_num <= bioc->num_stripes); + ASSERT(mirror_num <= bioc->num_stripes, + "mirror_num=%d num_stripes=%d", mirror_num, bioc->num_stripes); smap->dev = bioc->stripes[mirror_num - 1].dev; smap->physical = bioc->stripes[mirror_num - 1].physical; out: diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 120f65e21eeb..137cc232f58e 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -7,6 +7,7 @@ #define BTRFS_VOLUMES_H #include <linux/blk_types.h> +#include <linux/blkdev.h> #include <linux/sizes.h> #include <linux/atomic.h> #include <linux/sort.h> @@ -18,14 +19,17 @@ #include <linux/completion.h> #include <linux/rbtree.h> #include <uapi/linux/btrfs.h> +#include <uapi/linux/btrfs_tree.h> #include "messages.h" #include "rcu-string.h" +#include "extent-io-tree.h" struct block_device; struct bdev_handle; struct btrfs_fs_info; struct btrfs_block_group; struct btrfs_trans_handle; +struct btrfs_transaction; struct btrfs_zoned_device_info; #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) @@ -469,7 +473,6 @@ struct btrfs_io_stripe { struct btrfs_device *dev; /* Block mapping. */ u64 physical; - u64 length; bool rst_search_commit_root; /* For the endio handler. 
*/ struct btrfs_io_context *bioc; @@ -711,7 +714,8 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, - u64 type); + struct btrfs_space_info *space_info, + u64 type); void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, blk_mode_t flags, void *holder); @@ -782,6 +786,8 @@ struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_inf struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, u64 logical, u64 length); void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map); +struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, + int copy_num, bool drop_cache); void btrfs_release_disk_super(struct btrfs_super_block *super); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, @@ -843,6 +849,11 @@ static inline const char *btrfs_dev_name(const struct btrfs_device *device) return rcu_str_deref(device->name); } +static inline void btrfs_warn_unknown_chunk_allocation(enum btrfs_chunk_allocation_policy pol) +{ + WARN_ONCE(1, "unknown allocation policy %d, fallback to regular", pol); +} + void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void); diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 8dc4cf49f6f0..0ce10e4ec836 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -6,6 +6,8 @@ #ifndef BTRFS_XATTR_H #define BTRFS_XATTR_H +#include <linux/types.h> + struct dentry; struct inode; struct qstr; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index c9e92c6941ec..5292cd341f70 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -94,6 +94,45 @@ fail: return ERR_PTR(-ENOMEM); } +/* + * Helper for S390x with hardware zlib compression support. + * + * That hardware acceleration requires a buffer size larger than a single page + * to get ideal performance, thus we need to do the memory copy rather than + * use the page cache directly as input buffer. + */ +static int copy_data_into_buffer(struct address_space *mapping, + struct workspace *workspace, u64 filepos, + unsigned long length) +{ + u64 cur = filepos; + + /* It's only for hardware accelerated zlib code. 
*/ + ASSERT(zlib_deflate_dfltcc_enabled()); + + while (cur < filepos + length) { + struct folio *folio; + void *data_in; + unsigned int offset; + unsigned long copy_length; + int ret; + + ret = btrfs_compress_filemap_get_folio(mapping, cur, &folio); + if (ret < 0) + return ret; + + offset = offset_in_folio(folio, cur); + copy_length = min(folio_size(folio) - offset, + filepos + length - cur); + + data_in = kmap_local_folio(folio, offset); + memcpy(workspace->buf + cur - filepos, data_in, copy_length); + kunmap_local(data_in); + cur += copy_length; + } + return 0; +} + int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) @@ -105,8 +144,6 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, int nr_folios = 0; struct folio *in_folio = NULL; struct folio *out_folio = NULL; - unsigned long bytes_left; - unsigned int in_buf_folios; unsigned long len = *total_out; unsigned long nr_dest_folios = *out_folios; const unsigned long max_out = nr_dest_folios * PAGE_SIZE; @@ -150,36 +187,22 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, * the workspace buffer if required. */ if (workspace->strm.avail_in == 0) { - bytes_left = len - workspace->strm.total_in; - in_buf_folios = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE), - workspace->buf_size / PAGE_SIZE); - if (in_buf_folios > 1) { - int i; - - /* S390 hardware acceleration path, not subpage. */ - ASSERT(!btrfs_is_subpage( - inode_to_fs_info(mapping->host), - mapping)); - for (i = 0; i < in_buf_folios; i++) { - if (data_in) { - kunmap_local(data_in); - folio_put(in_folio); - data_in = NULL; - } - ret = btrfs_compress_filemap_get_folio(mapping, - start, &in_folio); - if (ret < 0) - goto out; - data_in = kmap_local_folio(in_folio, 0); - copy_page(workspace->buf + i * PAGE_SIZE, - data_in); - start += PAGE_SIZE; - } + unsigned long bytes_left = len - workspace->strm.total_in; + unsigned int copy_length = min(bytes_left, workspace->buf_size); + + /* + * This can only happen when hardware zlib compression is + * enabled. + */ + if (copy_length > PAGE_SIZE) { + ret = copy_data_into_buffer(mapping, workspace, + start, copy_length); + if (ret < 0) + goto out; + start += copy_length; workspace->strm.next_in = workspace->buf; - workspace->strm.avail_in = min(bytes_left, - in_buf_folios << PAGE_SHIFT); + workspace->strm.avail_in = copy_length; } else { - unsigned int pg_off; unsigned int cur_len; if (data_in) { @@ -191,9 +214,9 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, start, &in_folio); if (ret < 0) goto out; - pg_off = offset_in_page(start); - cur_len = btrfs_calc_input_length(orig_end, start); - data_in = kmap_local_folio(in_folio, pg_off); + cur_len = btrfs_calc_input_length(in_folio, orig_end, start); + data_in = kmap_local_folio(in_folio, + offset_in_folio(in_folio, start)); start += cur_len; workspace->strm.next_in = data_in; workspace->strm.avail_in = cur_len; @@ -463,6 +486,7 @@ out: const struct btrfs_compress_op btrfs_zlib_compress = { .workspace_manager = &wsm, + .min_level = 1, .max_level = 9, .default_level = BTRFS_ZLIB_DEFAULT_LEVEL, }; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 73e0aa9fc08a..b5b0156d5b95 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -989,7 +989,7 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) } /* All the zones are FULL. Should not reach here. 
*/ - ASSERT(0); + DEBUG_WARN("unexpected state, all zones full"); return -EIO; } @@ -1277,7 +1277,7 @@ struct zone_info { static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, struct zone_info *info, unsigned long *active, - struct btrfs_chunk_map *map) + struct btrfs_chunk_map *map, bool new) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; struct btrfs_device *device; @@ -1307,6 +1307,8 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, return 0; } + ASSERT(!new || btrfs_dev_is_empty_zone(device, info->physical)); + /* This zone will be used for allocation, so mark this zone non-empty. */ btrfs_dev_clear_zone_empty(device, info->physical); @@ -1319,6 +1321,18 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, * to determine the allocation offset within the zone. */ WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size)); + + if (new) { + sector_t capacity; + + capacity = bdev_zone_capacity(device->bdev, info->physical >> SECTOR_SHIFT); + up_read(&dev_replace->rwsem); + info->alloc_offset = 0; + info->capacity = capacity << SECTOR_SHIFT; + + return 0; + } + nofs_flag = memalloc_nofs_save(); ret = btrfs_get_dev_zone(device, info->physical, &zone); memalloc_nofs_restore(nofs_flag); @@ -1588,7 +1602,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } for (i = 0; i < map->num_stripes; i++) { - ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map); + ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map, new); if (ret) goto out; @@ -1659,7 +1673,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) * stripe. */ cache->alloc_offset = cache->zone_capacity; - ret = 0; } out: @@ -1784,12 +1797,12 @@ static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered, ordered->disk_bytenr = logical; write_lock(&em_tree->lock); - em = search_extent_mapping(em_tree, ordered->file_offset, - ordered->num_bytes); + em = btrfs_search_extent_mapping(em_tree, ordered->file_offset, + ordered->num_bytes); /* The em should be a new COW extent, thus it should not have an offset. 
*/ ASSERT(em->offset == 0); em->disk_bytenr = logical; - free_extent_map(em); + btrfs_free_extent_map(em); write_unlock(&em_tree->lock); } @@ -1799,8 +1812,8 @@ static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered, struct btrfs_ordered_extent *new; if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && - split_extent_map(ordered->inode, ordered->file_offset, - ordered->num_bytes, len, logical)) + btrfs_split_extent_map(ordered->inode, ordered->file_offset, + ordered->num_bytes, len, logical)) return false; new = btrfs_split_ordered_extent(ordered, len); @@ -2111,6 +2124,9 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) physical = map->stripes[i].physical; zinfo = device->zone_info; + if (!device->bdev) + continue; + if (zinfo->max_active_zones == 0) continue; @@ -2155,27 +2171,15 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; const u64 end = block_group->start + block_group->length; - struct radix_tree_iter iter; struct extent_buffer *eb; - void __rcu **slot; + unsigned long index, start = (block_group->start >> fs_info->sectorsize_bits); rcu_read_lock(); - radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, - block_group->start >> fs_info->sectorsize_bits) { - eb = radix_tree_deref_slot(slot); - if (!eb) - continue; - if (radix_tree_deref_retry(eb)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - + xa_for_each_start(&fs_info->buffer_tree, index, eb, start) { if (eb->start < block_group->start) continue; if (eb->start >= end) break; - - slot = radix_tree_iter_resume(slot, &iter); rcu_read_unlock(); wait_on_extent_buffer_writeback(eb); rcu_read_lock(); @@ -2272,6 +2276,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ struct btrfs_zoned_device_info *zinfo = device->zone_info; unsigned int nofs_flags; + if (!device->bdev) + continue; + if (zinfo->max_active_zones == 0) continue; @@ -2325,6 +2332,9 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) if (!btrfs_is_zoned(fs_info)) return true; + if (test_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags)) + return false; + /* Check if there is a device with active zones left */ mutex_lock(&fs_info->chunk_mutex); spin_lock(&fs_info->zone_active_bgs_lock); diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 5232b56d5892..4a796a049b5a 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -24,13 +24,14 @@ #include "super.h" #define ZSTD_BTRFS_MAX_WINDOWLOG 17 -#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) +#define ZSTD_BTRFS_MAX_INPUT (1U << ZSTD_BTRFS_MAX_WINDOWLOG) #define ZSTD_BTRFS_DEFAULT_LEVEL 3 +#define ZSTD_BTRFS_MIN_LEVEL -15 #define ZSTD_BTRFS_MAX_LEVEL 15 /* 307s to avoid pathologically clashing with transaction commit */ #define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ) -static zstd_parameters zstd_get_btrfs_parameters(unsigned int level, +static zstd_parameters zstd_get_btrfs_parameters(int level, size_t src_len) { zstd_parameters params = zstd_get_params(level, src_len); @@ -45,13 +46,14 @@ struct workspace { void *mem; size_t size; char *buf; - unsigned int level; - unsigned int req_level; + int level; + int req_level; unsigned long last_used; /* jiffies */ struct list_head list; struct list_head lru_list; zstd_in_buffer in_buf; zstd_out_buffer out_buf; + zstd_parameters params; }; /* @@ -93,8 +95,10 @@ static inline struct workspace *list_to_workspace(struct list_head *list) return container_of(list, struct workspace, 
list); } -void zstd_free_workspace(struct list_head *ws); -struct list_head *zstd_alloc_workspace(unsigned int level); +static inline int clip_level(int level) +{ + return max(0, level - 1); +} /* * Timer callback to free unused workspaces. @@ -123,7 +127,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) list_for_each_prev_safe(pos, next, &wsm.lru_list) { struct workspace *victim = container_of(pos, struct workspace, lru_list); - unsigned int level; + int level; if (time_after(victim->last_used, reclaim_threshold)) break; @@ -137,8 +141,8 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) list_del(&victim->list); zstd_free_workspace(&victim->list); - if (list_empty(&wsm.idle_ws[level - 1])) - clear_bit(level - 1, &wsm.active_map); + if (list_empty(&wsm.idle_ws[level])) + clear_bit(level, &wsm.active_map); } @@ -160,9 +164,11 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) static void zstd_calc_ws_mem_sizes(void) { size_t max_size = 0; - unsigned int level; + int level; - for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) { + for (level = ZSTD_BTRFS_MIN_LEVEL; level <= ZSTD_BTRFS_MAX_LEVEL; level++) { + if (level == 0) + continue; zstd_parameters params = zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT); size_t level_size = @@ -171,7 +177,8 @@ static void zstd_calc_ws_mem_sizes(void) zstd_dstream_workspace_bound(ZSTD_BTRFS_MAX_INPUT)); max_size = max_t(size_t, max_size, level_size); - zstd_ws_mem_sizes[level - 1] = max_size; + /* Use level 1 workspace size for all the fast mode negative levels. */ + zstd_ws_mem_sizes[clip_level(level)] = max_size; } } @@ -218,7 +225,7 @@ void zstd_cleanup_workspace_manager(void) } spin_unlock_bh(&wsm.lock); - del_timer_sync(&wsm.timer); + timer_delete_sync(&wsm.timer); } /* @@ -233,11 +240,11 @@ void zstd_cleanup_workspace_manager(void) * offer the opportunity to reclaim the workspace in favor of allocating an * appropriately sized one in the future. */ -static struct list_head *zstd_find_workspace(unsigned int level) +static struct list_head *zstd_find_workspace(int level) { struct list_head *ws; struct workspace *workspace; - int i = level - 1; + int i = clip_level(level); spin_lock_bh(&wsm.lock); for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) { @@ -247,7 +254,7 @@ static struct list_head *zstd_find_workspace(unsigned int level) list_del_init(ws); /* keep its place if it's a lower level using this */ workspace->req_level = level; - if (level == workspace->level) + if (clip_level(level) == workspace->level) list_del(&workspace->lru_list); if (list_empty(&wsm.idle_ws[i])) clear_bit(i, &wsm.active_map); @@ -270,7 +277,7 @@ static struct list_head *zstd_find_workspace(unsigned int level) * attempt to allocate a new workspace. If we fail to allocate one due to * memory pressure, go to sleep waiting for the max level workspace to free up. 
*/ -struct list_head *zstd_get_workspace(unsigned int level) +struct list_head *zstd_get_workspace(int level) { struct list_head *ws; unsigned int nofs_flag; @@ -319,7 +326,7 @@ void zstd_put_workspace(struct list_head *ws) spin_lock_bh(&wsm.lock); /* A node is only taken off the lru if we are the corresponding level */ - if (workspace->req_level == workspace->level) { + if (clip_level(workspace->req_level) == workspace->level) { /* Hide a max level workspace from reclaim */ if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) { INIT_LIST_HEAD(&workspace->lru_list); @@ -332,13 +339,13 @@ void zstd_put_workspace(struct list_head *ws) } } - set_bit(workspace->level - 1, &wsm.active_map); - list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]); + set_bit(workspace->level, &wsm.active_map); + list_add(&workspace->list, &wsm.idle_ws[workspace->level]); workspace->req_level = 0; spin_unlock_bh(&wsm.lock); - if (workspace->level == ZSTD_BTRFS_MAX_LEVEL) + if (workspace->level == clip_level(ZSTD_BTRFS_MAX_LEVEL)) cond_wake_up(&wsm.wait); } @@ -351,7 +358,7 @@ void zstd_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *zstd_alloc_workspace(unsigned int level) +struct list_head *zstd_alloc_workspace(int level) { struct workspace *workspace; @@ -359,8 +366,9 @@ struct list_head *zstd_alloc_workspace(unsigned int level) if (!workspace) return ERR_PTR(-ENOMEM); - workspace->size = zstd_ws_mem_sizes[level - 1]; - workspace->level = level; + /* Use level 1 workspace size for all the fast mode negative levels. */ + workspace->size = zstd_ws_mem_sizes[clip_level(level)]; + workspace->level = clip_level(level); workspace->req_level = level; workspace->last_used = jiffies; workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN); @@ -393,17 +401,15 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, const unsigned long nr_dest_folios = *out_folios; const u64 orig_end = start + len; unsigned long max_out = nr_dest_folios * PAGE_SIZE; - unsigned int pg_off; unsigned int cur_len; - zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level, - len); + workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len); *out_folios = 0; *total_out = 0; *total_in = 0; /* Initialize the stream */ - stream = zstd_init_cstream(&params, len, workspace->mem, + stream = zstd_init_cstream(&workspace->params, len, workspace->mem, workspace->size); if (unlikely(!stream)) { struct btrfs_inode *inode = BTRFS_I(mapping->host); @@ -420,9 +426,8 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; - pg_off = offset_in_page(start); - cur_len = btrfs_calc_input_length(orig_end, start); - workspace->in_buf.src = kmap_local_folio(in_folio, pg_off); + cur_len = btrfs_calc_input_length(in_folio, orig_end, start); + workspace->in_buf.src = kmap_local_folio(in_folio, offset_in_folio(in_folio, start)); workspace->in_buf.pos = 0; workspace->in_buf.size = cur_len; @@ -506,9 +511,9 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; - pg_off = offset_in_page(start); - cur_len = btrfs_calc_input_length(orig_end, start); - workspace->in_buf.src = kmap_local_folio(in_folio, pg_off); + cur_len = btrfs_calc_input_length(in_folio, orig_end, start); + workspace->in_buf.src = kmap_local_folio(in_folio, +
offset_in_folio(in_folio, start)); workspace->in_buf.pos = 0; workspace->in_buf.size = cur_len; } @@ -717,6 +722,7 @@ finish: const struct btrfs_compress_op btrfs_zstd_compress = { /* ZSTD uses own workspace manager */ .workspace_manager = NULL, + .min_level = ZSTD_BTRFS_MIN_LEVEL, .max_level = ZSTD_BTRFS_MAX_LEVEL, .default_level = ZSTD_BTRFS_DEFAULT_LEVEL, };
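
Many hunks in the volumes.c changes above extend bare ASSERT(cond) calls with a printf-style format string plus the values being checked (for example "mirror_num=%d", mirror_num), so a tripped assertion also reports the offending state. As a rough userspace sketch of that pattern (not the btrfs macro; MY_ASSERT and the demo values are made up for illustration, and the bare form relies on the compiler accepting an empty __VA_ARGS__, which GCC and Clang do):

#include <stdio.h>
#include <stdlib.h>

/* Assertion macro with an optional printf-style message, mirroring the
 * ASSERT(cond, "fmt", args...) style used throughout the diff. */
#define MY_ASSERT(cond, ...)						\
	do {								\
		if (!(cond)) {						\
			fprintf(stderr, "assertion failed: %s (%s:%d) ",\
				#cond, __FILE__, __LINE__);		\
			fprintf(stderr, "" __VA_ARGS__);		\
			fprintf(stderr, "\n");				\
			abort();					\
		}							\
	} while (0)

int main(void)
{
	int mirror_num = 1;

	MY_ASSERT(sizeof(long) >= sizeof(int));			/* bare form */
	MY_ASSERT(mirror_num > 0, "mirror_num=%d", mirror_num);	/* form with context values */
	return 0;
}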
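
The copy_data_into_buffer() helper added to zlib.c walks its input range one folio at a time, each step copying min(folio_size(folio) - offset_in_folio(folio, cur), remaining) bytes into the workspace buffer. A standalone userspace sketch of that loop shape, with a made-up fixed BLOCK_SIZE standing in for folio_size() and a plain memcpy() standing in for the kmap/copy/kunmap sequence:

#include <stdio.h>
#include <string.h>

#define BLOCK_SIZE 8	/* stand-in for folio_size() */

/* Copy [filepos, filepos + length) from src into dst, one block at a time,
 * never crossing a block boundary within a single memcpy(). */
static void copy_range(const char *src, size_t filepos, size_t length, char *dst)
{
	size_t cur = filepos;

	while (cur < filepos + length) {
		size_t offset = cur % BLOCK_SIZE;	/* stand-in for offset_in_folio() */
		size_t remaining = filepos + length - cur;
		size_t copy_length = BLOCK_SIZE - offset;

		if (copy_length > remaining)
			copy_length = remaining;
		memcpy(dst + (cur - filepos), src + cur, copy_length);
		cur += copy_length;
	}
}

int main(void)
{
	const char data[] = "0123456789abcdefghijklmnopqrstuv";
	char out[16] = { 0 };

	copy_range(data, 5, 11, out);	/* spans the 8-byte block boundary */
	printf("%.11s\n", out);		/* prints 56789abcdef */
	return 0;
}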
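
The zstd.c changes make compression levels signed, add ZSTD_BTRFS_MIN_LEVEL (-15) for the fast negative levels, and select the workspace bucket through clip_level(), so all negative levels share the level 1 bucket. A small standalone sketch of that mapping, assuming clip_level() behaves as the max(0, level - 1) helper in the hunk above:

#include <stdio.h>

#define ZSTD_BTRFS_MIN_LEVEL	(-15)
#define ZSTD_BTRFS_MAX_LEVEL	15

/* Same mapping as the clip_level() helper in the diff. */
static int clip_level(int level)
{
	return level < 1 ? 0 : level - 1;
}

int main(void)
{
	for (int level = ZSTD_BTRFS_MIN_LEVEL; level <= ZSTD_BTRFS_MAX_LEVEL; level++) {
		if (level == 0)		/* 0 is not a valid compression level here */
			continue;
		printf("level %3d -> workspace bucket %d\n", level, clip_level(level));
	}
	return 0;
}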