Diffstat (limited to 'fs/btrfs')
72 files changed, 3806 insertions, 3338 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 73a2dfb854c5..c352f3ae0385 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -52,10 +52,10 @@ config BTRFS_FS_RUN_SANITY_TESTS bool "Btrfs will run sanity tests upon loading" depends on BTRFS_FS help - This will run some basic sanity tests on the free space cache - code to make sure it is acting as it should. These are mostly - regression tests and are only really interesting to btrfs - developers. + This will run sanity tests for core functionality like free space, + extent maps, extent io, extent buffers, inodes, qgroups and others, + at module load time. These are mostly regression tests and are only + interesting to developers. If unsure, say N. @@ -63,9 +63,12 @@ config BTRFS_DEBUG bool "Btrfs debugging support" depends on BTRFS_FS help - Enable run-time debugging support for the btrfs filesystem. This may - enable additional and expensive checks with negative impact on - performance, or export extra information via sysfs. + Enable run-time debugging support for the btrfs filesystem. + + Additional potentially expensive checks, debugging functionality or + sysfs exported information is enabled, like leak checks of internal + objects, optional forced space fragmentation and /sys/fs/btrfs/debug . + This has negative impact on performance. If unsure, say N. @@ -73,8 +76,10 @@ config BTRFS_ASSERT bool "Btrfs assert support" depends on BTRFS_FS help - Enable run-time assertion checking. This will result in panics if - any of the assertions trip. This is meant for btrfs developers only. + Enable run-time assertion checking. Additional safety checks are + done, simple enough not to affect performance but verify invariants + and assumptions of code to run properly. This may result in panics, + and is meant for developers but can be enabled in general. If unsure, say N. 
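The BTRFS_DEBUG option above is what gates the DEBUG_WARN() calls that several hunks below convert to (replacing open-coded ASSERT(0) and WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG))). As a rough sketch, hedged because the exact definition lives in fs/btrfs/messages.h, the macro plausibly reduces to:

    #ifdef CONFIG_BTRFS_DEBUG
    /* Warn with an optional message, but only on debug builds. */
    #define DEBUG_WARN(args...)	WARN(1, ## args)
    #else
    #define DEBUG_WARN(args...)	do {} while (0)
    #endif

On non-debug builds the warning compiles away entirely while the surrounding error handling (the -EUCLEAN returns and similar) stays in place.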
@@ -89,7 +94,14 @@ config BTRFS_EXPERIMENTAL Current list: - - extent map shrinker - performance problems with too frequent shrinks + - COW fixup worker warning - last warning before removing the + functionality catching out-of-band page + dirtying, not necessary since 5.8 + + - RAID mirror read policy - additional read policies for balancing + reading from redundant block group + profiles (currently: pid, round-robin, + fixed devid) - send stream protocol v3 - fs-verity support diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index f3bffe08b290..6c6f3bb58f4e 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -219,8 +219,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq, spin_lock_irqsave(lock, flags); if (list_empty(list)) break; - work = list_entry(list->next, struct btrfs_work, - ordered_list); + work = list_first_entry(list, struct btrfs_work, ordered_list); if (!test_bit(WORK_DONE_BIT, &work->flags)) break; /* diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 5936cff80ff3..ed497f5f8d1b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2877,7 +2877,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) goto release; } if (path->slots[0] == 0) { - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); ret = -EUCLEAN; goto release; } @@ -3134,8 +3134,8 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, return; while (!list_empty(&node->upper)) { - edge = list_entry(node->upper.next, struct btrfs_backref_edge, - list[LOWER]); + edge = list_first_entry(&node->upper, struct btrfs_backref_edge, + list[LOWER]); list_del(&edge->list[LOWER]); list_del(&edge->list[UPPER]); btrfs_backref_free_edge(cache, edge); @@ -3473,8 +3473,8 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, * type BTRFS_TREE_BLOCK_REF_KEY */ ASSERT(list_is_singular(&cur->upper)); - edge = list_entry(cur->upper.next, struct btrfs_backref_edge, - list[LOWER]); + edge = list_first_entry(&cur->upper, struct btrfs_backref_edge, + list[LOWER]); ASSERT(list_empty(&edge->list[UPPER])); exist = edge->node[UPPER]; /* @@ -3617,7 +3617,7 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, /* Sanity check, we shouldn't have any unchecked nodes */ if (!upper->checked) { - ASSERT(0); + DEBUG_WARN("we should not have any unchecked nodes"); return -EUCLEAN; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 74e614031274..953637115956 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -423,8 +423,8 @@ struct btrfs_backref_node *btrfs_backref_alloc_node( struct btrfs_backref_edge *btrfs_backref_alloc_edge( struct btrfs_backref_cache *cache); -#define LINK_LOWER (1 << 0) -#define LINK_UPPER (1 << 1) +#define LINK_LOWER (1U << 0) +#define LINK_UPPER (1U << 1) void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, struct btrfs_backref_node *lower, diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 8c2eee1f1878..f7d8958b7327 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -192,7 +192,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, btrfs_repair_io_failure(fs_info, btrfs_ino(inode), repair_bbio->file_offset, fs_info->sectorsize, repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, - page_folio(bv->bv_page), bv->bv_offset, mirror); + bvec_phys(bv), mirror); } while (mirror != fbio->bbio->mirror_num); done: @@ -512,7 +512,7 @@ static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, } } -static blk_status_t btrfs_bio_csum(struct 
btrfs_bio *bbio) +static int btrfs_bio_csum(struct btrfs_bio *bbio) { if (bbio->bio.bi_opf & REQ_META) return btree_csum_one_bio(bbio); @@ -543,11 +543,11 @@ static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async = container_of(work, struct async_submit_bio, work); - blk_status_t ret; + int ret; ret = btrfs_bio_csum(async->bbio); if (ret) - async->bbio->bio.bi_status = ret; + async->bbio->bio.bi_status = errno_to_blk_status(ret); } /* @@ -674,8 +674,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) bool use_append = btrfs_use_zone_append(bbio); struct btrfs_io_context *bioc = NULL; struct btrfs_io_stripe smap; - blk_status_t ret; - int error; + blk_status_t status; + int ret; if (!bbio->inode || btrfs_is_data_reloc_root(inode->root)) smap.rst_search_commit_root = true; @@ -683,10 +683,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) smap.rst_search_commit_root = false; btrfs_bio_counter_inc_blocked(fs_info); - error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - &bioc, &smap, &mirror_num); - if (error) { - ret = errno_to_blk_status(error); + ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, + &bioc, &smap, &mirror_num); + if (ret) { + status = errno_to_blk_status(ret); btrfs_bio_counter_dec(fs_info); goto end_bbio; } @@ -700,7 +700,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) split = btrfs_split_bio(fs_info, bbio, map_length); if (IS_ERR(split)) { - ret = errno_to_blk_status(PTR_ERR(split)); + status = errno_to_blk_status(PTR_ERR(split)); btrfs_bio_counter_dec(fs_info); goto end_bbio; } @@ -715,7 +715,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) { bbio->saved_iter = bio->bi_iter; ret = btrfs_lookup_bio_sums(bbio); - if (ret) + status = errno_to_blk_status(ret); + if (status) goto fail; } @@ -748,13 +749,15 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) goto done; ret = btrfs_bio_csum(bbio); - if (ret) + status = errno_to_blk_status(ret); + if (status) goto fail; } else if (use_append || (btrfs_is_zoned(fs_info) && inode && inode->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_alloc_dummy_sum(bbio); - if (ret) + status = errno_to_blk_status(ret); + if (status) goto fail; } } @@ -775,10 +778,10 @@ fail: ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset); ASSERT(remaining); - btrfs_bio_end_io(remaining, ret); + btrfs_bio_end_io(remaining, status); } end_bbio: - btrfs_bio_end_io(bbio, ret); + btrfs_bio_end_io(bbio, status); /* Do not submit another chunk */ return true; } @@ -803,8 +806,7 @@ void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num) * freeing the bio. */ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct folio *folio, - unsigned int folio_offset, int mirror_num) + u64 length, u64 logical, phys_addr_t paddr, int mirror_num) { struct btrfs_io_stripe smap = { 0 }; struct bio_vec bvec; @@ -835,8 +837,7 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; - ret = bio_add_folio(&bio, folio, length, folio_offset); - ASSERT(ret); + __bio_add_page(&bio, phys_to_page(paddr), length, offset_in_page(paddr)); ret = submit_bio_wait(&bio); if (ret) { /* try to remap that extent elsewhere? 
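The bio.c hunks above settle one convention for error types: internal helpers such as btrfs_bio_csum() and btrfs_lookup_bio_sums() return a plain int errno in ret, and the conversion to the block layer's blk_status_t happens exactly once, into status, at the submission boundary. A minimal sketch of the resulting pattern, using only names from the hunks above:

    int ret;		/* negative errno from btrfs helpers */
    blk_status_t status;	/* what bio->bi_status carries */

    ret = btrfs_lookup_bio_sums(bbio);
    status = errno_to_blk_status(ret);
    if (status)
    	goto fail;	/* the fail path ends the bio with @status */

This keeps errno_to_blk_status() out of the leaf helpers and lets the fail/end_bbio paths consume a single consistent type.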
*/ @@ -900,22 +901,18 @@ int __init btrfs_bioset_init(void) return -ENOMEM; if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE, offsetof(struct btrfs_bio, bio), 0)) - goto out_free_bioset; + goto out; if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE, offsetof(struct btrfs_bio, bio), BIOSET_NEED_BVECS)) - goto out_free_clone_bioset; + goto out; if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE, sizeof(struct btrfs_failed_bio))) - goto out_free_repair_bioset; + goto out; return 0; -out_free_repair_bioset: - bioset_exit(&btrfs_repair_bioset); -out_free_clone_bioset: - bioset_exit(&btrfs_clone_bioset); -out_free_bioset: - bioset_exit(&btrfs_bioset); +out: + btrfs_bioset_exit(); return -ENOMEM; } diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index e2fe16074ad6..dc2eb43b7097 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -110,7 +110,6 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status); void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num); void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace); int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct folio *folio, - unsigned int folio_offset, int mirror_num); + u64 length, u64 logical, phys_addr_t paddr, int mirror_num); #endif diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index a8129f1ce78c..5b0cb04b2b93 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -525,10 +525,9 @@ int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start, *total_added_ret = 0; while (start < end) { - if (!find_first_extent_bit(&info->excluded_extents, start, - &extent_start, &extent_end, - EXTENT_DIRTY | EXTENT_UPTODATE, - NULL)) + if (!btrfs_find_first_extent_bit(&info->excluded_extents, start, + &extent_start, &extent_end, + EXTENT_DIRTY, NULL)) break; if (extent_start <= start) { @@ -701,7 +700,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) struct btrfs_block_group *block_group = caching_ctl->block_group; struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *extent_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key key; u64 total_found = 0; @@ -828,14 +827,13 @@ next: block_group->start + block_group->length, NULL); out: - btrfs_free_path(path); return ret; } static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg) { - clear_extent_bits(&bg->fs_info->excluded_extents, bg->start, - bg->start + bg->length - 1, EXTENT_UPTODATE); + btrfs_clear_extent_bits(&bg->fs_info->excluded_extents, bg->start, + bg->start + bg->length - 1, EXTENT_DIRTY); } static noinline void caching_thread(struct btrfs_work *work) @@ -1420,9 +1418,8 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans, int ret; spin_lock(&fs_info->trans_lock); - if (trans->transaction->list.prev != &fs_info->trans_list) { - prev_trans = list_last_entry(&trans->transaction->list, - struct btrfs_transaction, list); + if (!list_is_first(&trans->transaction->list, &fs_info->trans_list)) { + prev_trans = list_prev_entry(trans->transaction, list); refcount_inc(&prev_trans->use_count); } spin_unlock(&fs_info->trans_lock); @@ -1439,14 +1436,14 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans, */ mutex_lock(&fs_info->unused_bg_unpin_mutex); if (prev_trans) { - ret = clear_extent_bits(&prev_trans->pinned_extents, start, end, - EXTENT_DIRTY); + ret = 
btrfs_clear_extent_bits(&prev_trans->pinned_extents, start, end, + EXTENT_DIRTY); if (ret) goto out; } - ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end, - EXTENT_DIRTY); + ret = btrfs_clear_extent_bits(&trans->transaction->pinned_extents, start, end, + EXTENT_DIRTY); out: mutex_unlock(&fs_info->unused_bg_unpin_mutex); if (prev_trans) @@ -2218,9 +2215,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) if (cache->start < BTRFS_SUPER_INFO_OFFSET) { stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start; cache->bytes_super += stripe_len; - ret = set_extent_bit(&fs_info->excluded_extents, cache->start, - cache->start + stripe_len - 1, - EXTENT_UPTODATE, NULL); + ret = btrfs_set_extent_bit(&fs_info->excluded_extents, cache->start, + cache->start + stripe_len - 1, + EXTENT_DIRTY, NULL); if (ret) return ret; } @@ -2246,9 +2243,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) cache->start + cache->length - logical[nr]); cache->bytes_super += len; - ret = set_extent_bit(&fs_info->excluded_extents, logical[nr], - logical[nr] + len - 1, - EXTENT_UPTODATE, NULL); + ret = btrfs_set_extent_bit(&fs_info->excluded_extents, + logical[nr], logical[nr] + len - 1, + EXTENT_DIRTY, NULL); if (ret) { kfree(logical); return ret; @@ -2373,6 +2370,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, cache->commit_used = cache->used; cache->flags = btrfs_stack_block_group_flags(bgi); cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); + cache->space_info = btrfs_find_space_info(info, cache->flags); set_free_space_tree_thresholds(cache); @@ -2451,6 +2449,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, btrfs_remove_free_space_cache(cache); goto error; } + trace_btrfs_add_block_group(info, cache, 0); btrfs_add_bg_to_space_info(info, cache); @@ -2495,6 +2494,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) bg->cached = BTRFS_CACHE_FINISHED; bg->used = map->chunk_len; bg->flags = map->type; + bg->space_info = btrfs_find_space_info(fs_info, bg->flags); ret = btrfs_add_block_group_cache(bg); /* * We may have some valid block group cache added already, in @@ -2868,8 +2868,8 @@ static u64 calculate_global_root_id(const struct btrfs_fs_info *fs_info, u64 off } struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, - u64 type, - u64 chunk_offset, u64 size) + struct btrfs_space_info *space_info, + u64 type, u64 chunk_offset, u64 size) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *cache; @@ -2923,7 +2923,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran * assigned to our block group. We want our bg to be added to the rbtree * with its ->space_info set. 
*/ - cache->space_info = btrfs_find_space_info(fs_info, cache->flags); + cache->space_info = space_info; ASSERT(cache->space_info); ret = btrfs_add_block_group_cache(cache); @@ -2968,6 +2968,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, bool do_chunk_alloc) { struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_space_info *space_info = cache->space_info; struct btrfs_trans_handle *trans; struct btrfs_root *root = btrfs_block_group_root(fs_info); u64 alloc_flags; @@ -3020,7 +3021,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, */ alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); if (alloc_flags != cache->flags) { - ret = btrfs_chunk_alloc(trans, alloc_flags, + ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE); /* * ENOSPC is allowed here, we may have enough space @@ -3048,15 +3049,15 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM)) goto unlock_out; - alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags); - ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags); + ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; /* * We have allocated a new chunk. We also need to activate that chunk to * grant metadata tickets for zoned filesystem. */ - ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true); + ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true); if (ret < 0) goto out; @@ -3738,8 +3739,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&space_info->lock); - set_extent_bit(&trans->transaction->pinned_extents, bytenr, - bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); + btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr, + bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); } spin_lock(&trans->transaction->dirty_bgs_lock); @@ -3828,17 +3829,17 @@ out: /* * Update the block_group and space info counters. * - * @cache: The cache we are manipulating - * @num_bytes: The number of bytes in question - * @delalloc: The blocks are allocated for the delalloc write + * @cache: The cache we are manipulating. + * @num_bytes: The number of bytes in question. + * @is_delalloc: Whether the blocks are allocated for a delalloc write. * * This is called by somebody who is freeing space that was never actually used * on disk. For example if you reserve some space for a new leaf in transaction * A and before transaction A commits you free that leaf, you call this with * reserve set to 0 in order to clear the reservation. 
*/ -void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, - u64 num_bytes, int delalloc) +void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, + bool is_delalloc) { struct btrfs_space_info *space_info = cache->space_info; @@ -3852,7 +3853,7 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, space_info->bytes_reserved -= num_bytes; space_info->max_extent_size = 0; - if (delalloc) + if (is_delalloc) cache->delalloc_bytes -= num_bytes; spin_unlock(&cache->lock); @@ -3871,14 +3872,14 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) } } -static int should_alloc_chunk(const struct btrfs_fs_info *fs_info, - const struct btrfs_space_info *sinfo, int force) +static bool should_alloc_chunk(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *sinfo, int force) { u64 bytes_used = btrfs_space_info_used(sinfo, false); u64 thresh; if (force == CHUNK_ALLOC_FORCE) - return 1; + return true; /* * in limited mode, we want to have some free space up to @@ -3889,22 +3890,31 @@ static int should_alloc_chunk(const struct btrfs_fs_info *fs_info, thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1)); if (sinfo->total_bytes - bytes_used < thresh) - return 1; + return true; } if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80)) - return 0; - return 1; + return false; + return true; } int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) { u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type); + struct btrfs_space_info *space_info; - return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + space_info = btrfs_find_space_info(trans->fs_info, type); + if (!space_info) { + DEBUG_WARN(); + return -EINVAL; + } + + return btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE); } -static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) +static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_space_info *space_info, + u64 flags) { struct btrfs_block_group *bg; int ret; @@ -3917,7 +3927,7 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans */ check_system_chunk(trans, flags); - bg = btrfs_create_chunk(trans, flags); + bg = btrfs_create_chunk(trans, space_info, flags); if (IS_ERR(bg)) { ret = PTR_ERR(bg); goto out; @@ -3965,8 +3975,16 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans if (ret == -ENOSPC) { const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info); struct btrfs_block_group *sys_bg; + struct btrfs_space_info *sys_space_info; + + sys_space_info = btrfs_find_space_info(trans->fs_info, sys_flags); + if (!sys_space_info) { + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } - sys_bg = btrfs_create_chunk(trans, sys_flags); + sys_bg = btrfs_create_chunk(trans, sys_space_info, sys_flags); if (IS_ERR(sys_bg)) { ret = PTR_ERR(sys_bg); btrfs_abort_transaction(trans, ret); @@ -4097,6 +4115,8 @@ out: * * This function, btrfs_chunk_alloc(), belongs to phase 1. * + * @space_info: specify which space_info the new chunk should belong to. + * * If @force is CHUNK_ALLOC_FORCE: * - return 1 if it successfully allocates a chunk, * - return errors including -ENOSPC otherwise. @@ -4105,11 +4125,11 @@ out: * - return 1 if it successfully allocates a chunk, * - return errors including -ENOSPC otherwise. 
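After this series btrfs_chunk_alloc() no longer resolves the space_info itself; the caller looks it up (or already holds it) and the pointer is threaded down through do_chunk_alloc() to btrfs_create_chunk(). The minimal call shape, mirroring btrfs_force_chunk_alloc() above:

    struct btrfs_space_info *space_info;

    space_info = btrfs_find_space_info(fs_info, flags);
    ASSERT(space_info);

    ret = btrfs_chunk_alloc(trans, space_info, flags, CHUNK_ALLOC_FORCE);

Passing the pointer explicitly is presumably what allows zoned mode to target a sub-group (see the BTRFS_SUB_GROUP_TREELOG and BTRFS_SUB_GROUP_DATA_RELOC hunks elsewhere in this diff) instead of always the top-level space_info.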
*/ -int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, +int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_space_info *space_info, u64 flags, enum btrfs_chunk_alloc_enum force) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_space_info *space_info; struct btrfs_block_group *ret_bg; bool wait_for_alloc = false; bool should_alloc = false; @@ -4148,9 +4168,6 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, if (flags & BTRFS_BLOCK_GROUP_SYSTEM) return -ENOSPC; - space_info = btrfs_find_space_info(fs_info, flags); - ASSERT(space_info); - do { spin_lock(&space_info->lock); if (force < space_info->force_alloc) @@ -4211,7 +4228,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, force_metadata_allocation(fs_info); } - ret_bg = do_chunk_alloc(trans, flags); + ret_bg = do_chunk_alloc(trans, space_info, flags); trans->allocating_chunk = false; if (IS_ERR(ret_bg)) { @@ -4287,6 +4304,10 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans, if (left < bytes) { u64 flags = btrfs_system_alloc_profile(fs_info); struct btrfs_block_group *bg; + struct btrfs_space_info *space_info; + + space_info = btrfs_find_space_info(fs_info, flags); + ASSERT(space_info); /* * Ignore failure to create system chunk. We might end up not @@ -4294,7 +4315,7 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans, * the paths we visit in the chunk tree (they were already COWed * or created in the current transaction for example). */ - bg = btrfs_create_chunk(trans, flags); + bg = btrfs_create_chunk(trans, space_info, flags); if (IS_ERR(bg)) { ret = PTR_ERR(bg); } else { @@ -4402,6 +4423,43 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info) } } +static void check_removing_space_info(struct btrfs_space_info *space_info) +{ + struct btrfs_fs_info *info = space_info->fs_info; + + if (space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY) { + /* This is a top space_info, proceed with its children first. */ + for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) { + if (space_info->sub_group[i]) { + check_removing_space_info(space_info->sub_group[i]); + kfree(space_info->sub_group[i]); + space_info->sub_group[i] = NULL; + } + } + } + + /* + * Do not hide this behind enospc_debug, this is actually important and + * indicates a real bug if this happens. + */ + if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_may_use > 0)) + btrfs_dump_space_info(info, space_info, 0, 0); + + /* + * If there was a failure to cleanup a log tree, very likely due to an + * IO failure on a writeback attempt of one or more of its extent + * buffers, we could not do proper (and cheap) unaccounting of their + * reserved space, so don't warn on bytes_reserved > 0 in that case. 
+ */ + if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || + !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { + if (WARN_ON(space_info->bytes_reserved > 0)) + btrfs_dump_space_info(info, space_info, 0, 0); + } + + WARN_ON(space_info->reclaim_size > 0); +} + /* * Must be called only after stopping all workers, since we could have block * group caching kthreads running, and therefore they could race with us if we @@ -4427,8 +4485,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) write_lock(&info->block_group_cache_lock); while (!list_empty(&info->caching_block_groups)) { - caching_ctl = list_entry(info->caching_block_groups.next, - struct btrfs_caching_control, list); + caching_ctl = list_first_entry(&info->caching_block_groups, + struct btrfs_caching_control, list); list_del(&caching_ctl->list); btrfs_put_caching_control(caching_ctl); } @@ -4499,32 +4557,10 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) btrfs_release_global_block_rsv(info); while (!list_empty(&info->space_info)) { - space_info = list_entry(info->space_info.next, - struct btrfs_space_info, - list); - - /* - * Do not hide this behind enospc_debug, this is actually - * important and indicates a real bug if this happens. - */ - if (WARN_ON(space_info->bytes_pinned > 0 || - space_info->bytes_may_use > 0)) - btrfs_dump_space_info(info, space_info, 0, 0); - - /* - * If there was a failure to cleanup a log tree, very likely due - * to an IO failure on a writeback attempt of one or more of its - * extent buffers, we could not do proper (and cheap) unaccounting - * of their reserved space, so don't warn on bytes_reserved > 0 in - * that case. - */ - if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || - !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { - if (WARN_ON(space_info->bytes_reserved > 0)) - btrfs_dump_space_info(info, space_info, 0, 0); - } + space_info = list_first_entry(&info->space_info, + struct btrfs_space_info, list); - WARN_ON(space_info->reclaim_size > 0); + check_removing_space_info(space_info); list_del(&space_info->list); btrfs_sysfs_remove_space_info(space_info); } diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 36937eeab9b8..9de356bcb411 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -326,8 +326,8 @@ void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info); void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg); int btrfs_read_block_groups(struct btrfs_fs_info *info); struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, - u64 type, - u64 chunk_offset, u64 size); + struct btrfs_space_info *space_info, + u64 type, u64 chunk_offset, u64 size); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans); int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, bool do_chunk_alloc); @@ -340,9 +340,10 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, u64 ram_bytes, u64 num_bytes, int delalloc, bool force_wrong_size_class); -void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, - u64 num_bytes, int delalloc); -int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, +void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, + bool is_delalloc); +int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_space_info *space_info, u64 flags, enum btrfs_chunk_alloc_enum force); int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type); void check_system_chunk(struct btrfs_trans_handle 
*trans, const u64 type); diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 3f3608299c0b..5ad6de738aee 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -418,6 +418,9 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root) case BTRFS_CHUNK_TREE_OBJECTID: root->block_rsv = &fs_info->chunk_block_rsv; break; + case BTRFS_TREE_LOG_OBJECTID: + root->block_rsv = &fs_info->treelog_rsv; + break; default: root->block_rsv = NULL; break; @@ -438,6 +441,14 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->delayed_block_rsv.space_info = space_info; fs_info->delayed_refs_rsv.space_info = space_info; + /* The treelog_rsv uses a dedicated space_info on the zoned mode. */ + if (!btrfs_is_zoned(fs_info)) { + fs_info->treelog_rsv.space_info = space_info; + } else { + ASSERT(space_info->sub_group[0]->subgroup_id == BTRFS_SUB_GROUP_TREELOG); + fs_info->treelog_rsv.space_info = space_info->sub_group[0]; + } + btrfs_update_global_block_rsv(fs_info); } diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index d12b1fac5c74..79ae9d05cd91 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -24,6 +24,7 @@ enum btrfs_rsv_type { BTRFS_BLOCK_RSV_CHUNK, BTRFS_BLOCK_RSV_DELOPS, BTRFS_BLOCK_RSV_DELREFS, + BTRFS_BLOCK_RSV_TREELOG, BTRFS_BLOCK_RSV_EMPTY, BTRFS_BLOCK_RSV_TEMP, }; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 4e2952cf5766..a79fa0726f1d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -529,8 +529,8 @@ static inline void btrfs_update_inode_mapping_flags(struct btrfs_inode *inode) #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes -int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, - u32 pgoff, u8 *csum, const u8 * const csum_expected); +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum, + const u8 * const csum_expected); bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u32 bio_offset, struct bio_vec *bv); noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, @@ -547,8 +547,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, const struct fscrypt_str *name, int add_backref, u64 index); int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry); -int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, - int front); +int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end); int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index e7f8ee5d48a4..48d07939fee4 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -285,12 +285,12 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb) unsigned long index = cb->start >> PAGE_SHIFT; unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct folio_batch fbatch; - const int error = blk_status_to_errno(cb->bbio.bio.bi_status); int i; int ret; - if (error) - mapping_set_error(inode->i_mapping, error); + ret = blk_status_to_errno(cb->bbio.bio.bi_status); + if (ret) + mapping_set_error(inode->i_mapping, ret); folio_batch_init(&fbatch); while (index <= end_index) { @@ -499,9 +499,9 @@ static noinline int add_ra_bio_pages(struct inode *inode, } page_end = (pg_index << PAGE_SHIFT) + 
folio_size(folio) - 1; - lock_extent(tree, cur, page_end, NULL); + btrfs_lock_extent(tree, cur, page_end, NULL); read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); + em = btrfs_lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); read_unlock(&em_tree->lock); /* @@ -510,20 +510,20 @@ static noinline int add_ra_bio_pages(struct inode *inode, * to this compressed extent on disk. */ if (!em || cur < em->start || - (cur + fs_info->sectorsize > extent_map_end(em)) || - (extent_map_block_start(em) >> SECTOR_SHIFT) != + (cur + fs_info->sectorsize > btrfs_extent_map_end(em)) || + (btrfs_extent_map_block_start(em) >> SECTOR_SHIFT) != orig_bio->bi_iter.bi_sector) { - free_extent_map(em); - unlock_extent(tree, cur, page_end, NULL); + btrfs_free_extent_map(em); + btrfs_unlock_extent(tree, cur, page_end, NULL); folio_unlock(folio); folio_put(folio); break; } add_size = min(em->start + em->len, page_end + 1) - cur; - free_extent_map(em); - unlock_extent(tree, cur, page_end, NULL); + btrfs_free_extent_map(em); + btrfs_unlock_extent(tree, cur, page_end, NULL); - if (folio->index == end_index) { + if (folio_contains(folio, end_index)) { size_t zero_offset = offset_in_folio(folio, isize); if (zero_offset) { @@ -576,19 +576,19 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) struct extent_map *em; unsigned long pflags; int memstall = 0; - blk_status_t ret; - int ret2; + blk_status_t status; + int ret; /* we need the actual starting offset of this extent in the file */ read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize); + em = btrfs_lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize); read_unlock(&em_tree->lock); if (!em) { - ret = BLK_STS_IOERR; + status = BLK_STS_IOERR; goto out; } - ASSERT(extent_map_is_compressed(em)); + ASSERT(btrfs_extent_map_is_compressed(em)); compressed_len = em->disk_num_bytes; cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ, @@ -600,21 +600,21 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) cb->len = bbio->bio.bi_iter.bi_size; cb->compressed_len = compressed_len; - cb->compress_type = extent_map_compression(em); + cb->compress_type = btrfs_extent_map_compression(em); cb->orig_bbio = bbio; - free_extent_map(em); + btrfs_free_extent_map(em); cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE); - cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct page *), GFP_NOFS); + cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS); if (!cb->compressed_folios) { - ret = BLK_STS_RESOURCE; + status = BLK_STS_RESOURCE; goto out_free_bio; } - ret2 = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios); - if (ret2) { - ret = BLK_STS_RESOURCE; + ret = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios); + if (ret) { + status = BLK_STS_RESOURCE; goto out_free_compressed_pages; } @@ -637,7 +637,7 @@ out_free_compressed_pages: out_free_bio: bio_put(&cb->bbio.bio); out: - btrfs_bio_end_io(bbio, ret); + btrfs_bio_end_io(bbio, status); } /* @@ -1138,6 +1138,22 @@ void __cold btrfs_exit_compress(void) } /* + * The bvec is a single page bvec from a bio that contains folios from a filemap. + * + * Since the folio may be a large one, and if the bv_page is not a head page of + * a large folio, then page->index is unreliable. + * + * Thus we need this helper to grab the proper file offset. 
+ */ +static u64 file_offset_from_bvec(const struct bio_vec *bvec) +{ + const struct page *page = bvec->bv_page; + const struct folio *folio = page_folio(page); + + return (page_pgoff(folio, page) << PAGE_SHIFT) + bvec->bv_offset; +} + +/* * Copy decompressed data from working buffer to pages. * * @buf: The decompressed data buffer @@ -1182,13 +1198,14 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len, u32 copy_start; /* Offset inside the full decompressed extent */ u32 bvec_offset; + void *kaddr; bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter); /* * cb->start may underflow, but subtracting that value can still * give us correct offset inside the full decompressed extent. */ - bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start; + bvec_offset = file_offset_from_bvec(&bvec) - cb->start; /* Haven't reached the bvec range, exit */ if (decompressed + buf_len <= bvec_offset) @@ -1204,10 +1221,12 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len, * @buf + @buf_len. */ ASSERT(copy_start - decompressed < buf_len); - memcpy_to_page(bvec.bv_page, bvec.bv_offset, - buf + copy_start - decompressed, copy_len); - cur_offset += copy_len; + kaddr = bvec_kmap_local(&bvec); + memcpy(kaddr, buf + copy_start - decompressed, copy_len); + kunmap_local(kaddr); + + cur_offset += copy_len; bio_advance(orig_bio, copy_len); /* Finished the bio */ if (!orig_bio->bi_iter.bi_size) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index df198623cc08..d34c4341eaf4 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -11,7 +11,9 @@ #include <linux/list.h> #include <linux/workqueue.h> #include <linux/wait.h> +#include <linux/pagemap.h> #include "bio.h" +#include "messages.h" struct address_space; struct page; @@ -73,11 +75,14 @@ struct compressed_bio { }; /* @range_end must be exclusive. */ -static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur) +static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u64 cur) { - u64 page_end = round_down(cur, PAGE_SIZE) + PAGE_SIZE; + const u64 folio_end = folio_pos(folio) + folio_size(folio); - return min(range_end, page_end) - cur; + /* @cur must be inside the folio. */ + ASSERT(folio_pos(folio) <= cur); + ASSERT(cur < folio_end); + return min(range_end, folio_end) - cur; } int __init btrfs_init_compress(void); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 075a06db43a1..71fa42ca04fe 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -61,7 +61,6 @@ struct btrfs_path { /* if there is real range locking, this locks field will change */ u8 locks[BTRFS_MAX_LEVEL]; u8 reada; - /* keep some upper locks as we walk down */ u8 lowest_level; /* @@ -69,6 +68,7 @@ struct btrfs_path { * and to force calls to keep space in the nodes */ unsigned int search_for_split:1; + /* Keep some upper locks as we walk down. 
*/ unsigned int keep_locks:1; unsigned int skip_locking:1; unsigned int search_commit_root:1; diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index d4310d93f532..1831618579cb 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -105,15 +105,15 @@ static int btrfs_insert_inode_defrag(struct btrfs_inode *inode, return 0; } -static inline int need_auto_defrag(struct btrfs_fs_info *fs_info) +static inline bool need_auto_defrag(struct btrfs_fs_info *fs_info) { if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) - return 0; + return false; if (btrfs_fs_closing(fs_info)) - return 0; + return false; - return 1; + return true; } /* @@ -191,10 +191,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode( if (parent && compare_inode_defrag(&tmp, entry) > 0) { parent = rb_next(parent); - if (parent) - entry = rb_entry(parent, struct inode_defrag, rb_node); - else - entry = NULL; + entry = rb_entry_safe(parent, struct inode_defrag, rb_node); } out: if (entry) @@ -624,7 +621,7 @@ static struct extent_map *defrag_get_extent(struct btrfs_inode *inode, u64 ino = btrfs_ino(inode); int ret; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { ret = -ENOMEM; goto err; } @@ -734,12 +731,12 @@ next: not_found: btrfs_release_path(&path); - free_extent_map(em); + btrfs_free_extent_map(em); return NULL; err: btrfs_release_path(&path); - free_extent_map(em); + btrfs_free_extent_map(em); return ERR_PTR(ret); } @@ -756,7 +753,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, * full extent lock. */ read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, sectorsize); + em = btrfs_lookup_extent_mapping(em_tree, start, sectorsize); read_unlock(&em_tree->lock); /* @@ -769,7 +766,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, * file extent items in the inode's subvolume tree). */ if (em && (em->flags & EXTENT_FLAG_MERGED)) { - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; } @@ -779,10 +776,10 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, /* Get the big lock and read metadata off disk. */ if (!locked) - lock_extent(io_tree, start, end, &cached); + btrfs_lock_extent(io_tree, start, end, &cached); em = defrag_get_extent(BTRFS_I(inode), start, newer_than); if (!locked) - unlock_extent(io_tree, start, end, &cached); + btrfs_unlock_extent(io_tree, start, end, &cached); if (IS_ERR(em)) return NULL; @@ -794,7 +791,7 @@ static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, const struct extent_map *em) { - if (extent_map_is_compressed(em)) + if (btrfs_extent_map_is_compressed(em)) return BTRFS_MAX_COMPRESSED; return fs_info->max_extent_size; } @@ -837,7 +834,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, ret = true; out: - free_extent_map(next); + btrfs_free_extent_map(next); return ret; } @@ -857,13 +854,14 @@ static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t { struct address_space *mapping = inode->vfs_inode.i_mapping; gfp_t mask = btrfs_alloc_write_mask(mapping); - u64 page_start = (u64)index << PAGE_SHIFT; - u64 page_end = page_start + PAGE_SIZE - 1; + u64 folio_start; + u64 folio_end; struct extent_state *cached_state = NULL; struct folio *folio; int ret; again: + /* TODO: Add fgp order flags when large folios are fully enabled.
*/ folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); if (IS_ERR(folio)) @@ -871,13 +869,16 @@ again: /* * Since we can defragment files opened read-only, we can encounter - * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We - * can't do I/O using huge pages yet, so return an error for now. + * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). + * + * The IO for such large folios is not fully tested, thus return + * an error to reject such folios unless it's an experimental build. + * * Filesystem transparent huge pages are typically only used for * executables that explicitly enable them, so this isn't very * restrictive. */ - if (folio_test_large(folio)) { + if (!IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && folio_test_large(folio)) { folio_unlock(folio); folio_put(folio); return ERR_PTR(-ETXTBSY); @@ -890,14 +891,15 @@ again: return ERR_PTR(ret); } + folio_start = folio_pos(folio); + folio_end = folio_pos(folio) + folio_size(folio) - 1; /* Wait for any existing ordered extent in the range */ while (1) { struct btrfs_ordered_extent *ordered; - lock_extent(&inode->io_tree, page_start, page_end, &cached_state); - ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); - unlock_extent(&inode->io_tree, page_start, page_end, - &cached_state); + btrfs_lock_extent(&inode->io_tree, folio_start, folio_end, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, folio_start, folio_size(folio)); + btrfs_unlock_extent(&inode->io_tree, folio_start, folio_end, &cached_state); if (!ordered) break; @@ -1027,8 +1029,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode, * very likely resulting in a larger extent after writeback is * triggered (except in a case of free space fragmentation). */ - if (test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1, - EXTENT_DELALLOC)) + if (btrfs_test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1, + EXTENT_DELALLOC)) goto next; /* @@ -1066,8 +1068,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode, /* Empty target list, no way to merge with last entry */ if (list_empty(target_list)) goto next; - last = list_entry(target_list->prev, - struct defrag_target_range, list); + last = list_last_entry(target_list, + struct defrag_target_range, list); /* Not mergeable with last entry */ if (last->start + last->len != cur) goto next; @@ -1077,7 +1079,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, add: last_is_target = true; - range_len = min(extent_map_end(em), start + len) - cur; + range_len = min(btrfs_extent_map_end(em), start + len) - cur; /* * This one is a good target, check if it can be merged into * last range of the target list. 
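defrag_collect_targets() appends ranges in increasing offset order, so a new candidate can only ever merge with the tail of target_list; both call sites in the hunks above therefore fetch it with list_last_entry() and extend it in place when contiguous. Sketched from the code above:

    struct defrag_target_range *last;

    last = list_last_entry(target_list, struct defrag_target_range, list);
    ASSERT(last->start + last->len <= cur);
    if (last->start + last->len == cur) {
    	last->len += range_len;	/* contiguous: grow the tail entry */
    } else {
    	/* not mergeable: allocate and append a new defrag_target_range */
    }

list_last_entry(), like the list_first_entry() conversions elsewhere in this series, states the intent that the open-coded list_entry(list->prev, ...) form left implicit.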
@@ -1085,8 +1087,8 @@ add: if (!list_empty(target_list)) { struct defrag_target_range *last; - last = list_entry(target_list->prev, - struct defrag_target_range, list); + last = list_last_entry(target_list, + struct defrag_target_range, list); ASSERT(last->start + last->len <= cur); if (last->start + last->len == cur) { /* Mergeable, enlarge the last entry */ @@ -1099,7 +1101,7 @@ add: /* Allocate new defrag_target_range */ new = kmalloc(sizeof(*new), GFP_NOFS); if (!new) { - free_extent_map(em); + btrfs_free_extent_map(em); ret = -ENOMEM; break; } @@ -1108,8 +1110,8 @@ add: list_add_tail(&new->list, target_list); next: - cur = extent_map_end(em); - free_extent_map(em); + cur = btrfs_extent_map_end(em); + btrfs_free_extent_map(em); } if (ret < 0) { struct defrag_target_range *entry; @@ -1162,27 +1164,31 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, struct extent_changeset *data_reserved = NULL; const u64 start = target->start; const u64 len = target->len; - unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; - unsigned long start_index = start >> PAGE_SHIFT; - unsigned long first_index = folios[0]->index; int ret = 0; - int i; - - ASSERT(last_index - first_index + 1 <= nr_pages); ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len); if (ret < 0) return ret; - clear_extent_bit(&inode->io_tree, start, start + len - 1, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, cached_state); - set_extent_bit(&inode->io_tree, start, start + len - 1, - EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state); - - /* Update the page status */ - for (i = start_index - first_index; i <= last_index - first_index; i++) { - folio_clear_checked(folios[i]); - btrfs_folio_clamp_set_dirty(fs_info, folios[i], start, len); + btrfs_clear_extent_bit(&inode->io_tree, start, start + len - 1, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, cached_state); + btrfs_set_extent_bit(&inode->io_tree, start, start + len - 1, + EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state); + + /* + * Update the page status. + * Due to possible large folios, we have to check all folios one by one. 
+ */ + for (int i = 0; i < nr_pages && folios[i]; i++) { + struct folio *folio = folios[i]; + + if (!folio) + break; + if (start >= folio_pos(folio) + folio_size(folio) || + start + len <= folio_pos(folio)) + continue; + btrfs_folio_clamp_clear_checked(fs_info, folio, start, len); + btrfs_folio_clamp_set_dirty(fs_info, folio, start, len); } btrfs_delalloc_release_extents(inode, len); extent_changeset_free(data_reserved); @@ -1200,11 +1206,10 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, LIST_HEAD(target_list); struct folio **folios; const u32 sectorsize = inode->root->fs_info->sectorsize; - u64 last_index = (start + len - 1) >> PAGE_SHIFT; - u64 start_index = start >> PAGE_SHIFT; - unsigned int nr_pages = last_index - start_index + 1; + u64 cur = start; + const unsigned int nr_pages = ((start + len - 1) >> PAGE_SHIFT) - + (start >> PAGE_SHIFT) + 1; int ret = 0; - int i; ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); @@ -1214,21 +1219,25 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, return -ENOMEM; /* Prepare all pages */ - for (i = 0; i < nr_pages; i++) { - folios[i] = defrag_prepare_one_folio(inode, start_index + i); + for (int i = 0; cur < start + len && i < nr_pages; i++) { + folios[i] = defrag_prepare_one_folio(inode, cur >> PAGE_SHIFT); if (IS_ERR(folios[i])) { ret = PTR_ERR(folios[i]); - nr_pages = i; + folios[i] = NULL; goto free_folios; } + cur = folio_pos(folios[i]) + folio_size(folios[i]); } - for (i = 0; i < nr_pages; i++) + for (int i = 0; i < nr_pages; i++) { + if (!folios[i]) + break; folio_wait_writeback(folios[i]); + } + /* We should get at least one folio. */ + ASSERT(folios[0]); /* Lock the pages range */ - lock_extent(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, - &cached_state); + btrfs_lock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state); /* * Now we have a consistent view about the extent map, re-check * which range really needs to be defragged. @@ -1254,11 +1263,11 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, kfree(entry); } unlock_extent: - unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, - &cached_state); + btrfs_unlock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state); free_folios: - for (i = 0; i < nr_pages; i++) { + for (int i = 0; i < nr_pages; i++) { + if (!folios[i]) + break; folio_unlock(folios[i]); folio_put(folios[i]); } diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 88e900e5a43d..288e1776c02d 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -111,6 +111,18 @@ * making error handling and cleanup easier. 
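The delalloc-space.c header above describes the reservation API as built for easy cleanup; the pairing it implies is visible in defrag_one_locked_target() earlier in this diff and looks roughly like this (a sketch with error handling elided):

    struct extent_changeset *data_reserved = NULL;

    ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
    if (ret < 0)
    	return ret;

    /* ... mark the range delalloc / dirty the folios ... */

    btrfs_delalloc_release_extents(inode, len);	/* drop the outstanding-extent count */
    extent_changeset_free(data_reserved);

Every reservation has one obvious release point, which is what the comment means by easier error handling and cleanup.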
*/ +static inline struct btrfs_space_info *data_sinfo_for_inode(const struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + if (btrfs_is_zoned(fs_info) && btrfs_is_data_reloc_root(inode->root)) { + ASSERT(fs_info->data_sinfo->sub_group[0]->subgroup_id == + BTRFS_SUB_GROUP_DATA_RELOC); + return fs_info->data_sinfo->sub_group[0]; + } + return fs_info->data_sinfo; +} + int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes) { struct btrfs_root *root = inode->root; @@ -123,7 +135,7 @@ int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes) if (btrfs_is_free_space_inode(inode)) flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE; - return btrfs_reserve_data_bytes(fs_info, bytes, flush); + return btrfs_reserve_data_bytes(data_sinfo_for_inode(inode), bytes, flush); } int btrfs_check_data_free_space(struct btrfs_inode *inode, @@ -144,14 +156,14 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode, else if (btrfs_is_free_space_inode(inode)) flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE; - ret = btrfs_reserve_data_bytes(fs_info, len, flush); + ret = btrfs_reserve_data_bytes(data_sinfo_for_inode(inode), len, flush); if (ret < 0) return ret; /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); if (ret < 0) { - btrfs_free_reserved_data_space_noquota(fs_info, len); + btrfs_free_reserved_data_space_noquota(inode, len); extent_changeset_free(*reserved); *reserved = NULL; } else { @@ -168,15 +180,13 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode, * which we can't sleep and is sure it won't affect qgroup reserved space. * Like clear_bit_hook(). */ -void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, - u64 len) +void btrfs_free_reserved_data_space_noquota(struct btrfs_inode *inode, u64 len) { - struct btrfs_space_info *data_sinfo; + struct btrfs_fs_info *fs_info = inode->root->fs_info; ASSERT(IS_ALIGNED(len, fs_info->sectorsize)); - data_sinfo = fs_info->data_sinfo; - btrfs_space_info_free_bytes_may_use(data_sinfo, len); + btrfs_space_info_free_bytes_may_use(data_sinfo_for_inode(inode), len); } /* @@ -196,7 +206,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, round_down(start, fs_info->sectorsize); start = round_down(start, fs_info->sectorsize); - btrfs_free_reserved_data_space_noquota(fs_info, len); + btrfs_free_reserved_data_space_noquota(inode, len); btrfs_qgroup_free_data(inode, reserved, start, len, NULL); } @@ -439,6 +449,29 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) btrfs_inode_rsv_release(inode, true); } +/* Shrink a previously reserved extent to a new length. 
*/ +void btrfs_delalloc_shrink_extents(struct btrfs_inode *inode, u64 reserved_len, u64 new_len) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 reserved_num_extents = count_max_extents(fs_info, reserved_len); + const u32 new_num_extents = count_max_extents(fs_info, new_len); + const int diff_num_extents = new_num_extents - reserved_num_extents; + + ASSERT(new_len <= reserved_len); + if (new_num_extents == reserved_num_extents) + return; + + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, diff_num_extents); + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + + if (btrfs_is_testing(fs_info)) + return; + + btrfs_inode_rsv_release(inode, true); +} + /* * Reserve data and metadata space for delalloc * diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h index 3f32953c0a80..6119c0d3f883 100644 --- a/fs/btrfs/delalloc-space.h +++ b/fs/btrfs/delalloc-space.h @@ -18,8 +18,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, void btrfs_delalloc_release_space(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, bool qgroup_free); -void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, - u64 len); +void btrfs_free_reserved_data_space_noquota(struct btrfs_inode *inode, u64 len); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, bool qgroup_free); int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, @@ -27,5 +26,6 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, u64 disk_num_bytes, bool noflush); void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); +void btrfs_delalloc_shrink_extents(struct btrfs_inode *inode, u64 reserved_len, u64 new_len); #endif /* BTRFS_DELALLOC_SPACE_H */ diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 3f1551d8a5c6..c7cc24a5dd5e 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -119,7 +119,12 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( return NULL; } -/* Will return either the node or PTR_ERR(-ENOMEM) */ +/* + * Look up an existing delayed node associated with @btrfs_inode or create a new + * one and insert it to the delayed nodes of the root. + * + * Return the delayed node, or error pointer on failure. 
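Several hunks that follow replace open-coded NULL checks with list_first_entry_or_null() and rb_entry_safe(). For reference, rb_entry_safe() from include/linux/rbtree.h is, in current kernels, essentially:

    #define rb_entry_safe(ptr, type, member) \
    	({ typeof(ptr) ____ptr = (ptr); \
    	   ____ptr ? rb_entry(____ptr, type, member) : NULL; \
    	})

It folds the "if the node exists, convert it to its container, else return NULL" dance into one expression, which is why helpers like __btrfs_first_delayed_insertion_item() collapse to a single return statement below.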
+ */ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( struct btrfs_inode *btrfs_inode) { @@ -211,17 +216,13 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, static struct btrfs_delayed_node *btrfs_first_delayed_node( struct btrfs_delayed_root *delayed_root) { - struct list_head *p; - struct btrfs_delayed_node *node = NULL; + struct btrfs_delayed_node *node; spin_lock(&delayed_root->lock); - if (list_empty(&delayed_root->node_list)) - goto out; - - p = delayed_root->node_list.next; - node = list_entry(p, struct btrfs_delayed_node, n_list); - refcount_inc(&node->refs); -out: + node = list_first_entry_or_null(&delayed_root->node_list, + struct btrfs_delayed_node, n_list); + if (node) + refcount_inc(&node->refs); spin_unlock(&delayed_root->lock); return node; @@ -293,18 +294,15 @@ static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node) static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( struct btrfs_delayed_root *delayed_root) { - struct list_head *p; - struct btrfs_delayed_node *node = NULL; + struct btrfs_delayed_node *node; spin_lock(&delayed_root->lock); - if (list_empty(&delayed_root->prepare_list)) - goto out; - - p = delayed_root->prepare_list.next; - list_del_init(p); - node = list_entry(p, struct btrfs_delayed_node, p_list); - refcount_inc(&node->refs); -out: + node = list_first_entry_or_null(&delayed_root->prepare_list, + struct btrfs_delayed_node, p_list); + if (node) { + list_del_init(&node->p_list); + refcount_inc(&node->refs); + } spin_unlock(&delayed_root->lock); return node; @@ -454,40 +452,25 @@ static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item( struct btrfs_delayed_node *delayed_node) { - struct rb_node *p; - struct btrfs_delayed_item *item = NULL; + struct rb_node *p = rb_first_cached(&delayed_node->ins_root); - p = rb_first_cached(&delayed_node->ins_root); - if (p) - item = rb_entry(p, struct btrfs_delayed_item, rb_node); - - return item; + return rb_entry_safe(p, struct btrfs_delayed_item, rb_node); } static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item( struct btrfs_delayed_node *delayed_node) { - struct rb_node *p; - struct btrfs_delayed_item *item = NULL; - - p = rb_first_cached(&delayed_node->del_root); - if (p) - item = rb_entry(p, struct btrfs_delayed_item, rb_node); + struct rb_node *p = rb_first_cached(&delayed_node->del_root); - return item; + return rb_entry_safe(p, struct btrfs_delayed_item, rb_node); } static struct btrfs_delayed_item *__btrfs_next_delayed_item( struct btrfs_delayed_item *item) { - struct rb_node *p; - struct btrfs_delayed_item *next = NULL; - - p = rb_next(&item->rb_node); - if (p) - next = rb_entry(p, struct btrfs_delayed_item, rb_node); + struct rb_node *p = rb_next(&item->rb_node); - return next; + return rb_entry_safe(p, struct btrfs_delayed_item, rb_node); } static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, @@ -1397,17 +1380,17 @@ void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info) WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root)); } -static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) +static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) { int val = atomic_read(&delayed_root->items_seq); if (val < seq || val >= seq + BTRFS_DELAYED_BATCH) - return 1; + return true; if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) - return 1; + 
return true; - return 0; + return false; } void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 98c5b61dabe8..739c9e29aaa3 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -331,12 +331,9 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, struct btrfs_delayed_ref_node *ins) { struct rb_node *node = &ins->ref_node; - struct rb_node *exist; + struct rb_node *exist = rb_find_add_cached(node, root, cmp_refs_node); - exist = rb_find_add_cached(node, root, cmp_refs_node); - if (exist) - return rb_entry(exist, struct btrfs_delayed_ref_node, ref_node); - return NULL; + return rb_entry_safe(exist, struct btrfs_delayed_ref_node, ref_node); } static struct btrfs_delayed_ref_head *find_first_ref_head( @@ -1339,7 +1336,7 @@ int __init btrfs_delayed_ref_init(void) { btrfs_delayed_ref_head_cachep = KMEM_CACHE(btrfs_delayed_ref_head, 0); if (!btrfs_delayed_ref_head_cachep) - goto fail; + return -ENOMEM; btrfs_delayed_ref_node_cachep = KMEM_CACHE(btrfs_delayed_ref_node, 0); if (!btrfs_delayed_ref_node_cachep) diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index f5ae880308d3..78cc23837610 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -262,7 +262,6 @@ enum btrfs_ref_type { BTRFS_REF_NOT_SET, BTRFS_REF_DATA, BTRFS_REF_METADATA, - BTRFS_REF_LAST, } __packed; struct btrfs_ref { diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 53d7d85cb4be..2decb9fff445 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -637,7 +637,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: - ASSERT(0); + DEBUG_WARN("unexpected STARTED or SUSPENDED dev-replace state"); ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; up_write(&dev_replace->rwsem); goto leave; @@ -794,17 +794,17 @@ static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, lockdep_assert_held(&srcdev->fs_info->chunk_mutex); - while (find_first_extent_bit(&srcdev->alloc_state, start, - &found_start, &found_end, - CHUNK_ALLOCATED, &cached_state)) { - ret = set_extent_bit(&tgtdev->alloc_state, found_start, - found_end, CHUNK_ALLOCATED, NULL); + while (btrfs_find_first_extent_bit(&srcdev->alloc_state, start, + &found_start, &found_end, + CHUNK_ALLOCATED, &cached_state)) { + ret = btrfs_set_extent_bit(&tgtdev->alloc_state, found_start, + found_end, CHUNK_ALLOCATED, NULL); if (ret) break; start = found_end + 1; } - free_extent_state(cached_state); + btrfs_free_extent_state(cached_state); return ret; } @@ -1265,16 +1265,16 @@ static int btrfs_dev_replace_kthread(void *data) return 0; } -int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) +bool __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) { if (!dev_replace->is_valid) - return 0; + return false; switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: - return 0; + return false; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: /* @@ -1289,7 +1289,7 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) */ break; } - return 1; + return true; } void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount) diff --git a/fs/btrfs/dev-replace.h 
b/fs/btrfs/dev-replace.h index 23e480efe5e6..b35cecf388f2 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -25,7 +25,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); -int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); +bool __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, struct btrfs_block_group *cache, u64 physical); diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index a374ce7a1813..fe9a4bd7e6e6 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -42,21 +42,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, /* Direct lock must be taken before the extent lock. */ if (nowait) { - if (!try_lock_dio_extent(io_tree, lockstart, lockend, cached_state)) + if (!btrfs_try_lock_dio_extent(io_tree, lockstart, lockend, cached_state)) return -EAGAIN; } else { - lock_dio_extent(io_tree, lockstart, lockend, cached_state); + btrfs_lock_dio_extent(io_tree, lockstart, lockend, cached_state); } while (1) { if (nowait) { - if (!try_lock_extent(io_tree, lockstart, lockend, - cached_state)) { + if (!btrfs_try_lock_extent(io_tree, lockstart, lockend, + cached_state)) { ret = -EAGAIN; break; } } else { - lock_extent(io_tree, lockstart, lockend, cached_state); + btrfs_lock_extent(io_tree, lockstart, lockend, cached_state); } /* * We're concerned with the entire range that we're going to be @@ -78,7 +78,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, lockstart, lockend))) break; - unlock_extent(io_tree, lockstart, lockend, cached_state); + btrfs_unlock_extent(io_tree, lockstart, lockend, cached_state); if (ordered) { if (nowait) { @@ -131,7 +131,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, } if (ret) - unlock_dio_extent(io_tree, lockstart, lockend, cached_state); + btrfs_unlock_dio_extent(io_tree, lockstart, lockend, cached_state); return ret; } @@ -151,11 +151,11 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, } ordered = btrfs_alloc_ordered_extent(inode, start, file_extent, - (1 << type) | - (1 << BTRFS_ORDERED_DIRECT)); + (1U << type) | + (1U << BTRFS_ORDERED_DIRECT)); if (IS_ERR(ordered)) { if (em) { - free_extent_map(em); + btrfs_free_extent_map(em); btrfs_drop_extent_map_range(inode, start, start + file_extent->num_bytes - 1, false); } @@ -204,8 +204,7 @@ again: BTRFS_ORDERED_REGULAR); btrfs_dec_block_group_reservations(fs_info, ins.objectid); if (IS_ERR(em)) - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, - 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); return em; } @@ -246,7 +245,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, else type = BTRFS_ORDERED_NOCOW; len = min(len, em->len - (start - em->start)); - block_start = extent_map_block_start(em) + (start - em->start); + block_start = btrfs_extent_map_block_start(em) + (start - em->start); if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent, false) == 1) { @@ -265,7 +264,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, nowait); if (ret < 0) { /* Our caller expects us to free the input extent map. 
*/ - free_extent_map(em); + btrfs_free_extent_map(em); *map = NULL; btrfs_dec_nocow_writers(bg); if (nowait && (ret == -ENOSPC || ret == -EDQUOT)) @@ -278,7 +277,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, &file_extent, type); btrfs_dec_nocow_writers(bg); if (type == BTRFS_ORDERED_PREALLOC) { - free_extent_map(em); + btrfs_free_extent_map(em); *map = em2; em = em2; } @@ -291,7 +290,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, dio_data->nocow_done = true; } else { /* Our caller expects us to free the input extent map. */ - free_extent_map(em); + btrfs_free_extent_map(em); *map = NULL; if (nowait) { @@ -440,8 +439,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, start, data_alloc_len, false); if (!ret) dio_data->data_space_reserved = true; - else if (ret && !(BTRFS_I(inode)->flags & - (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) + else if (!(BTRFS_I(inode)->flags & + (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) goto err; } @@ -474,8 +473,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, * to buffered IO. Don't blame me, this is the price we pay for using * the generic code. */ - if (extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) { - free_extent_map(em); + if (btrfs_extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) { + btrfs_free_extent_map(em); /* * If we are in a NOWAIT context, return -EAGAIN in order to * fallback to buffered IO. This is not only because we can @@ -516,7 +515,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, * after we have submitted bios for all the extents in the range. */ if ((flags & IOMAP_NOWAIT) && len < length) { - free_extent_map(em); + btrfs_free_extent_map(em); ret = -EAGAIN; goto unlock_err; } @@ -558,13 +557,13 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; } else { - iomap->addr = extent_map_block_start(em) + (start - em->start); + iomap->addr = btrfs_extent_map_block_start(em) + (start - em->start); iomap->type = IOMAP_MAPPED; } iomap->offset = start; iomap->bdev = fs_info->fs_devices->latest_dev->bdev; iomap->length = len; - free_extent_map(em); + btrfs_free_extent_map(em); /* * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed, @@ -575,13 +574,13 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (write) unlock_bits |= EXTENT_DIO_LOCKED; - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - unlock_bits, &cached_state); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + unlock_bits, &cached_state); /* We didn't use everything, unlock the dio extent for the remainder. */ if (!write && (start + len) < lockend) - unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len, - lockend, NULL); + btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len, + lockend, NULL); return 0; @@ -591,8 +590,8 @@ unlock_err: * to update this, be explicit that we expect EXTENT_LOCKED and * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing. 
*/ - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state); err: if (dio_data->data_space_reserved) { btrfs_free_reserved_data_space(BTRFS_I(inode), @@ -615,8 +614,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, if (!write && (iomap->type == IOMAP_HOLE)) { /* If reading from a hole, unlock and return */ - unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1, NULL); + btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, NULL); return 0; } @@ -627,8 +626,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, btrfs_finish_ordered_extent(dio_data->ordered, NULL, pos, length, false); else - unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1, NULL); + btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, NULL); ret = -ENOTBLK; } if (write) { @@ -660,8 +659,8 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio) dip->file_offset, dip->bytes, !bio->bi_status); } else { - unlock_dio_extent(&inode->io_tree, dip->file_offset, - dip->file_offset + dip->bytes - 1, NULL); + btrfs_unlock_dio_extent(&inode->io_tree, dip->file_offset, + dip->file_offset + dip->bytes - 1, NULL); } bbio->bio.bi_private = bbio->private; @@ -692,9 +691,9 @@ static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, * a pre-existing one. */ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { - ret = split_extent_map(bbio->inode, bbio->file_offset, - ordered->num_bytes, len, - ordered->disk_bytenr); + ret = btrfs_split_extent_map(bbio->inode, bbio->file_offset, + ordered->num_bytes, len, + ordered->disk_bytenr); if (ret) return ret; } diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index d6eef4bd9e9d..89fe85778115 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -94,8 +94,6 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { lockdep_assert_held(&discard_ctl->lock); - if (!btrfs_run_discard_work(discard_ctl)) - return; if (list_empty(&block_group->discard_list) || block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) { @@ -118,6 +116,9 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, if (!btrfs_is_block_group_data_only(block_group)) return; + if (!btrfs_run_discard_work(discard_ctl)) + return; + spin_lock(&discard_ctl->lock); __add_to_discard_list(discard_ctl, block_group); spin_unlock(&discard_ctl->lock); @@ -244,6 +245,20 @@ again: block_group->used != 0) { if (btrfs_is_block_group_data_only(block_group)) { __add_to_discard_list(discard_ctl, block_group); + /* + * The block group must have been moved to another + * discard list even if discard was disabled in + * the meantime or a transaction abort happened, + * otherwise we can end up in an infinite loop, + * always jumping into the 'again' label and + * getting this block group over and over + * when there are no other block groups in + * the discard lists. 
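+ *
+ * With the btrfs_run_discard_work() check moved out of
+ * __add_to_discard_list(), the call above is guaranteed to
+ * have put the block group on a discard list and assigned
+ * it a valid discard_index, which the assertion below checks.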
+ */ + ASSERT(block_group->discard_index != + BTRFS_DISCARD_INDEX_UNUSED, + "discard_index=%d", + block_group->discard_index); } else { list_del_init(&block_group->discard_list); btrfs_put_block_group(block_group); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index aa58e0663a5d..1beb9458f622 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -193,10 +193,11 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, u64 end = min_t(u64, eb->start + eb->len, folio_pos(folio) + eb->folio_size); u32 len = end - start; + phys_addr_t paddr = PFN_PHYS(folio_pfn(folio)) + + offset_in_folio(folio, start); - ret = btrfs_repair_io_failure(fs_info, 0, start, len, - start, folio, offset_in_folio(folio, start), - mirror_num); + ret = btrfs_repair_io_failure(fs_info, 0, start, len, start, + paddr, mirror_num); if (ret) break; } @@ -224,7 +225,6 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, ASSERT(check); while (1) { - clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = read_extent_buffer_pages(eb, mirror_num, check); if (!ret) break; @@ -256,7 +256,7 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, /* * Checksum a dirty tree block before IO. */ -blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) +int btree_csum_one_bio(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; @@ -267,9 +267,9 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) /* Btree blocks are always contiguous on disk. */ if (WARN_ON_ONCE(bbio->file_offset != eb->start)) - return BLK_STS_IOERR; + return -EIO; if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len)) - return BLK_STS_IOERR; + return -EIO; /* * If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't @@ -278,13 +278,13 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) */ if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) { memzero_extent_buffer(eb, 0, eb->len); - return BLK_STS_OK; + return 0; } if (WARN_ON_ONCE(found_start != eb->start)) - return BLK_STS_IOERR; + return -EIO; if (WARN_ON(!btrfs_meta_folio_test_uptodate(eb->folios[0], eb))) - return BLK_STS_IOERR; + return -EIO; ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, offsetof(struct btrfs_header, fsid), @@ -312,7 +312,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) goto error; } write_extent_buffer(eb, result, 0, fs_info->csum_size); - return BLK_STS_OK; + return 0; error: btrfs_print_tree(eb, 0); @@ -326,7 +326,7 @@ error: */ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) || btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID); - return errno_to_blk_status(ret); + return ret; } static bool check_tree_block_fsid(struct extent_buffer *eb) @@ -452,15 +452,9 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, goto out; } - /* - * If this is a leaf block and it is corrupt, set the corrupt bit so - * that we don't try and read the other copies of this block, just - * return -EIO. - */ - if (found_level == 0 && btrfs_check_leaf(eb)) { - set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); + /* If this is a leaf block and it is corrupt, just return -EIO. 
*/ + if (found_level == 0 && btrfs_check_leaf(eb)) ret = -EIO; - } if (found_level > 0 && btrfs_check_node(eb)) ret = -EIO; @@ -641,11 +635,16 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, } -static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, - u64 objectid) +static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, + u64 objectid, gfp_t flags) { + struct btrfs_root *root; bool dummy = btrfs_is_testing(fs_info); + root = kzalloc(sizeof(*root), flags); + if (!root) + return NULL; + memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); @@ -698,10 +697,10 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, btrfs_set_root_last_log_commit(root, 0); root->anon_dev = 0; if (!dummy) { - extent_io_tree_init(fs_info, &root->dirty_log_pages, - IO_TREE_ROOT_DIRTY_LOG_PAGES); - extent_io_tree_init(fs_info, &root->log_csum_range, - IO_TREE_LOG_CSUM_RANGE); + btrfs_extent_io_tree_init(fs_info, &root->dirty_log_pages, + IO_TREE_ROOT_DIRTY_LOG_PAGES); + btrfs_extent_io_tree_init(fs_info, &root->log_csum_range, + IO_TREE_LOG_CSUM_RANGE); } spin_lock_init(&root->root_item_lock); @@ -712,14 +711,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, list_add_tail(&root->leak_list, &fs_info->allocated_roots); spin_unlock(&fs_info->fs_roots_radix_lock); #endif -} -static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, - u64 objectid, gfp_t flags) -{ - struct btrfs_root *root = kzalloc(sizeof(*root), flags); - if (root) - __setup_root(root, fs_info, objectid); return root; } @@ -1863,8 +1855,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) int i; while (!list_empty(&fs_info->dead_roots)) { - gang[0] = list_entry(fs_info->dead_roots.next, - struct btrfs_root, root_list); + gang[0] = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); list_del(&gang[0]->root_list); if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) @@ -1927,9 +1919,9 @@ static int btrfs_init_btree_inode(struct super_block *sb) inode->i_mapping->a_ops = &btree_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, - IO_TREE_BTREE_INODE_IO); - extent_map_tree_init(&BTRFS_I(inode)->extent_tree); + btrfs_extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, + IO_TREE_BTREE_INODE_IO); + btrfs_extent_map_tree_init(&BTRFS_I(inode)->extent_tree); BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); @@ -2002,7 +1994,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan", ordered_flags); fs_info->discard_ctl.discard_workers = - alloc_ordered_workqueue("btrfs_discard", WQ_FREEZABLE); + alloc_ordered_workqueue("btrfs-discard", WQ_FREEZABLE); if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->flush_workers && @@ -2769,10 +2761,21 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) return ret; } +/* + * Lockdep gets confused between our buffer_tree which requires IRQ locking because + * we modify marks in the IRQ context, and our delayed inode xarray which doesn't + * have these requirements. Use a class key so lockdep doesn't get them mixed up. 
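+ *
+ * This is the usual lockdep recipe for separating lock classes: a static
+ * struct lock_class_key plus lockdep_set_class() on the xa_lock, as done
+ * just below. Without it, both xarrays' locks would share one class and
+ * lockdep could report false positive lock inversions between them.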
+ */ +static struct lock_class_key buffer_xa_class; + void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); - INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); + + /* Use the same flags as mapping->i_pages. */ + xa_init_flags(&fs_info->buffer_tree, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); + lockdep_set_class(&fs_info->buffer_tree.xa_lock, &buffer_xa_class); + INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); @@ -2784,7 +2787,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->super_lock); - spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); spin_lock_init(&fs_info->treelog_bg_lock); spin_lock_init(&fs_info->zone_active_bgs_lock); @@ -2829,6 +2831,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) BTRFS_BLOCK_RSV_GLOBAL); btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); + btrfs_init_block_rsv(&fs_info->treelog_rsv, BTRFS_BLOCK_RSV_TREELOG); btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); btrfs_init_block_rsv(&fs_info->delayed_block_rsv, BTRFS_BLOCK_RSV_DELOPS); @@ -2862,8 +2865,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) rwlock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT_CACHED; - extent_io_tree_init(fs_info, &fs_info->excluded_extents, - IO_TREE_FS_EXCLUDED_EXTENTS); + btrfs_extent_io_tree_init(fs_info, &fs_info->excluded_extents, + IO_TREE_FS_EXCLUDED_EXTENTS); mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); @@ -3315,7 +3318,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device /* * Read super block and check the signature bytes only */ - disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev); + disk_super = btrfs_read_disk_super(fs_devices->latest_dev->bdev, 0, false); if (IS_ERR(disk_super)) { ret = PTR_ERR(disk_super); goto fail_alloc; @@ -3710,85 +3713,6 @@ static void btrfs_end_super_write(struct bio *bio) bio_put(bio); } -struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, - int copy_num, bool drop_cache) -{ - struct btrfs_super_block *super; - struct page *page; - u64 bytenr, bytenr_orig; - struct address_space *mapping = bdev->bd_mapping; - int ret; - - bytenr_orig = btrfs_sb_offset(copy_num); - ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr); - if (ret == -ENOENT) - return ERR_PTR(-EINVAL); - else if (ret) - return ERR_PTR(ret); - - if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev)) - return ERR_PTR(-EINVAL); - - if (drop_cache) { - /* This should only be called with the primary sb. */ - ASSERT(copy_num == 0); - - /* - * Drop the page of the primary superblock, so later read will - * always read from the device. 
- */ - invalidate_inode_pages2_range(mapping, - bytenr >> PAGE_SHIFT, - (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT); - } - - page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); - if (IS_ERR(page)) - return ERR_CAST(page); - - super = page_address(page); - if (btrfs_super_magic(super) != BTRFS_MAGIC) { - btrfs_release_disk_super(super); - return ERR_PTR(-ENODATA); - } - - if (btrfs_super_bytenr(super) != bytenr_orig) { - btrfs_release_disk_super(super); - return ERR_PTR(-EINVAL); - } - - return super; -} - - -struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev) -{ - struct btrfs_super_block *super, *latest = NULL; - int i; - u64 transid = 0; - - /* we would like to check all the supers, but that would make - * a btrfs mount succeed after a mkfs from a different FS. - * So, we need to add a special mount option to scan for - * later supers, using BTRFS_SUPER_MIRROR_MAX instead - */ - for (i = 0; i < 1; i++) { - super = btrfs_read_dev_one_super(bdev, i, false); - if (IS_ERR(super)) - continue; - - if (!latest || btrfs_super_generation(super) > transid) { - if (latest) - btrfs_release_disk_super(super); - - latest = super; - transid = btrfs_super_generation(super); - } - } - - return super; -} - /* * Write superblock @sb to the @device. Do not wait for completion, all the * folios we use for writing are locked. @@ -3828,8 +3752,8 @@ static int write_dev_supers(struct btrfs_device *device, continue; } else if (ret < 0) { btrfs_err(device->fs_info, - "couldn't get super block location for mirror %d", - i); + "couldn't get super block location for mirror %d error %d", + i, ret); atomic_inc(&device->sb_write_errors); continue; } @@ -3848,8 +3772,8 @@ static int write_dev_supers(struct btrfs_device *device, GFP_NOFS); if (IS_ERR(folio)) { btrfs_err(device->fs_info, - "couldn't get super block page for bytenr %llu", - bytenr); + "couldn't get super block page for bytenr %llu error %ld", + bytenr, PTR_ERR(folio)); atomic_inc(&device->sb_write_errors); continue; } @@ -4244,8 +4168,9 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) u64 found_end; found = true; - while (find_first_extent_bit(&trans->dirty_pages, cur, - &found_start, &found_end, EXTENT_DIRTY, &cached)) { + while (btrfs_find_first_extent_bit(&trans->dirty_pages, cur, + &found_start, &found_end, + EXTENT_DIRTY, &cached)) { dirty_bytes += found_end + 1 - found_start; cur = found_end + 1; } @@ -4441,7 +4366,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags); if (btrfs_check_quota_leak(fs_info)) { - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN("qgroup reserved space leaked"); btrfs_err(fs_info, "qgroup reserved space leaked"); } @@ -4698,9 +4623,9 @@ static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, u64 start = 0; u64 end; - while (find_first_extent_bit(dirty_pages, start, &start, &end, - mark, NULL)) { - clear_extent_bits(dirty_pages, start, end, mark); + while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end, + mark, NULL)) { + btrfs_clear_extent_bits(dirty_pages, start, end, mark); while (start <= end) { eb = find_extent_buffer(fs_info, start); start += fs_info->nodesize; @@ -4733,14 +4658,14 @@ static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, * the same extent range. 
*/ mutex_lock(&fs_info->unused_bg_unpin_mutex); - if (!find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, &cached_state)) { + if (!btrfs_find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, &cached_state)) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } - clear_extent_dirty(unpin, start, end, &cached_state); - free_extent_state(cached_state); + btrfs_clear_extent_dirty(unpin, start, end, &cached_state); + btrfs_free_extent_state(cached_state); btrfs_error_unpin_extent_range(fs_info, start, end); mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 587842991b24..864a55a96226 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -58,9 +58,6 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, const struct btrfs_super_block *sb, int mirror_num); int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount); int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors); -struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev); -struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, - int copy_num, bool drop_cache); int btrfs_commit_super(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, const struct btrfs_key *key); @@ -114,7 +111,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int btrfs_read_extent_buffer(struct extent_buffer *buf, const struct btrfs_tree_parent_check *check); -blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio); +int btree_csum_one_bio(struct btrfs_bio *bbio); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 13de6af279e5..b1b96eb5f64e 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -42,7 +42,7 @@ static inline void btrfs_extent_state_leak_debug_check(void) struct extent_state *state; while (!list_empty(&states)) { - state = list_entry(states.next, struct extent_state, leak_list); + state = list_first_entry(&states, struct extent_state, leak_list); pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", state->start, state->end, state->state, extent_state_in_tree(state), @@ -59,13 +59,12 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, struct extent_io_tree *tree, u64 start, u64 end) { - const struct btrfs_inode *inode; + const struct btrfs_inode *inode = tree->inode; u64 isize; if (tree->owner != IO_TREE_INODE_IO) return; - inode = extent_io_tree_to_inode_const(tree); isize = i_size_read(&inode->vfs_inode); if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { btrfs_debug_rl(inode->root->fs_info, @@ -80,25 +79,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) #endif - -/* - * The only tree allowed to set the inode is IO_TREE_INODE_IO. - */ -static bool is_inode_io_tree(const struct extent_io_tree *tree) -{ - return tree->owner == IO_TREE_INODE_IO; -} - -/* Return the inode if it's valid for the given tree, otherwise NULL. */ -struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree) -{ - if (tree->owner == IO_TREE_INODE_IO) - return tree->inode; - return NULL; -} - /* Read-only access to the inode. 
*/ -const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree) +const struct btrfs_inode *btrfs_extent_io_tree_to_inode(const struct extent_io_tree *tree) { if (tree->owner == IO_TREE_INODE_IO) return tree->inode; @@ -106,15 +88,15 @@ const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_t } /* For read-only access to fs_info. */ -const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree) +const struct btrfs_fs_info *btrfs_extent_io_tree_to_fs_info(const struct extent_io_tree *tree) { if (tree->owner == IO_TREE_INODE_IO) return tree->inode->root->fs_info; return tree->fs_info; } -void extent_io_tree_init(struct btrfs_fs_info *fs_info, - struct extent_io_tree *tree, unsigned int owner) +void btrfs_extent_io_tree_init(struct btrfs_fs_info *fs_info, + struct extent_io_tree *tree, unsigned int owner) { tree->state = RB_ROOT; spin_lock_init(&tree->lock); @@ -129,7 +111,7 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, * aren't any waiters on any extent state record (EXTENT_LOCK_BITS are never * set on any extent state when calling this function). */ -void extent_io_tree_release(struct extent_io_tree *tree) +void btrfs_extent_io_tree_release(struct extent_io_tree *tree) { struct rb_root root; struct extent_state *state; @@ -148,7 +130,7 @@ void extent_io_tree_release(struct extent_io_tree *tree) * (see wait_extent_bit()). */ ASSERT(!waitqueue_active(&state->wq)); - free_extent_state(state); + btrfs_free_extent_state(state); cond_resched_lock(&tree->lock); } /* @@ -176,7 +158,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) btrfs_leak_debug_add_state(state); refcount_set(&state->refs, 1); init_waitqueue_head(&state->wq); - trace_alloc_extent_state(state, mask, _RET_IP_); + trace_btrfs_alloc_extent_state(state, mask, _RET_IP_); return state; } @@ -188,14 +170,14 @@ static struct extent_state *alloc_extent_state_atomic(struct extent_state *preal return prealloc; } -void free_extent_state(struct extent_state *state) +void btrfs_free_extent_state(struct extent_state *state) { if (!state) return; if (refcount_dec_and_test(&state->refs)) { WARN_ON(extent_state_in_tree(state)); btrfs_leak_debug_del_state(state); - trace_free_extent_state(state, _RET_IP_); + trace_btrfs_free_extent_state(state, _RET_IP_); kmem_cache_free(extent_state_cache, state); } } @@ -222,38 +204,34 @@ static inline struct extent_state *next_state(struct extent_state *state) { struct rb_node *next = rb_next(&state->rb_node); - if (next) - return rb_entry(next, struct extent_state, rb_node); - else - return NULL; + return rb_entry_safe(next, struct extent_state, rb_node); } static inline struct extent_state *prev_state(struct extent_state *state) { struct rb_node *next = rb_prev(&state->rb_node); - if (next) - return rb_entry(next, struct extent_state, rb_node); - else - return NULL; + return rb_entry_safe(next, struct extent_state, rb_node); } /* - * Search @tree for an entry that contains @offset. Such entry would have - * entry->start <= offset && entry->end >= offset. + * Search @tree for an entry that contains @offset or if none exists for the + * first entry that starts and ends after that offset. 
* * @tree: the tree to search - * @offset: offset that should fall within an entry in @tree + * @offset: search offset * @node_ret: pointer where new node should be anchored (used when inserting an * entry in the tree) * @parent_ret: points to entry which would have been the parent of the entry, * containing @offset * - * Return a pointer to the entry that contains @offset byte address and don't change - * @node_ret and @parent_ret. + * Return a pointer to the entry that contains @offset byte address. + * + * If no such entry exists, return the first entry that starts and ends after + * @offset if one exists, otherwise NULL. * - * If no such entry exists, return pointer to entry that ends before @offset - * and fill parameters @node_ret and @parent_ret, ie. does not return NULL. + * If the returned entry starts at @offset, then @node_ret and @parent_ret + * aren't changed. */ static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree, u64 offset, @@ -282,7 +260,11 @@ static inline struct extent_state *tree_search_for_insert(struct extent_io_tree if (parent_ret) *parent_ret = prev; - /* Search neighbors until we find the first one past the end */ + /* + * Return either the current entry if it contains offset (it ends after + * or at offset) or the first entry that starts and ends after offset if + * one exists, or NULL. + */ while (entry && offset > entry->end) entry = next_state(entry); @@ -351,7 +333,7 @@ static void __cold extent_io_tree_panic(const struct extent_io_tree *tree, const char *opname, int err) { - btrfs_panic(extent_io_tree_to_fs_info(tree), err, + btrfs_panic(btrfs_extent_io_tree_to_fs_info(tree), err, "extent io tree error on %s state start %llu end %llu", opname, state->start, state->end); } @@ -362,13 +344,12 @@ static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *s prev = prev_state(state); if (prev && prev->end == state->start - 1 && prev->state == state->state) { - if (is_inode_io_tree(tree)) - btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree), - state, prev); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_merge_delalloc_extent(tree->inode, state, prev); state->start = prev->start; rb_erase(&prev->rb_node, &tree->state); RB_CLEAR_NODE(&prev->rb_node); - free_extent_state(prev); + btrfs_free_extent_state(prev); } } @@ -378,13 +359,12 @@ static void merge_next_state(struct extent_io_tree *tree, struct extent_state *s next = next_state(state); if (next && next->start == state->end + 1 && next->state == state->state) { - if (is_inode_io_tree(tree)) - btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree), - state, next); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_merge_delalloc_extent(tree->inode, state, next); state->end = next->end; rb_erase(&next->rb_node, &tree->state); RB_CLEAR_NODE(&next->rb_node); - free_extent_state(next); + btrfs_free_extent_state(next); } } @@ -413,8 +393,8 @@ static void set_state_bits(struct extent_io_tree *tree, u32 bits_to_set = bits & ~EXTENT_CTLBITS; int ret; - if (is_inode_io_tree(tree)) - btrfs_set_delalloc_extent(extent_io_tree_to_inode(tree), state, bits); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_set_delalloc_extent(tree->inode, state, bits); ret = add_extent_changeset(state, bits_to_set, changeset, 1); BUG_ON(ret < 0); @@ -459,10 +439,9 @@ static struct extent_state *insert_state(struct extent_io_tree *tree, if (state->end < entry->start) { if (try_merge && end == entry->start && state->state == entry->state) { - if (is_inode_io_tree(tree)) - 
btrfs_merge_delalloc_extent( - extent_io_tree_to_inode(tree), - state, entry); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_merge_delalloc_extent(tree->inode, + state, entry); entry->start = state->start; merge_prev_state(tree, entry); state->state = 0; @@ -472,10 +451,9 @@ static struct extent_state *insert_state(struct extent_io_tree *tree, } else if (state->end > entry->end) { if (try_merge && entry->end == start && state->state == entry->state) { - if (is_inode_io_tree(tree)) - btrfs_merge_delalloc_extent( - extent_io_tree_to_inode(tree), - state, entry); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_merge_delalloc_extent(tree->inode, + state, entry); entry->end = state->end; merge_next_state(tree, entry); state->state = 0; @@ -527,9 +505,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct rb_node *parent = NULL; struct rb_node **node; - if (is_inode_io_tree(tree)) - btrfs_split_delalloc_extent(extent_io_tree_to_inode(tree), orig, - split); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_split_delalloc_extent(tree->inode, orig, split); prealloc->start = orig->start; prealloc->end = split - 1; @@ -549,7 +526,7 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, } else if (prealloc->end > entry->end) { node = &(*node)->rb_right; } else { - free_extent_state(prealloc); + btrfs_free_extent_state(prealloc); return -EEXIST; } } @@ -561,6 +538,18 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, } /* + * Use this during tree iteration to avoid doing next node searches when it's + * not needed (the current record ends at or after the target range's end). + */ +static inline struct extent_state *next_search_state(struct extent_state *state, u64 end) +{ + if (state->end < end) + return next_state(state); + + return NULL; +} + +/* * Utility function to clear some bits in an extent state struct. It will * optionally wake up anyone waiting on this state (wake == 1). * @@ -569,16 +558,15 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, */ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, - u32 bits, int wake, + u32 bits, int wake, u64 end, struct extent_changeset *changeset) { struct extent_state *next; u32 bits_to_clear = bits & ~EXTENT_CTLBITS; int ret; - if (is_inode_io_tree(tree)) - btrfs_clear_delalloc_extent(extent_io_tree_to_inode(tree), state, - bits); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_clear_delalloc_extent(tree->inode, state, bits); ret = add_extent_changeset(state, bits_to_clear, changeset, 0); BUG_ON(ret < 0); @@ -586,17 +574,17 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, if (wake) wake_up(&state->wq); if (state->state == 0) { - next = next_state(state); + next = next_search_state(state, end); if (extent_state_in_tree(state)) { rb_erase(&state->rb_node, &tree->state); RB_CLEAR_NODE(&state->rb_node); - free_extent_state(state); + btrfs_free_extent_state(state); } else { WARN_ON(1); } } else { merge_state(tree, state); - next = next_state(state); + next = next_search_state(state, end); } return next; } @@ -620,18 +608,18 @@ static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask) * * This takes the tree lock, and returns 0 on success and < 0 on error. 
*/ -int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached_state, - struct extent_changeset *changeset) +int btrfs_clear_extent_bit_changeset(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state, + struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *cached; struct extent_state *prealloc = NULL; u64 last_end; - int err; - int clear = 0; - int wake; - int delete = (bits & EXTENT_CLEAR_ALL_BITS); + int ret = 0; + bool clear; + bool wake; + const bool delete = (bits & EXTENT_CLEAR_ALL_BITS); gfp_t mask; set_gfp_mask_from_bits(&bits, &mask); @@ -644,9 +632,8 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (bits & EXTENT_DELALLOC) bits |= EXTENT_NORESERVE; - wake = ((bits & EXTENT_LOCK_BITS) ? 1 : 0); - if (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)) - clear = 1; + wake = (bits & EXTENT_LOCK_BITS); + clear = (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)); again: if (!prealloc) { /* @@ -676,7 +663,7 @@ again: goto hit_next; } if (clear) - free_extent_state(cached); + btrfs_free_extent_state(cached); } /* This search will find the extents that end after our range starts. */ @@ -691,7 +678,7 @@ hit_next: /* The state doesn't have the wanted bits, go ahead. */ if (!(state->state & bits)) { - state = next_state(state); + state = next_search_state(state, end); goto next; } @@ -714,18 +701,24 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) goto search_again; - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, state, "split", err); - + ret = split_state(tree, state, prealloc, start); prealloc = NULL; - if (err) + if (ret) { + extent_io_tree_panic(tree, state, "split", ret); goto out; + } if (state->end <= end) { - state = clear_state_bit(tree, state, bits, wake, changeset); + state = clear_state_bit(tree, state, bits, wake, end, + changeset); goto next; } - goto search_again; + if (need_resched()) + goto search_again; + /* + * Fallthrough and try atomic extent state allocation if needed. + * If it fails we'll jump to 'search_again' to retry the allocation + * in non-atomic mode and start the search again. 
+ */ } /* * | ---- desired range ---- | @@ -736,30 +729,31 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) goto search_again; - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, state, "split", err); + ret = split_state(tree, state, prealloc, end + 1); + if (ret) { + extent_io_tree_panic(tree, state, "split", ret); + prealloc = NULL; + goto out; + } if (wake) wake_up(&state->wq); - clear_state_bit(tree, prealloc, bits, wake, changeset); + clear_state_bit(tree, prealloc, bits, wake, end, changeset); prealloc = NULL; goto out; } - state = clear_state_bit(tree, state, bits, wake, changeset); + state = clear_state_bit(tree, state, bits, wake, end, changeset); next: - if (last_end == (u64)-1) + if (last_end >= end) goto out; start = last_end + 1; - if (start <= end && state && !need_resched()) + if (state && !need_resched()) goto hit_next; search_again: - if (start > end) - goto out; spin_unlock(&tree->lock); if (gfpflags_allow_blocking(mask)) cond_resched(); @@ -767,10 +761,9 @@ search_again: out: spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); + btrfs_free_extent_state(prealloc); - return 0; + return ret; } @@ -820,7 +813,7 @@ process_node: schedule(); spin_lock(&tree->lock); finish_wait(&state->wq, &wait); - free_extent_state(state); + btrfs_free_extent_state(state); goto again; } start = state->end + 1; @@ -838,7 +831,7 @@ out: if (cached_state && *cached_state) { state = *cached_state; *cached_state = NULL; - free_extent_state(state); + btrfs_free_extent_state(state); } spin_unlock(&tree->lock); } @@ -877,7 +870,7 @@ static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *t */ state = tree_search(tree, start); while (state) { - if (state->end >= start && (state->state & bits)) + if (state->state & bits) return state; state = next_state(state); } @@ -892,9 +885,9 @@ static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *t * Return true if we find something, and update @start_ret and @end_ret. * Return false if we found nothing. */ -bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits, - struct extent_state **cached_state) +bool btrfs_find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits, + struct extent_state **cached_state) { struct extent_state *state; bool ret = false; @@ -914,13 +907,13 @@ bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, * again. If we haven't found any, clear as well since * it's now useless. */ - free_extent_state(*cached_state); + btrfs_free_extent_state(*cached_state); *cached_state = NULL; if (state) goto got_it; goto out; } - free_extent_state(*cached_state); + btrfs_free_extent_state(*cached_state); *cached_state = NULL; } @@ -952,14 +945,17 @@ out: * contiguous area for given bits. We will search to the first bit we find, and * then walk down the tree until we find a non-contiguous area. The area * returned will be the full contiguous area with the bits set. + * + * Returns true if we found a range with the given bits set, in which case + * @start_ret and @end_ret are updated, or false if no range was found. 
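+ *
+ * A typical caller would look like the following sketch (process_range()
+ * is a made up helper, only for illustration):
+ *
+ *	u64 found_start = 0;
+ *	u64 found_end;
+ *
+ *	if (btrfs_find_contiguous_extent_bit(tree, 0, &found_start,
+ *					     &found_end, EXTENT_DIRTY))
+ *		process_range(found_start, found_end);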
*/ -int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits) +bool btrfs_find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits) { struct extent_state *state; - int ret = 1; + bool ret = false; - ASSERT(!btrfs_fs_incompat(extent_io_tree_to_fs_info(tree), NO_HOLES)); + ASSERT(!btrfs_fs_incompat(btrfs_extent_io_tree_to_fs_info(tree), NO_HOLES)); spin_lock(&tree->lock); state = find_first_extent_bit_state(tree, start, bits); @@ -971,7 +967,7 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, break; *end_ret = state->end; } - ret = 0; + ret = true; } spin_unlock(&tree->lock); return ret; @@ -1046,11 +1042,11 @@ out: * * [start, end] is inclusive This takes the tree lock. */ -static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, u64 *failed_start, - struct extent_state **failed_state, - struct extent_state **cached_state, - struct extent_changeset *changeset) +static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, u64 *failed_start, + struct extent_state **failed_state, + struct extent_state **cached_state, + struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -1129,12 +1125,11 @@ hit_next: set_state_bits(tree, state, bits, changeset); cache_state(state, cached_state); merge_state(tree, state); - if (last_end == (u64)-1) + if (last_end >= end) goto out; start = last_end + 1; state = next_state(state); - if (start < end && state && state->start == start && - !need_resched()) + if (state && state->start == start && !need_resched()) goto hit_next; goto search_again; } @@ -1186,12 +1181,11 @@ hit_next: set_state_bits(tree, state, bits, changeset); cache_state(state, cached_state); merge_state(tree, state); - if (last_end == (u64)-1) + if (last_end >= end) goto out; start = last_end + 1; state = next_state(state); - if (start < end && state && state->start == start && - !need_resched()) + if (state && state->start == start && !need_resched()) goto hit_next; } goto search_again; @@ -1204,14 +1198,8 @@ hit_next: * extent we found. */ if (state->start > start) { - u64 this_end; struct extent_state *inserted_state; - if (end < last_start) - this_end = end; - else - this_end = last_start - 1; - prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) goto search_again; @@ -1221,17 +1209,38 @@ hit_next: * extent. */ prealloc->start = start; - prealloc->end = this_end; + if (end < last_start) + prealloc->end = end; + else + prealloc->end = last_start - 1; + inserted_state = insert_state(tree, prealloc, bits, changeset); if (IS_ERR(inserted_state)) { ret = PTR_ERR(inserted_state); extent_io_tree_panic(tree, prealloc, "insert", ret); + goto out; } cache_state(inserted_state, cached_state); if (inserted_state == prealloc) prealloc = NULL; - start = this_end + 1; + start = inserted_state->end + 1; + + /* Beyond target range, stop. */ + if (start > end) + goto out; + + if (need_resched()) + goto search_again; + + state = next_search_state(inserted_state, end); + /* + * If there's a next state, whether contiguous or not, we don't + * need to unlock and start search again. If it's not contiguous + * we will end up here and try to allocate a prealloc state and insert. 
+ */ + if (state) + goto hit_next; goto search_again; } /* @@ -1252,8 +1261,11 @@ hit_next: if (!prealloc) goto search_again; ret = split_state(tree, state, prealloc, end + 1); - if (ret) + if (ret) { extent_io_tree_panic(tree, state, "split", ret); + prealloc = NULL; + goto out; + } set_state_bits(tree, prealloc, bits, changeset); cache_state(prealloc, cached_state); @@ -1272,18 +1284,16 @@ search_again: out: spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); + btrfs_free_extent_state(prealloc); return ret; } -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached_state) +int btrfs_set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state) { - return __set_extent_bit(tree, start, end, bits, NULL, NULL, - cached_state, NULL); + return set_extent_bit(tree, start, end, bits, NULL, NULL, cached_state, NULL); } /* @@ -1304,9 +1314,9 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, * * All allocations are done with GFP_NOFS. */ -int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, u32 clear_bits, - struct extent_state **cached_state) +int btrfs_convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, u32 clear_bits, + struct extent_state **cached_state) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -1374,12 +1384,11 @@ hit_next: if (state->start == start && state->end <= end) { set_state_bits(tree, state, bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, clear_bits, 0, NULL); - if (last_end == (u64)-1) + state = clear_state_bit(tree, state, clear_bits, 0, end, NULL); + if (last_end >= end) goto out; start = last_end + 1; - if (start < end && state && state->start == start && - !need_resched()) + if (state && state->start == start && !need_resched()) goto hit_next; goto search_again; } @@ -1406,20 +1415,19 @@ hit_next: goto out; } ret = split_state(tree, state, prealloc, start); - if (ret) - extent_io_tree_panic(tree, state, "split", ret); prealloc = NULL; - if (ret) + if (ret) { + extent_io_tree_panic(tree, state, "split", ret); goto out; + } if (state->end <= end) { set_state_bits(tree, state, bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, clear_bits, 0, NULL); - if (last_end == (u64)-1) + state = clear_state_bit(tree, state, clear_bits, 0, end, NULL); + if (last_end >= end) goto out; start = last_end + 1; - if (start < end && state && state->start == start && - !need_resched()) + if (state && state->start == start && !need_resched()) goto hit_next; } goto search_again; @@ -1432,14 +1440,8 @@ hit_next: * extent we found. */ if (state->start > start) { - u64 this_end; struct extent_state *inserted_state; - if (end < last_start) - this_end = end; - else - this_end = last_start - 1; - prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) { ret = -ENOMEM; @@ -1451,16 +1453,37 @@ hit_next: * extent. 
*/ prealloc->start = start; - prealloc->end = this_end; + if (end < last_start) + prealloc->end = end; + else + prealloc->end = last_start - 1; + inserted_state = insert_state(tree, prealloc, bits, NULL); if (IS_ERR(inserted_state)) { ret = PTR_ERR(inserted_state); extent_io_tree_panic(tree, prealloc, "insert", ret); + goto out; } cache_state(inserted_state, cached_state); if (inserted_state == prealloc) prealloc = NULL; - start = this_end + 1; + start = inserted_state->end + 1; + + /* Beyond target range, stop. */ + if (start > end) + goto out; + + if (need_resched()) + goto search_again; + + state = next_search_state(inserted_state, end); + /* + * If there's a next state, whether contiguous or not, we don't + * need to unlock and start search again. If it's not contiguous + * we will end up here and try to allocate a prealloc state and insert. + */ + if (state) + goto hit_next; goto search_again; } /* @@ -1477,12 +1500,15 @@ hit_next: } ret = split_state(tree, state, prealloc, end + 1); - if (ret) + if (ret) { extent_io_tree_panic(tree, state, "split", ret); + prealloc = NULL; + goto out; + } set_state_bits(tree, prealloc, bits, NULL); cache_state(prealloc, cached_state); - clear_state_bit(tree, prealloc, clear_bits, 0, NULL); + clear_state_bit(tree, prealloc, clear_bits, 0, end, NULL); prealloc = NULL; goto out; } @@ -1497,8 +1523,7 @@ search_again: out: spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); + btrfs_free_extent_state(prealloc); return ret; } @@ -1518,8 +1543,8 @@ out: * spans (last_range_end, end of device]. In this case it's up to the caller to * trim @end_ret to the appropriate size. */ -void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits) +void btrfs_find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits) { struct extent_state *state; struct extent_state *prev = NULL, *next = NULL; @@ -1636,10 +1661,10 @@ out: * all given bits set. If the returned number of bytes is greater than zero * then @start is updated with the offset of the first byte with the bits set. */ -u64 count_range_bits(struct extent_io_tree *tree, - u64 *start, u64 search_end, u64 max_bytes, - u32 bits, int contig, - struct extent_state **cached_state) +u64 btrfs_count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, u64 max_bytes, + u32 bits, int contig, + struct extent_state **cached_state) { struct extent_state *state = NULL; struct extent_state *cached; @@ -1710,7 +1735,7 @@ search: } if (cached_state) { - free_extent_state(*cached_state); + btrfs_free_extent_state(*cached_state); *cached_state = state; if (state) refcount_inc(&state->refs); @@ -1724,16 +1749,16 @@ search: /* * Check if the single @bit exists in the given range. 
*/ -bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit) +bool btrfs_test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit) { - struct extent_state *state = NULL; + struct extent_state *state; bool bitset = false; ASSERT(is_power_of_2(bit)); spin_lock(&tree->lock); state = tree_search(tree, start); - while (state && start <= end) { + while (state) { if (state->start > end) break; @@ -1742,9 +1767,7 @@ bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 break; } - /* If state->end is (u64)-1, start will overflow to 0 */ - start = state->end + 1; - if (start > end || start == 0) + if (state->end >= end) break; state = next_state(state); } @@ -1752,16 +1775,51 @@ bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 return bitset; } +void btrfs_get_range_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 *bits, + struct extent_state **cached_state) +{ + struct extent_state *state; + + /* + * The cached state is currently mandatory and not used to start the + * search, only to cache the first state record found in the range. + */ + ASSERT(cached_state != NULL); + ASSERT(*cached_state == NULL); + + *bits = 0; + + spin_lock(&tree->lock); + state = tree_search(tree, start); + if (state && state->start < end) { + *cached_state = state; + refcount_inc(&state->refs); + } + while (state) { + if (state->start > end) + break; + + *bits |= state->state; + + if (state->end >= end) + break; + + state = next_state(state); + } + spin_unlock(&tree->lock); +} + /* * Check if the whole range [@start,@end) contains the single @bit set. */ -bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, - struct extent_state *cached) +bool btrfs_test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, + struct extent_state *cached) { - struct extent_state *state = NULL; + struct extent_state *state; bool bitset = true; ASSERT(is_power_of_2(bit)); + ASSERT(start < end); spin_lock(&tree->lock); if (cached && extent_state_in_tree(cached) && cached->start <= start && @@ -1769,30 +1827,22 @@ bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, state = cached; else state = tree_search(tree, start); - while (state && start <= end) { + while (state) { if (state->start > start) { bitset = false; break; } - if (state->start > end) - break; - if ((state->state & bit) == 0) { bitset = false; break; } - if (state->end == (u64)-1) + if (state->end >= end) break; - /* - * Last entry (if state->end is (u64)-1 and overflow happens), - * or next entry starts after the range. - */ + /* Next state must start where this one ends. 
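+ * (The addition below cannot overflow: we only get here when
+ * state->end < end, so state->end is strictly below U64_MAX.)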
*/ start = state->end + 1; - if (start > end || start == 0) - break; state = next_state(state); } @@ -1804,8 +1854,8 @@ bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, } /* Wrappers around set/clear extent bit */ -int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset) +int btrfs_set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset) { /* * We don't support EXTENT_LOCK_BITS yet, as current changeset will @@ -1814,11 +1864,11 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ ASSERT(!(bits & EXTENT_LOCK_BITS)); - return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset); + return set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset); } -int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset) +int btrfs_clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset) { /* * Don't support EXTENT_LOCK_BITS case, same reason as @@ -1826,20 +1876,21 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ ASSERT(!(bits & EXTENT_LOCK_BITS)); - return __clear_extent_bit(tree, start, end, bits, NULL, changeset); + return btrfs_clear_extent_bit_changeset(tree, start, end, bits, NULL, changeset); } -bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached) +bool btrfs_try_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached) { int err; u64 failed_start; - err = __set_extent_bit(tree, start, end, bits, &failed_start, - NULL, cached, NULL); + err = set_extent_bit(tree, start, end, bits, &failed_start, NULL, + cached, NULL); if (err == -EEXIST) { if (failed_start > start) - clear_extent_bit(tree, start, failed_start - 1, bits, cached); + btrfs_clear_extent_bit(tree, start, failed_start - 1, + bits, cached); return 0; } return 1; @@ -1849,35 +1900,54 @@ bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits * Either insert or lock state struct between start and end use mask to tell * us if waiting is desired. */ -int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached_state) +int btrfs_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached_state) { struct extent_state *failed_state = NULL; int err; u64 failed_start; - err = __set_extent_bit(tree, start, end, bits, &failed_start, - &failed_state, cached_state, NULL); + err = set_extent_bit(tree, start, end, bits, &failed_start, + &failed_state, cached_state, NULL); while (err == -EEXIST) { if (failed_start != start) - clear_extent_bit(tree, start, failed_start - 1, - bits, cached_state); + btrfs_clear_extent_bit(tree, start, failed_start - 1, + bits, cached_state); wait_extent_bit(tree, failed_start, end, bits, &failed_state); - err = __set_extent_bit(tree, start, end, bits, - &failed_start, &failed_state, - cached_state, NULL); + err = set_extent_bit(tree, start, end, bits, &failed_start, + &failed_state, cached_state, NULL); } return err; } -void __cold extent_state_free_cachep(void) +/* + * Get the extent state that follows the given extent state. 
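 *
 * (As a caller-side sketch of the renamed locking wrappers just above,
 * assuming an inode's io_tree; the pairing itself is unchanged:
 *
 *	struct extent_state *cached = NULL;
 *
 *	btrfs_lock_extent(&inode->io_tree, start, end, &cached);
 *	... operate on the locked range [start, end] ...
 *	btrfs_unlock_extent(&inode->io_tree, start, end, &cached);
 *
 * btrfs_lock_extent_bits() keeps retrying on -EEXIST, clearing any
 * partially locked prefix before waiting, so the whole range is held when
 * it returns.)
 *
 * Get the extent state that follows the given extent state.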
+ * This is meant to be used in a context where we know no other tasks can + * concurrently modify the tree. + */ +struct extent_state *btrfs_next_extent_state(struct extent_io_tree *tree, + struct extent_state *state) +{ + struct extent_state *next; + + spin_lock(&tree->lock); + ASSERT(extent_state_in_tree(state)); + next = next_state(state); + if (next) + refcount_inc(&next->refs); + spin_unlock(&tree->lock); + + return next; +} + +void __cold btrfs_extent_state_free_cachep(void) { btrfs_extent_state_leak_debug_check(); kmem_cache_destroy(extent_state_cache); } -int __init extent_state_init_cachep(void) +int __init btrfs_extent_state_init_cachep(void) { extent_state_cache = kmem_cache_create("btrfs_extent_state", sizeof(struct extent_state), 0, 0, diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 6ffef1cd37c1..0a18ca9c59c3 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -17,7 +17,6 @@ struct btrfs_inode; /* Bits for the extent state */ enum { ENUM_BIT(EXTENT_DIRTY), - ENUM_BIT(EXTENT_UPTODATE), ENUM_BIT(EXTENT_LOCKED), ENUM_BIT(EXTENT_DIO_LOCKED), ENUM_BIT(EXTENT_NEW), @@ -39,6 +38,11 @@ enum { */ ENUM_BIT(EXTENT_DELALLOC_NEW), /* + * Mark that a range is being locked for finishing an ordered extent. + * Used together with EXTENT_LOCKED. + */ + ENUM_BIT(EXTENT_FINISHING_ORDERED), + /* * When an ordered extent successfully completes for a region marked as * a new delalloc range, use this flag when clearing a new delalloc * range to indicate that the VFS' inode number of bytes should be @@ -130,117 +134,116 @@ struct extent_state { #endif }; -struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree); -const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree); -const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree); +const struct btrfs_inode *btrfs_extent_io_tree_to_inode(const struct extent_io_tree *tree); +const struct btrfs_fs_info *btrfs_extent_io_tree_to_fs_info(const struct extent_io_tree *tree); -void extent_io_tree_init(struct btrfs_fs_info *fs_info, - struct extent_io_tree *tree, unsigned int owner); -void extent_io_tree_release(struct extent_io_tree *tree); -int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached); -bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached); +void btrfs_extent_io_tree_init(struct btrfs_fs_info *fs_info, + struct extent_io_tree *tree, unsigned int owner); +void btrfs_extent_io_tree_release(struct extent_io_tree *tree); +int btrfs_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached); +bool btrfs_try_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached); -static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached) +static inline int btrfs_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) { - return __lock_extent(tree, start, end, EXTENT_LOCKED, cached); + return btrfs_lock_extent_bits(tree, start, end, EXTENT_LOCKED, cached); } -static inline bool try_lock_extent(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) +static inline bool btrfs_try_lock_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) { - return __try_lock_extent(tree, start, end, 
EXTENT_LOCKED, cached); + return btrfs_try_lock_extent_bits(tree, start, end, EXTENT_LOCKED, cached); } -int __init extent_state_init_cachep(void); -void __cold extent_state_free_cachep(void); - -u64 count_range_bits(struct extent_io_tree *tree, - u64 *start, u64 search_end, - u64 max_bytes, u32 bits, int contig, - struct extent_state **cached_state); - -void free_extent_state(struct extent_state *state); -bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, - struct extent_state *cached_state); -bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit); -int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset); -int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached, - struct extent_changeset *changeset); - -static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start, - u64 end, u32 bits, - struct extent_state **cached) -{ - return __clear_extent_bit(tree, start, end, bits, cached, NULL); -} +int __init btrfs_extent_state_init_cachep(void); +void __cold btrfs_extent_state_free_cachep(void); + +u64 btrfs_count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, + u64 max_bytes, u32 bits, int contig, + struct extent_state **cached_state); -static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached) +void btrfs_free_extent_state(struct extent_state *state); +bool btrfs_test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, + struct extent_state *cached_state); +bool btrfs_test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit); +void btrfs_get_range_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 *bits, + struct extent_state **cached_state); +int btrfs_clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset); +int btrfs_clear_extent_bit_changeset(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached, + struct extent_changeset *changeset); + +static inline int btrfs_clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 end, u32 bits, + struct extent_state **cached) { - return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, NULL); + return btrfs_clear_extent_bit_changeset(tree, start, end, bits, cached, NULL); } -static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, u32 bits) +static inline int btrfs_unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) { - return clear_extent_bit(tree, start, end, bits, NULL); + return btrfs_clear_extent_bit_changeset(tree, start, end, EXTENT_LOCKED, + cached, NULL); } -int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset); -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached_state); - -static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state) +static inline int btrfs_clear_extent_bits(struct extent_io_tree *tree, u64 start, + u64 end, u32 bits) { - return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, - cached_state, NULL); + return btrfs_clear_extent_bit(tree, start, end, bits, NULL); } -static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 
start, - u64 end, struct extent_state **cached) +int btrfs_set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset); +int btrfs_set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state); + +static inline int btrfs_clear_extent_dirty(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) { - return clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, cached); + return btrfs_clear_extent_bit(tree, start, end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, cached); } -int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, u32 clear_bits, - struct extent_state **cached_state); - -bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits, - struct extent_state **cached_state); -void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits); -int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits); +int btrfs_convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, u32 clear_bits, + struct extent_state **cached_state); + +bool btrfs_find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits, + struct extent_state **cached_state); +void btrfs_find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits); +bool btrfs_find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits); bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); -static inline int lock_dio_extent(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) +static inline int btrfs_lock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) { - return __lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached); + return btrfs_lock_extent_bits(tree, start, end, EXTENT_DIO_LOCKED, cached); } -static inline bool try_lock_dio_extent(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) +static inline bool btrfs_try_lock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) { - return __try_lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached); + return btrfs_try_lock_extent_bits(tree, start, end, EXTENT_DIO_LOCKED, cached); } -static inline int unlock_dio_extent(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) +static inline int btrfs_unlock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) { - return __clear_extent_bit(tree, start, end, EXTENT_DIO_LOCKED, cached, NULL); + return btrfs_clear_extent_bit_changeset(tree, start, end, EXTENT_DIO_LOCKED, + cached, NULL); } +struct extent_state *btrfs_next_extent_state(struct extent_io_tree *tree, + struct extent_state *state); + #endif /* BTRFS_EXTENT_IO_TREE_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 957230abd827..cb6128778a83 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -409,15 +409,15 @@ static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, btrfs_extent_data_ref_offset(leaf, ref)); } -static int 
match_extent_data_ref(struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref, - u64 root_objectid, u64 owner, u64 offset) +static bool match_extent_data_ref(struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref, + u64 root_objectid, u64 owner, u64 offset) { if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || btrfs_extent_data_ref_objectid(leaf, ref) != owner || btrfs_extent_data_ref_offset(leaf, ref) != offset) - return 0; - return 1; + return false; + return true; } static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, @@ -2006,7 +2006,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; if (min_bytes == 0) { - max_count = delayed_refs->num_heads_ready; + /* + * We may be subject to a harmless race if some task is + * concurrently adding or removing a delayed ref, so silence + * KCSAN and similar tools. + */ + max_count = data_race(delayed_refs->num_heads_ready); min_bytes = U64_MAX; } @@ -2598,8 +2603,8 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - set_extent_bit(&trans->transaction->pinned_extents, bytenr, - bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); + btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr, + bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); return 0; } @@ -2818,34 +2823,63 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *block_group, *tmp; struct list_head *deleted_bgs; - struct extent_io_tree *unpin; + struct extent_io_tree *unpin = &trans->transaction->pinned_extents; + struct extent_state *cached_state = NULL; u64 start; u64 end; + int unpin_error = 0; int ret; - unpin = &trans->transaction->pinned_extents; + mutex_lock(&fs_info->unused_bg_unpin_mutex); + btrfs_find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, &cached_state); - while (!TRANS_ABORTED(trans)) { - struct extent_state *cached_state = NULL; - - mutex_lock(&fs_info->unused_bg_unpin_mutex); - if (!find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, &cached_state)) { - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - break; - } + while (!TRANS_ABORTED(trans) && cached_state) { + struct extent_state *next_state; if (btrfs_test_opt(fs_info, DISCARD_SYNC)) ret = btrfs_discard_extent(fs_info, start, end + 1 - start, NULL); - clear_extent_dirty(unpin, start, end, &cached_state); + next_state = btrfs_next_extent_state(unpin, cached_state); + btrfs_clear_extent_dirty(unpin, start, end, &cached_state); ret = unpin_extent_range(fs_info, start, end, true); - BUG_ON(ret); - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - free_extent_state(cached_state); - cond_resched(); + /* + * If we get an error unpinning an extent range, store the first + * error to return later after trying to unpin all ranges and do + * the sync discards. Our caller will abort the transaction + * (which already wrote new superblocks) and on the next mount + * the space will be available as it was pinned by in-memory + * only structures in this phase. 
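 *
 * Note the iteration pattern used by this loop: the successor record is
 * referenced via btrfs_next_extent_state() before EXTENT_DIRTY is cleared,
 * because clearing bits can merge or free the current state, after which
 * walking from it would be unsafe. Condensed (advance_after_clear() is a
 * hypothetical helper, sketch only):
 *
 *	static struct extent_state *advance_after_clear(struct extent_io_tree *tree,
 *							struct extent_state *cur)
 *	{
 *		struct extent_state *next = btrfs_next_extent_state(tree, cur);
 *
 *		btrfs_clear_extent_dirty(tree, cur->start, cur->end, &cur);
 *		return next;
 *	}
 *
 * The need_resched() branch below drops the lookahead reference and
 * restarts the search instead, since it releases the mutex.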
+ */ + if (ret) { + btrfs_err_rl(fs_info, +"failed to unpin extent range [%llu, %llu] when committing transaction %llu: %s (%d)", + start, end, trans->transid, + btrfs_decode_error(ret), ret); + if (!unpin_error) + unpin_error = ret; + } + + btrfs_free_extent_state(cached_state); + + if (need_resched()) { + btrfs_free_extent_state(next_state); + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + cond_resched(); + cached_state = NULL; + mutex_lock(&fs_info->unused_bg_unpin_mutex); + btrfs_find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, &cached_state); + } else { + cached_state = next_state; + if (cached_state) { + start = cached_state->start; + end = cached_state->end; + } + } } + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + btrfs_free_extent_state(cached_state); if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) { btrfs_discard_calc_delay(&fs_info->discard_ctl); @@ -2859,14 +2893,10 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) */ deleted_bgs = &trans->transaction->deleted_bgs; list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { - u64 trimmed = 0; - ret = -EROFS; if (!TRANS_ABORTED(trans)) - ret = btrfs_discard_extent(fs_info, - block_group->start, - block_group->length, - &trimmed); + ret = btrfs_discard_extent(fs_info, block_group->start, + block_group->length, NULL); /* * Not strictly necessary to lock, as the block_group should be @@ -2888,7 +2918,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) } } - return 0; + return unpin_error; } /* @@ -3483,17 +3513,11 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans, WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(bg, buf->start, buf->len); - btrfs_free_reserved_bytes(bg, buf->len, 0); + btrfs_free_reserved_bytes(bg, buf->len, false); btrfs_put_block_group(bg); trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); out: - - /* - * Deleting the buffer, clear the corrupt flag since it doesn't - * matter anymore. - */ - clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); return 0; } @@ -4111,6 +4135,7 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info, static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, struct btrfs_key *ins, struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info, bool full_search) { struct btrfs_root *root = fs_info->chunk_root; @@ -4165,7 +4190,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return ret; } - ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, + ret = btrfs_chunk_alloc(trans, space_info, ffe_ctl->flags, CHUNK_ALLOC_FORCE_FOR_EXTENT); /* Do not bail out on ENOSPC since we can do more. */ @@ -4382,11 +4407,22 @@ static noinline int find_free_extent(struct btrfs_root *root, ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(root, ffe_ctl); + trace_btrfs_find_free_extent(root, ffe_ctl); space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); + if (btrfs_is_zoned(fs_info) && space_info) { + /* Use dedicated sub-space_info for dedicated block group users. 
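 *
 * Condensed, the selection below amounts to the following sketch (zoned
 * mode only):
 *
 *	if (ffe_ctl->for_data_reloc || ffe_ctl->for_treelog)
 *		space_info = space_info->sub_group[0];
 *
 * with the ASSERTs double-checking that the subgroup_id matches the
 * intended user (BTRFS_SUB_GROUP_DATA_RELOC or BTRFS_SUB_GROUP_TREELOG).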
*/ + if (ffe_ctl->for_data_reloc) { + space_info = space_info->sub_group[0]; + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); + } else if (ffe_ctl->for_treelog) { + space_info = space_info->sub_group[0]; + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_TREELOG); + } + } if (!space_info) { - btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags); + btrfs_err(fs_info, "no space info for %llu, tree-log %d, relocation %d", + ffe_ctl->flags, ffe_ctl->for_treelog, ffe_ctl->for_data_reloc); return -ENOSPC; } @@ -4408,6 +4444,7 @@ static noinline int find_free_extent(struct btrfs_root *root, * picked out then we don't care that the block group is cached. */ if (block_group && block_group_bits(block_group, ffe_ctl->flags) && + block_group->space_info == space_info && block_group->cached != BTRFS_CACHE_NO) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || @@ -4433,7 +4470,7 @@ static noinline int find_free_extent(struct btrfs_root *root, } } search: - trace_find_free_extent_search_loop(root, ffe_ctl); + trace_btrfs_find_free_extent_search_loop(root, ffe_ctl); ffe_ctl->have_caching_bg = false; if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || ffe_ctl->index == 0) @@ -4485,7 +4522,7 @@ search: } have_block_group: - trace_find_free_extent_have_block_group(root, ffe_ctl, block_group); + trace_btrfs_find_free_extent_have_block_group(root, ffe_ctl, block_group); ffe_ctl->cached = btrfs_block_group_done(block_group); if (unlikely(!ffe_ctl->cached)) { ffe_ctl->have_caching_bg = true; @@ -4578,7 +4615,8 @@ loop: } up_read(&space_info->groups_sem); - ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search); + ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, space_info, + full_search); if (ret > 0) goto search; @@ -4700,8 +4738,8 @@ again: return ret; } -int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len, int delalloc) +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, + bool is_delalloc) { struct btrfs_block_group *cache; @@ -4713,7 +4751,7 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, } btrfs_add_free_space(cache, start, len); - btrfs_free_reserved_bytes(cache, len, delalloc); + btrfs_free_reserved_bytes(cache, len, is_delalloc); trace_btrfs_reserved_extent_free(fs_info, start, len); btrfs_put_block_group(cache); @@ -5071,17 +5109,17 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, * EXTENT bit to differentiate dirty pages. 
*/ if (buf->log_index == 0) - set_extent_bit(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, - EXTENT_DIRTY, NULL); + btrfs_set_extent_bit(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, + EXTENT_DIRTY, NULL); else - set_extent_bit(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, - EXTENT_NEW, NULL); + btrfs_set_extent_bit(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, + EXTENT_NEW, NULL); } else { buf->log_index = -1; - set_extent_bit(&trans->transaction->dirty_pages, buf->start, - buf->start + buf->len - 1, EXTENT_DIRTY, NULL); + btrfs_set_extent_bit(&trans->transaction->dirty_pages, buf->start, + buf->start + buf->len - 1, EXTENT_DIRTY, NULL); } /* this returns a buffer locked for blocking */ return buf; @@ -5187,7 +5225,7 @@ out_free_buf: btrfs_tree_unlock(buf); free_extent_buffer(buf); out_free_reserved: - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, false); out_unuse: btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize); return ERR_PTR(ret); @@ -6397,13 +6435,13 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) if (ret) break; - find_first_clear_extent_bit(&device->alloc_state, start, - &start, &end, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_find_first_clear_extent_bit(&device->alloc_state, start, + &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); /* Check if there are any CHUNK_* bits left */ if (start > device->total_bytes) { - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); btrfs_warn_in_rcu(fs_info, "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu", start, end - start + 1, @@ -6436,8 +6474,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) ret = btrfs_issue_discard(device->bdev, start, len, &bytes); if (!ret) - set_extent_bit(&device->alloc_state, start, - start + bytes - 1, CHUNK_TRIMMED, NULL); + btrfs_set_extent_bit(&device->alloc_state, start, + start + bytes - 1, CHUNK_TRIMMED, NULL); mutex_unlock(&fs_info->chunk_mutex); if (ret) diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 0ed682d9ed7b..72914074c304 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -149,8 +149,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, int slot); -int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len, int delalloc); +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, + bool is_delalloc); int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, const struct extent_buffer *eb); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 197f5e51c474..e43f6280f954 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -96,6 +96,8 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) */ struct btrfs_bio_ctrl { struct btrfs_bio *bbio; + /* Last byte contained in bbio + 1 . 
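 *
 * In other words, the file offset at which the next added range must start
 * if it is to be merged into the current bio. A condensed sketch of the
 * merge test this enables (mirroring btrfs_bio_is_contig() below;
 * can_merge is a local illustration):
 *
 *	can_merge = bio_ctrl->next_file_offset == file_offset &&
 *		    bio_end_sector(bio) == (disk_bytenr >> SECTOR_SHIFT);
 *
 * Both the on-disk sector and the logical file offset must be contiguous,
 * since btrfs_bio->file_offset has to describe the whole bio.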
*/ + loff_t next_file_offset; enum btrfs_compression_type compress_type; u32 len_to_oe_boundary; blk_opf_t opf; @@ -221,22 +223,17 @@ static void __process_folios_contig(struct address_space *mapping, } static noinline void unlock_delalloc_folio(const struct inode *inode, - const struct folio *locked_folio, + struct folio *locked_folio, u64 start, u64 end) { - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - ASSERT(locked_folio); - if (index == locked_folio->index && end_index == index) - return; __process_folios_contig(inode->i_mapping, locked_folio, start, end, PAGE_UNLOCK); } static noinline int lock_delalloc_folios(struct inode *inode, - const struct folio *locked_folio, + struct folio *locked_folio, u64 start, u64 end) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); @@ -246,9 +243,6 @@ static noinline int lock_delalloc_folios(struct inode *inode, u64 processed_end = start; struct folio_batch fbatch; - if (index == locked_folio->index && index == end_index) - return 0; - folio_batch_init(&fbatch); while (index <= end_index) { unsigned int found_folios, i; @@ -340,7 +334,7 @@ again: /* @delalloc_end can be -1, never go beyond @orig_end */ *end = min(delalloc_end, orig_end); - free_extent_state(cached_state); + btrfs_free_extent_state(cached_state); return false; } @@ -366,7 +360,7 @@ again: /* some of the folios are gone, lets avoid looping by * shortening the size of the delalloc range we're searching */ - free_extent_state(cached_state); + btrfs_free_extent_state(cached_state); cached_state = NULL; if (!loops) { max_bytes = PAGE_SIZE; @@ -379,13 +373,13 @@ again: } /* step three, lock the state bits for the whole range */ - lock_extent(tree, delalloc_start, delalloc_end, &cached_state); + btrfs_lock_extent(tree, delalloc_start, delalloc_end, &cached_state); /* then test to make sure it is all still delalloc */ - ret = test_range_bit(tree, delalloc_start, delalloc_end, - EXTENT_DELALLOC, cached_state); + ret = btrfs_test_range_bit(tree, delalloc_start, delalloc_end, + EXTENT_DELALLOC, cached_state); - unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); + btrfs_unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); if (!ret) { unlock_delalloc_folio(inode, locked_folio, delalloc_start, delalloc_end); @@ -403,7 +397,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached, u32 clear_bits, unsigned long page_ops) { - clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached); + btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached); __process_folios_contig(inode->vfs_inode.i_mapping, locked_folio, start, end, page_ops); @@ -462,9 +456,6 @@ static void end_bbio_data_write(struct btrfs_bio *bbio) u64 start = folio_pos(folio) + fi.offset; u32 len = fi.length; - /* Only order 0 (single page) folios are allowed for data. */ - ASSERT(folio_order(folio) == 0); - /* Our read/write should always be sector aligned. 
*/ if (!IS_ALIGNED(fi.offset, sectorsize)) btrfs_err(fs_info, @@ -512,43 +503,22 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; struct folio_iter fi; - const u32 sectorsize = fs_info->sectorsize; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_folio_all(fi, &bbio->bio) { bool uptodate = !bio->bi_status; struct folio *folio = fi.folio; struct inode *inode = folio->mapping->host; - u64 start; - u64 end; - u32 len; + u64 start = folio_pos(folio) + fi.offset; btrfs_debug(fs_info, "%s: bi_sector=%llu, err=%d, mirror=%u", __func__, bio->bi_iter.bi_sector, bio->bi_status, bbio->mirror_num); - /* - * We always issue full-sector reads, but if some block in a - * folio fails to read, blk_update_request() will advance - * bv_offset and adjust bv_len to compensate. Print a warning - * for unaligned offsets, and an error if they don't add up to - * a full sector. - */ - if (!IS_ALIGNED(fi.offset, sectorsize)) - btrfs_err(fs_info, - "partial page read in btrfs with offset %zu and length %zu", - fi.offset, fi.length); - else if (!IS_ALIGNED(fi.offset + fi.length, sectorsize)) - btrfs_info(fs_info, - "incomplete page read with offset %zu and length %zu", - fi.offset, fi.length); - - start = folio_pos(folio) + fi.offset; - end = start + fi.length - 1; - len = fi.length; if (likely(uptodate)) { + u64 end = start + fi.length - 1; loff_t i_size = i_size_read(inode); /* @@ -573,7 +543,7 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) } /* Update page status and unlock. */ - end_folio_read(folio, uptodate, start, len); + end_folio_read(folio, uptodate, start, fi.length); } bio_put(bio); } @@ -664,13 +634,10 @@ static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail) } static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, - struct folio *folio, u64 disk_bytenr, - unsigned int pg_offset) + u64 disk_bytenr, loff_t file_offset) { struct bio *bio = &bio_ctrl->bbio->bio; - struct bio_vec *bvec = bio_last_bvec_all(bio); const sector_t sector = disk_bytenr >> SECTOR_SHIFT; - struct folio *bv_folio = page_folio(bvec->bv_page); if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { /* @@ -681,19 +648,11 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, } /* - * The contig check requires the following conditions to be met: - * - * 1) The folios are belonging to the same inode - * This is implied by the call chain. - * - * 2) The range has adjacent logical bytenr - * - * 3) The range has adjacent file offset - * This is required for the usage of btrfs_bio->file_offset. + * To merge into a bio both the disk sector and the logical offset in + * the file need to be contiguous. */ - return bio_end_sector(bio) == sector && - folio_pos(bv_folio) + bvec->bv_offset + bvec->bv_len == - folio_pos(folio) + pg_offset; + return bio_ctrl->next_file_offset == file_offset && + bio_end_sector(bio) == sector; } static void alloc_new_bio(struct btrfs_inode *inode, @@ -711,6 +670,7 @@ static void alloc_new_bio(struct btrfs_inode *inode, bbio->file_offset = file_offset; bio_ctrl->bbio = bbio; bio_ctrl->len_to_oe_boundary = U32_MAX; + bio_ctrl->next_file_offset = file_offset; /* Limit data write bios to the ordered boundary. 
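 *
 * The submit loop further below then caps every chunk it adds, roughly
 * (sketch):
 *
 *	u32 len = min_t(u32, size, bio_ctrl->len_to_oe_boundary);
 *
 * so that a single bio never crosses the current ordered extent, and
 * next_file_offset advances by exactly the number of bytes added.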
*/ if (bio_ctrl->wbc) { @@ -752,22 +712,21 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, size_t size, unsigned long pg_offset) { struct btrfs_inode *inode = folio_to_inode(folio); + loff_t file_offset = folio_pos(folio) + pg_offset; ASSERT(pg_offset + size <= folio_size(folio)); ASSERT(bio_ctrl->end_io_func); if (bio_ctrl->bbio && - !btrfs_bio_is_contig(bio_ctrl, folio, disk_bytenr, pg_offset)) + !btrfs_bio_is_contig(bio_ctrl, disk_bytenr, file_offset)) submit_one_bio(bio_ctrl); do { u32 len = size; /* Allocate new bio if needed */ - if (!bio_ctrl->bbio) { - alloc_new_bio(inode, bio_ctrl, disk_bytenr, - folio_pos(folio) + pg_offset); - } + if (!bio_ctrl->bbio) + alloc_new_bio(inode, bio_ctrl, disk_bytenr, file_offset); /* Cap to the current ordered extent boundary if there is one. */ if (len > bio_ctrl->len_to_oe_boundary) { @@ -781,14 +740,15 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, submit_one_bio(bio_ctrl); continue; } + bio_ctrl->next_file_offset += len; if (bio_ctrl->wbc) - wbc_account_cgroup_owner(bio_ctrl->wbc, folio, - len); + wbc_account_cgroup_owner(bio_ctrl->wbc, folio, len); size -= len; pg_offset += len; disk_bytenr += len; + file_offset += len; /* * len_to_oe_boundary defaults to U32_MAX, which isn't folio or @@ -903,13 +863,13 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode, if (*em_cached) { em = *em_cached; - if (extent_map_in_tree(em) && start >= em->start && - start < extent_map_end(em)) { + if (btrfs_extent_map_in_tree(em) && start >= em->start && + start < btrfs_extent_map_end(em)) { refcount_inc(&em->refs); return em; } - free_extent_map(em); + btrfs_free_extent_map(em); *em_cached = NULL; } @@ -980,20 +940,20 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, return PTR_ERR(em); } extent_offset = cur - em->start; - BUG_ON(extent_map_end(em) <= cur); + BUG_ON(btrfs_extent_map_end(em) <= cur); BUG_ON(end < cur); - compress_type = extent_map_compression(em); + compress_type = btrfs_extent_map_compression(em); if (compress_type != BTRFS_COMPRESS_NONE) disk_bytenr = em->disk_bytenr; else - disk_bytenr = extent_map_block_start(em) + extent_offset; + disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset; if (em->flags & EXTENT_FLAG_PREALLOC) block_start = EXTENT_MAP_HOLE; else - block_start = extent_map_block_start(em); + block_start = btrfs_extent_map_block_start(em); /* * If we have a file range that points to a compressed extent @@ -1037,7 +997,7 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (prev_em_start) *prev_em_start = em->start; - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; /* we've found a hole, just zero and go on */ @@ -1212,7 +1172,7 @@ static void lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end, ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE)); again: - lock_extent(&inode->io_tree, start, end, cached_state); + btrfs_lock_extent(&inode->io_tree, start, end, cached_state); cur_pos = start; while (cur_pos < end) { struct btrfs_ordered_extent *ordered; @@ -1235,7 +1195,7 @@ again: } /* Now wait for the OE to finish. */ - unlock_extent(&inode->io_tree, start, end, cached_state); + btrfs_unlock_extent(&inode->io_tree, start, end, cached_state); btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start); btrfs_put_ordered_extent(ordered); /* We have unlocked the whole range, restart from the beginning. 
*/ @@ -1255,9 +1215,9 @@ int btrfs_read_folio(struct file *file, struct folio *folio) lock_extents_for_read(inode, start, end, &cached_state); ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); - unlock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); - free_extent_map(em_cached); + btrfs_free_extent_map(em_cached); /* * If btrfs_do_readpage() failed we will want to submit the assembled @@ -1443,8 +1403,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, * We've hit an error during previous delalloc range, * have to cleanup the remaining locked ranges. */ - unlock_extent(&inode->io_tree, found_start, - found_start + found_len - 1, NULL); + btrfs_unlock_extent(&inode->io_tree, found_start, + found_start + found_len - 1, NULL); unlock_delalloc_folio(&inode->vfs_inode, folio, found_start, found_start + found_len - 1); @@ -1550,19 +1510,19 @@ static int submit_one_sector(struct btrfs_inode *inode, return PTR_ERR(em); extent_offset = filepos - em->start; - em_end = extent_map_end(em); + em_end = btrfs_extent_map_end(em); ASSERT(filepos <= em_end); ASSERT(IS_ALIGNED(em->start, sectorsize)); ASSERT(IS_ALIGNED(em->len, sectorsize)); - block_start = extent_map_block_start(em); - disk_bytenr = extent_map_block_start(em) + extent_offset; + block_start = btrfs_extent_map_block_start(em); + disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset; - ASSERT(!extent_map_is_compressed(em)); + ASSERT(!btrfs_extent_map_is_compressed(em)); ASSERT(block_start != EXTENT_MAP_HOLE); ASSERT(block_start != EXTENT_MAP_INLINE); - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; /* @@ -1718,7 +1678,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl return 0; } - if (folio->index == end_index) + if (folio_contains(folio, end_index)) folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset); /* @@ -1814,8 +1774,18 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e */ spin_lock(&eb->refs_lock); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits); + unsigned long flags; + set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); spin_unlock(&eb->refs_lock); + + xas_lock_irqsave(&xas, flags); + xas_load(&xas); + xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); + xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); + xas_unlock_irqrestore(&xas, flags); + btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, @@ -1901,24 +1871,151 @@ static void set_btree_ioerr(struct extent_buffer *eb) } } +static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits); + unsigned long flags; + + xas_lock_irqsave(&xas, flags); + xas_load(&xas); + xas_set_mark(&xas, mark); + xas_unlock_irqrestore(&xas, flags); +} + +static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits); + unsigned long flags; + + xas_lock_irqsave(&xas, flags); + xas_load(&xas); + xas_clear_mark(&xas, mark); + xas_unlock_irqrestore(&xas, flags); +} + +static void buffer_tree_tag_for_writeback(struct btrfs_fs_info *fs_info, + unsigned long start, unsigned 
long end) +{ + XA_STATE(xas, &fs_info->buffer_tree, start); + unsigned int tagged = 0; + void *eb; + + xas_lock_irq(&xas); + xas_for_each_marked(&xas, eb, end, PAGECACHE_TAG_DIRTY) { + xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); + if (++tagged % XA_CHECK_SCHED) + continue; + xas_pause(&xas); + xas_unlock_irq(&xas); + cond_resched(); + xas_lock_irq(&xas); + } + xas_unlock_irq(&xas); +} + +struct eb_batch { + unsigned int nr; + unsigned int cur; + struct extent_buffer *ebs[PAGEVEC_SIZE]; +}; + +static inline bool eb_batch_add(struct eb_batch *batch, struct extent_buffer *eb) +{ + batch->ebs[batch->nr++] = eb; + return (batch->nr < PAGEVEC_SIZE); +} + +static inline void eb_batch_init(struct eb_batch *batch) +{ + batch->nr = 0; + batch->cur = 0; +} + +static inline struct extent_buffer *eb_batch_next(struct eb_batch *batch) +{ + if (batch->cur >= batch->nr) + return NULL; + return batch->ebs[batch->cur++]; +} + +static inline void eb_batch_release(struct eb_batch *batch) +{ + for (unsigned int i = 0; i < batch->nr; i++) + free_extent_buffer(batch->ebs[i]); + eb_batch_init(batch); +} + +static inline struct extent_buffer *find_get_eb(struct xa_state *xas, unsigned long max, + xa_mark_t mark) +{ + struct extent_buffer *eb; + +retry: + eb = xas_find_marked(xas, max, mark); + + if (xas_retry(xas, eb)) + goto retry; + + if (!eb) + return NULL; + + if (!atomic_inc_not_zero(&eb->refs)) { + xas_reset(xas); + goto retry; + } + + if (unlikely(eb != xas_reload(xas))) { + free_extent_buffer(eb); + xas_reset(xas); + goto retry; + } + + return eb; +} + +static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info, + unsigned long *start, + unsigned long end, xa_mark_t tag, + struct eb_batch *batch) +{ + XA_STATE(xas, &fs_info->buffer_tree, *start); + struct extent_buffer *eb; + + rcu_read_lock(); + while ((eb = find_get_eb(&xas, end, tag)) != NULL) { + if (!eb_batch_add(batch, eb)) { + *start = ((eb->start + eb->len) >> fs_info->sectorsize_bits); + goto out; + } + } + if (end == ULONG_MAX) + *start = ULONG_MAX; + else + *start = end + 1; +out: + rcu_read_unlock(); + + return batch->nr; +} + /* * The endio specific version which won't touch any unsafe spinlock in endio * context. */ static struct extent_buffer *find_extent_buffer_nolock( - const struct btrfs_fs_info *fs_info, u64 start) + struct btrfs_fs_info *fs_info, u64 start) { struct extent_buffer *eb; + unsigned long index = (start >> fs_info->sectorsize_bits); rcu_read_lock(); - eb = radix_tree_lookup(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits); - if (eb && atomic_inc_not_zero(&eb->refs)) { - rcu_read_unlock(); - return eb; - } + eb = xa_load(&fs_info->buffer_tree, index); + if (eb && !atomic_inc_not_zero(&eb->refs)) + eb = NULL; rcu_read_unlock(); - return NULL; + return eb; } static void end_bbio_meta_write(struct btrfs_bio *bbio) @@ -1933,6 +2030,7 @@ static void end_bbio_meta_write(struct btrfs_bio *bbio) btrfs_meta_folio_clear_writeback(fi.folio, eb); } + buffer_tree_clear_mark(eb, PAGECACHE_TAG_WRITEBACK); clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); smp_mb__after_atomic(); wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); @@ -2004,163 +2102,36 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, } /* - * Submit one subpage btree page. - * - * The main difference to submit_eb_page() is: - * - Page locking - * For subpage, we don't rely on page locking at all. - * - * - Flush write bio - * We only flush bio if we may be unable to fit current extent buffers into - * current bio. 
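 *
 * (The xarray-based batching above replaces this scheme. A consumer of the
 * new eb_batch API follows the usual gather/process/release rhythm; as a
 * sketch, with process_one_eb() a made-up callback:
 *
 *	eb_batch_init(&batch);
 *	while (buffer_tree_get_ebs_tag(fs_info, &index, end,
 *				       PAGECACHE_TAG_DIRTY, &batch)) {
 *		struct extent_buffer *eb;
 *
 *		while ((eb = eb_batch_next(&batch)) != NULL)
 *			process_one_eb(eb);
 *		eb_batch_release(&batch);
 *		cond_resched();
 *	}
 *
 * eb_batch_release() drops the references taken by find_get_eb(), and a
 * batch holds at most PAGEVEC_SIZE buffers per round.)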
+ * Wait for all eb writeback in the given range to finish. * - * Return >=0 for the number of submitted extent buffers. - * Return <0 for fatal error. + * @fs_info: The fs_info for this file system. + * @start: The offset of the range to start waiting on writeback. + * @end: The end of the range, inclusive. This is meant to be used in + * conjunction with wait_marked_extents, so this will usually be + * the_next_eb->start - 1. */ -static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc) +void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, + u64 end) { - struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); - int submitted = 0; - u64 folio_start = folio_pos(folio); - int bit_start = 0; - int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; - const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); + struct eb_batch batch; + unsigned long start_index = (start >> fs_info->sectorsize_bits); + unsigned long end_index = (end >> fs_info->sectorsize_bits); - /* Lock and write each dirty extent buffers in the range */ - while (bit_start < blocks_per_folio) { - struct btrfs_subpage *subpage = folio_get_private(folio); + eb_batch_init(&batch); + while (start_index <= end_index) { struct extent_buffer *eb; - unsigned long flags; - u64 start; + unsigned int nr_ebs; - /* - * Take private lock to ensure the subpage won't be detached - * in the meantime. - */ - spin_lock(&folio->mapping->i_private_lock); - if (!folio_test_private(folio)) { - spin_unlock(&folio->mapping->i_private_lock); + nr_ebs = buffer_tree_get_ebs_tag(fs_info, &start_index, end_index, + PAGECACHE_TAG_WRITEBACK, &batch); + if (!nr_ebs) break; - } - spin_lock_irqsave(&subpage->lock, flags); - if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * blocks_per_folio, - subpage->bitmaps)) { - spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&folio->mapping->i_private_lock); - bit_start++; - continue; - } - - start = folio_start + bit_start * fs_info->sectorsize; - bit_start += sectors_per_node; - - /* - * Here we just want to grab the eb without touching extra - * spin locks, so call find_extent_buffer_nolock(). - */ - eb = find_extent_buffer_nolock(fs_info, start); - spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&folio->mapping->i_private_lock); - - /* - * The eb has already reached 0 refs thus find_extent_buffer() - * doesn't return it. We don't need to write back such eb - * anyway. - */ - if (!eb) - continue; - - if (lock_extent_buffer_for_io(eb, wbc)) { - write_one_eb(eb, wbc); - submitted++; - } - free_extent_buffer(eb); - } - return submitted; -} - -/* - * Submit all page(s) of one extent buffer. - * - * @page: the page of one extent buffer - * @eb_context: to determine if we need to submit this page, if current page - * belongs to this eb, we don't need to submit - * - * The caller should pass each page in their bytenr order, and here we use - * @eb_context to determine if we have submitted pages of one extent buffer. - * - * If we have, we just skip until we hit a new page that doesn't belong to - * current @eb_context. - * - * If not, we submit all the page(s) of the extent buffer. - * - * Return >0 if we have submitted the extent buffer successfully. - * Return 0 if we don't need to submit the page, as it's already submitted by - * previous call. - * Return <0 for fatal error. 
- */ -static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ctx) -{ - struct writeback_control *wbc = ctx->wbc; - struct address_space *mapping = folio->mapping; - struct extent_buffer *eb; - int ret; - - if (!folio_test_private(folio)) - return 0; - - if (btrfs_meta_is_subpage(folio_to_fs_info(folio))) - return submit_eb_subpage(folio, wbc); - - spin_lock(&mapping->i_private_lock); - if (!folio_test_private(folio)) { - spin_unlock(&mapping->i_private_lock); - return 0; - } - - eb = folio_get_private(folio); - - /* - * Shouldn't happen and normally this would be a BUG_ON but no point - * crashing the machine for something we can survive anyway. - */ - if (WARN_ON(!eb)) { - spin_unlock(&mapping->i_private_lock); - return 0; - } - - if (eb == ctx->eb) { - spin_unlock(&mapping->i_private_lock); - return 0; - } - ret = atomic_inc_not_zero(&eb->refs); - spin_unlock(&mapping->i_private_lock); - if (!ret) - return 0; - - ctx->eb = eb; - - ret = btrfs_check_meta_write_pointer(eb->fs_info, ctx); - if (ret) { - if (ret == -EBUSY) - ret = 0; - free_extent_buffer(eb); - return ret; - } - if (!lock_extent_buffer_for_io(eb, wbc)) { - free_extent_buffer(eb); - return 0; - } - /* Implies write in zoned mode. */ - if (ctx->zoned_bg) { - /* Mark the last eb in the block group. */ - btrfs_schedule_zone_finish_bg(ctx->zoned_bg, eb); - ctx->zoned_bg->meta_write_pointer += eb->len; + while ((eb = eb_batch_next(&batch)) != NULL) + wait_on_extent_buffer_writeback(eb); + eb_batch_release(&batch); + cond_resched(); } - write_one_eb(eb, wbc); - free_extent_buffer(eb); - return 1; } int btree_write_cache_pages(struct address_space *mapping, @@ -2171,25 +2142,27 @@ int btree_write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0; int nr_to_write_done = 0; - struct folio_batch fbatch; - unsigned int nr_folios; - pgoff_t index; - pgoff_t end; /* Inclusive */ + struct eb_batch batch; + unsigned int nr_ebs; + unsigned long index; + unsigned long end; int scanned = 0; xa_mark_t tag; - folio_batch_init(&fbatch); + eb_batch_init(&batch); if (wbc->range_cyclic) { - index = mapping->writeback_index; /* Start from prev offset */ + index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->sectorsize_bits); end = -1; + /* * Start from the beginning does not need to cycle over the * range, mark it as scanned. 
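 *
 * The buffer_tree is indexed in sectorsize units rather than pages, hence
 * the shift by PAGE_SHIFT and then by sectorsize_bits above. A worked
 * example with illustrative values: with 64 KiB pages (PAGE_SHIFT == 16)
 * and 4 KiB sectors (sectorsize_bits == 12),
 *
 *	page_index = 1  ->  byte 65536  ->  eb index (65536 >> 12) == 16
 *
 * while on the common 4 KiB page, 4 KiB sector setup the two indices
 * coincide.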
*/ scanned = (index == 0); } else { - index = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; + index = (wbc->range_start >> fs_info->sectorsize_bits); + end = (wbc->range_end >> fs_info->sectorsize_bits); + scanned = 1; } if (wbc->sync_mode == WB_SYNC_ALL) @@ -2199,31 +2172,40 @@ int btree_write_cache_pages(struct address_space *mapping, btrfs_zoned_meta_io_lock(fs_info); retry: if (wbc->sync_mode == WB_SYNC_ALL) - tag_pages_for_writeback(mapping, index, end); + buffer_tree_tag_for_writeback(fs_info, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_folios = filemap_get_folios_tag(mapping, &index, end, - tag, &fbatch))) { - unsigned i; + (nr_ebs = buffer_tree_get_ebs_tag(fs_info, &index, end, tag, &batch))) { + struct extent_buffer *eb; - for (i = 0; i < nr_folios; i++) { - struct folio *folio = fbatch.folios[i]; + while ((eb = eb_batch_next(&batch)) != NULL) { + ctx.eb = eb; + + ret = btrfs_check_meta_write_pointer(eb->fs_info, &ctx); + if (ret) { + if (ret == -EBUSY) + ret = 0; - ret = submit_eb_page(folio, &ctx); - if (ret == 0) + if (ret) { + done = 1; + break; + } + free_extent_buffer(eb); continue; - if (ret < 0) { - done = 1; - break; } - /* - * the filesystem may choose to bump up nr_to_write. - * We have to make sure to honor the new nr_to_write - * at any time - */ - nr_to_write_done = wbc->nr_to_write <= 0; + if (!lock_extent_buffer_for_io(eb, wbc)) + continue; + + /* Implies write in zoned mode. */ + if (ctx.zoned_bg) { + /* Mark the last eb in the block group. */ + btrfs_schedule_zone_finish_bg(ctx.zoned_bg, eb); + ctx.zoned_bg->meta_write_pointer += eb->len; + } + write_one_eb(eb, wbc); } - folio_batch_release(&fbatch); + nr_to_write_done = (wbc->nr_to_write <= 0); + eb_batch_release(&batch); cond_resched(); } if (!scanned && !done) { @@ -2574,10 +2556,10 @@ void btrfs_readahead(struct readahead_control *rac) while ((folio = readahead_folio(rac)) != NULL) btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); - unlock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); if (em_cached) - free_extent_map(em_cached); + btrfs_free_extent_map(em_cached); submit_one_bio(&bio_ctrl); } @@ -2601,7 +2583,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, if (start > end) return 0; - lock_extent(tree, start, end, &cached_state); + btrfs_lock_extent(tree, start, end, &cached_state); folio_wait_writeback(folio); /* @@ -2609,46 +2591,54 @@ int extent_invalidate_folio(struct extent_io_tree *tree, * so here we only need to unlock the extent range to free any * existing extent state. */ - unlock_extent(tree, start, end, &cached_state); + btrfs_unlock_extent(tree, start, end, &cached_state); return 0; } /* - * a helper for release_folio, this tests for areas of the page that - * are locked or under IO and drops the related state bits if it is safe - * to drop the page. + * A helper for struct address_space_operations::release_folio, this tests for + * areas of the folio that are locked or under IO and drops the related state + * bits if it is safe to drop the folio. 
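 *
 * In predicate form, the new rule implemented below amounts to this sketch
 * (can_release_range() is a hypothetical name):
 *
 *	static bool can_release_range(u32 range_bits)
 *	{
 *		return !(range_bits & EXTENT_LOCKED) ||
 *		       (range_bits & EXTENT_FINISHING_ORDERED);
 *	}
 *
 * A lock held only to finish an ordered extent does not touch the folio
 * itself, so it no longer blocks releasing it.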
*/ static bool try_release_extent_state(struct extent_io_tree *tree, struct folio *folio) { + struct extent_state *cached_state = NULL; u64 start = folio_pos(folio); u64 end = start + folio_size(folio) - 1; - bool ret; + u32 range_bits; + u32 clear_bits; + bool ret = false; + int ret2; - if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) { - ret = false; - } else { - u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | - EXTENT_DELALLOC_NEW | EXTENT_CTLBITS | - EXTENT_QGROUP_RESERVED); - int ret2; + btrfs_get_range_bits(tree, start, end, &range_bits, &cached_state); - /* - * At this point we can safely clear everything except the - * locked bit, the nodatasum bit and the delalloc new bit. - * The delalloc new bit will be cleared by ordered extent - * completion. - */ - ret2 = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL); + /* + * We can release the folio if it's locked only for ordered extent + * completion, since that doesn't require using the folio. + */ + if ((range_bits & EXTENT_LOCKED) && + !(range_bits & EXTENT_FINISHING_ORDERED)) + goto out; + + clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW | + EXTENT_CTLBITS | EXTENT_QGROUP_RESERVED | + EXTENT_FINISHING_ORDERED); + /* + * At this point we can safely clear everything except the locked, + * nodatasum, delalloc new and finishing ordered bits. The delalloc new + * bit will be cleared by ordered extent completion. + */ + ret2 = btrfs_clear_extent_bit(tree, start, end, clear_bits, &cached_state); + /* + * If clear_extent_bit failed for enomem reasons, we can't allow the + * release to continue. + */ + if (ret2 == 0) + ret = true; +out: + btrfs_free_extent_state(cached_state); - /* if clear_extent_bit failed for enomem reasons, - * we can't allow the release to continue. - */ - if (ret2 < 0) - ret = false; - else - ret = true; - } return ret; } @@ -2671,18 +2661,19 @@ bool try_release_extent_mapping(struct folio *folio, gfp_t mask) struct extent_map *em; write_lock(&extent_tree->lock); - em = lookup_extent_mapping(extent_tree, start, len); + em = btrfs_lookup_extent_mapping(extent_tree, start, len); if (!em) { write_unlock(&extent_tree->lock); break; } if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) { write_unlock(&extent_tree->lock); - free_extent_map(em); + btrfs_free_extent_map(em); break; } - if (test_range_bit_exists(io_tree, em->start, - extent_map_end(em) - 1, EXTENT_LOCKED)) + if (btrfs_test_range_bit_exists(io_tree, em->start, + btrfs_extent_map_end(em) - 1, + EXTENT_LOCKED)) goto next; /* * If it's not in the list of modified extents, used by a fast @@ -2709,15 +2700,15 @@ remove_em: * fsync performance for workloads with a data size that exceeds * or is close to the system's memory). */ - remove_extent_mapping(inode, em); + btrfs_remove_extent_mapping(inode, em); /* Once for the inode's extent map tree. */ - free_extent_map(em); + btrfs_free_extent_map(em); next: - start = extent_map_end(em); + start = btrfs_extent_map_end(em); write_unlock(&extent_tree->lock); /* Once for us, for the lookup_extent_mapping() reference. 
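 *
 * The reference pairing across this loop, spelled out as a sketch (locking
 * elided for brevity):
 *
 *	em = btrfs_lookup_extent_mapping(tree, start, len);   <- takes a ref
 *	btrfs_remove_extent_mapping(inode, em);
 *	btrfs_free_extent_map(em);   <- puts the tree's own reference
 *	btrfs_free_extent_map(em);   <- puts the lookup reference
 *
 * i.e. two puts are required once the map has been removed from the tree.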
*/ - free_extent_map(em); + btrfs_free_extent_map(em); if (need_resched()) { /* @@ -2756,6 +2747,7 @@ static bool folio_range_has_eb(struct folio *folio) static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio) { struct btrfs_fs_info *fs_info = eb->fs_info; + struct address_space *mapping = folio->mapping; const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); /* @@ -2763,21 +2755,20 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * be done under the i_private_lock. */ if (mapped) - spin_lock(&folio->mapping->i_private_lock); + spin_lock(&mapping->i_private_lock); if (!folio_test_private(folio)) { if (mapped) - spin_unlock(&folio->mapping->i_private_lock); + spin_unlock(&mapping->i_private_lock); return; } if (!btrfs_meta_is_subpage(fs_info)) { /* - * We do this since we'll remove the pages after we've - * removed the eb from the radix tree, so we could race - * and have this page now attached to the new eb. So - * only clear folio if it's still connected to - * this eb. + * We do this since we'll remove the pages after we've removed + * the eb from the xarray, so we could race and have this page + * now attached to the new eb. So only clear folio if it's + * still connected to this eb. */ if (folio_test_private(folio) && folio_get_private(folio) == eb) { BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); @@ -2787,7 +2778,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo folio_detach_private(folio); } if (mapped) - spin_unlock(&folio->mapping->i_private_lock); + spin_unlock(&mapping->i_private_lock); return; } @@ -2810,7 +2801,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo if (!folio_range_has_eb(folio)) btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); - spin_unlock(&folio->mapping->i_private_lock); + spin_unlock(&mapping->i_private_lock); } /* Release all folios attached to the extent buffer */ @@ -2825,9 +2816,6 @@ static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb) continue; detach_extent_buffer_folio(eb, folio); - - /* One for when we allocated the folio. */ - folio_put(folio); } } @@ -2862,9 +2850,28 @@ static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info return eb; } +/* + * For use in eb allocation error cleanup paths, as btrfs_release_extent_buffer() + * does not call folio_put(), and we need to set the folios to NULL so that + * btrfs_release_extent_buffer() will not detach them a second time. + */ +static void cleanup_extent_buffer_folios(struct extent_buffer *eb) +{ + const int num_folios = num_extent_folios(eb); + + /* We cannot use num_extent_folios() as loop bound as eb->folios changes. 
*/ + for (int i = 0; i < num_folios; i++) { + ASSERT(eb->folios[i]); + detach_extent_buffer_folio(eb, eb->folios[i]); + folio_put(eb->folios[i]); + eb->folios[i] = NULL; + } +} + struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) { struct extent_buffer *new; + int num_folios; int ret; new = __alloc_extent_buffer(src->fs_info, src->start); @@ -2879,25 +2886,34 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); ret = alloc_eb_folio_array(new, false); - if (ret) { - btrfs_release_extent_buffer(new); - return NULL; - } + if (ret) + goto release_eb; - for (int i = 0; i < num_extent_folios(src); i++) { + ASSERT(num_extent_folios(src) == num_extent_folios(new), + "%d != %d", num_extent_folios(src), num_extent_folios(new)); + /* Explicitly use the cached num_folios value from now on. */ + num_folios = num_extent_folios(src); + for (int i = 0; i < num_folios; i++) { struct folio *folio = new->folios[i]; ret = attach_extent_buffer_folio(new, folio, NULL); - if (ret < 0) { - btrfs_release_extent_buffer(new); - return NULL; - } + if (ret < 0) + goto cleanup_folios; WARN_ON(folio_test_dirty(folio)); } + for (int i = 0; i < num_folios; i++) + folio_put(new->folios[i]); + copy_extent_buffer_full(new, src); set_extent_buffer_uptodate(new); return new; + +cleanup_folios: + cleanup_extent_buffer_folios(new); +release_eb: + btrfs_release_extent_buffer(new); + return NULL; } struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, @@ -2912,13 +2928,15 @@ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, ret = alloc_eb_folio_array(eb, false); if (ret) - goto out; + goto release_eb; for (int i = 0; i < num_extent_folios(eb); i++) { ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL); if (ret < 0) - goto out_detach; + goto cleanup_folios; } + for (int i = 0; i < num_extent_folios(eb); i++) + folio_put(eb->folios[i]); set_extent_buffer_uptodate(eb); btrfs_set_header_nritems(eb, 0); @@ -2926,15 +2944,10 @@ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, return eb; -out_detach: - for (int i = 0; i < num_extent_folios(eb); i++) { - if (eb->folios[i]) { - detach_extent_buffer_folio(eb, eb->folios[i]); - folio_put(eb->folios[i]); - } - } -out: - kmem_cache_free(extent_buffer_cache, eb); +cleanup_folios: + cleanup_extent_buffer_folios(eb); +release_eb: + btrfs_release_extent_buffer(eb); return NULL; } @@ -2942,9 +2955,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) { int refs; /* - * The TREE_REF bit is first set when the extent_buffer is added - * to the radix tree. It is also reset, if unset, when a new reference - * is created by find_extent_buffer. + * The TREE_REF bit is first set when the extent_buffer is added to the + * xarray. It is also reset, if unset, when a new reference is created + * by find_extent_buffer. * * It is only cleared in two cases: freeing the last non-tree * reference to the extent_buffer when its STALE bit is set or @@ -2956,13 +2969,12 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) * conditions between the calls to check_buffer_tree_ref in those * codepaths and clearing TREE_REF in try_release_extent_buffer. * - * The actual lifetime of the extent_buffer in the radix tree is - * adequately protected by the refcount, but the TREE_REF bit and - * its corresponding reference are not.
To protect against this - * class of races, we call check_buffer_tree_ref from the codepaths - * which trigger io. Note that once io is initiated, TREE_REF can no - * longer be cleared, so that is the moment at which any such race is - * best fixed. + * The actual lifetime of the extent_buffer in the xarray is adequately + * protected by the refcount, but the TREE_REF bit and its corresponding + * reference are not. To protect against this class of races, we call + * check_buffer_tree_ref() from the code paths which trigger io. Note that + * once io is initiated, TREE_REF can no longer be cleared, so that is + * the moment at which any such race is best fixed. */ refs = atomic_read(&eb->refs); if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) @@ -3026,30 +3038,29 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, return ERR_PTR(-ENOMEM); eb->fs_info = fs_info; again: - ret = radix_tree_preload(GFP_NOFS); - if (ret) { - exists = ERR_PTR(ret); - goto free_eb; + xa_lock_irq(&fs_info->buffer_tree); + exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits, + NULL, eb, GFP_NOFS); + if (xa_is_err(exists)) { + ret = xa_err(exists); + xa_unlock_irq(&fs_info->buffer_tree); + btrfs_release_extent_buffer(eb); + return ERR_PTR(ret); } - spin_lock(&fs_info->buffer_lock); - ret = radix_tree_insert(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits, eb); - spin_unlock(&fs_info->buffer_lock); - radix_tree_preload_end(); - if (ret == -EEXIST) { - exists = find_extent_buffer(fs_info, start); - if (exists) - goto free_eb; - else + if (exists) { + if (!atomic_inc_not_zero(&exists->refs)) { + /* The extent buffer is being freed, retry. */ + xa_unlock_irq(&fs_info->buffer_tree); goto again; + } + xa_unlock_irq(&fs_info->buffer_tree); + btrfs_release_extent_buffer(eb); + return exists; } + xa_unlock_irq(&fs_info->buffer_tree); check_buffer_tree_ref(eb); - set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); return eb; -free_eb: - btrfs_release_extent_buffer(eb); - return exists; #else /* Stub to avoid linker error when compiled with optimizations turned off. */ return NULL; @@ -3064,9 +3075,9 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, lockdep_assert_held(&folio->mapping->i_private_lock); /* - * For subpage case, we completely rely on radix tree to ensure we - * don't try to insert two ebs for the same bytenr. So here we always - * return NULL and just continue. + * For subpage case, we completely rely on xarray to ensure we don't try + * to insert two ebs for the same bytenr. So here we always return NULL + * and just continue. 
*/ if (btrfs_meta_is_subpage(fs_info)) return NULL; @@ -3100,10 +3111,9 @@ static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) return true; } - if (fs_info->nodesize < PAGE_SIZE && - offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) { + if (fs_info->nodesize < PAGE_SIZE && !IS_ALIGNED(start, fs_info->nodesize)) { btrfs_err(fs_info, - "tree block crosses page boundary, start %llu nodesize %u", + "tree block is not nodesize aligned, start %llu nodesize %u", start, fs_info->nodesize); return true; } @@ -3139,7 +3149,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, struct btrfs_fs_info *fs_info = eb->fs_info; struct address_space *mapping = fs_info->btree_inode->i_mapping; const unsigned long index = eb->start >> PAGE_SHIFT; - struct folio *existing_folio = NULL; + struct folio *existing_folio; int ret; ASSERT(found_eb_ret); @@ -3148,6 +3158,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, ASSERT(eb->folios[i]); retry: + existing_folio = NULL; ret = filemap_add_folio(mapping, eb->folios[i], index + i, GFP_NOFS | __GFP_NOFAIL); if (!ret) @@ -3155,10 +3166,8 @@ retry: existing_folio = filemap_lock_folio(mapping, index + i); /* The page cache only exists for a very short time, just retry. */ - if (IS_ERR(existing_folio)) { - existing_folio = NULL; + if (IS_ERR(existing_folio)) goto retry; - } /* For now, we should only have single-page folios for btree inode. */ ASSERT(folio_nr_pages(existing_folio) == 1); @@ -3199,7 +3208,7 @@ finish: /* * To inform we have an extra eb under allocation, so that * detach_extent_buffer_page() won't release the folio private when the - * eb hasn't been inserted into radix tree yet. + * eb hasn't been inserted into the xarray yet. * * The ref will be decreased when the eb releases the page, in * detach_extent_buffer_page(). Thus needs no special handling in the @@ -3306,7 +3315,7 @@ reallocate: * using 0-order folios. */ if (unlikely(ret == -EAGAIN)) { - ASSERT(0); + DEBUG_WARN("folio order mismatch between new eb and filemap"); goto reallocate; } attached++; @@ -3333,10 +3342,9 @@ reallocate: /* * We can't unlock the pages just yet since the extent buffer - * hasn't been properly inserted in the radix tree, this - * opens a race with btree_release_folio which can free a page - * while we are still filling in all pages for the buffer and - * we could crash. + * hasn't been properly inserted into the xarray, this opens a + * race with btree_release_folio() which can free a page while we + * are still filling in all pages for the buffer and we could crash. 
*/ } if (uptodate) @@ -3345,34 +3353,42 @@ reallocate: if (page_contig) eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start); again: - ret = radix_tree_preload(GFP_NOFS); - if (ret) + xa_lock_irq(&fs_info->buffer_tree); + existing_eb = __xa_cmpxchg(&fs_info->buffer_tree, + start >> fs_info->sectorsize_bits, NULL, eb, + GFP_NOFS); + if (xa_is_err(existing_eb)) { + ret = xa_err(existing_eb); + xa_unlock_irq(&fs_info->buffer_tree); goto out; - - spin_lock(&fs_info->buffer_lock); - ret = radix_tree_insert(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits, eb); - spin_unlock(&fs_info->buffer_lock); - radix_tree_preload_end(); - if (ret == -EEXIST) { - ret = 0; - existing_eb = find_extent_buffer(fs_info, start); - if (existing_eb) - goto out; - else + } + if (existing_eb) { + if (!atomic_inc_not_zero(&existing_eb->refs)) { + xa_unlock_irq(&fs_info->buffer_tree); goto again; + } + xa_unlock_irq(&fs_info->buffer_tree); + goto out; } + xa_unlock_irq(&fs_info->buffer_tree); + /* add one reference for the tree */ check_buffer_tree_ref(eb); - set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); /* * Now it's safe to unlock the pages because any calls to * btree_release_folio will correctly detect that a page belongs to a * live buffer and won't free them prematurely. */ - for (int i = 0; i < num_extent_folios(eb); i++) + for (int i = 0; i < num_extent_folios(eb); i++) { folio_unlock(eb->folios[i]); + /* + * A folio that has been added to an address_space mapping + * should not continue holding the refcount from its original + * allocation indefinitely. + */ + folio_put(eb->folios[i]); + } return eb; out: @@ -3386,26 +3402,22 @@ out: * want that to grab this eb, as we're getting ready to free it. So we * have to detach it first and then unlock it. * - * We have to drop our reference and NULL it out here because in the - * subpage case detaching does a btrfs_folio_dec_eb_refs() for our eb. - * Below when we call btrfs_release_extent_buffer() we will call - * detach_extent_buffer_folio() on our remaining pages in the !subpage - * case. If we left eb->folios[i] populated in the subpage case we'd - * double put our reference and be super sad. + * Note: the bound is num_extent_pages() as we need to go through all slots. */ - for (int i = 0; i < attached; i++) { - ASSERT(eb->folios[i]); - detach_extent_buffer_folio(eb, eb->folios[i]); - folio_unlock(eb->folios[i]); - folio_put(eb->folios[i]); + for (int i = 0; i < num_extent_pages(eb); i++) { + struct folio *folio = eb->folios[i]; + + if (i < attached) { + ASSERT(folio); + detach_extent_buffer_folio(eb, folio); + folio_unlock(folio); + } else if (!folio) { + continue; + } + + folio_put(folio); eb->folios[i] = NULL; } - /* - * Now all pages of that extent buffer is unmapped, set UNMAPPED flag, - * so it can be cleaned up without utilizing folio->mapping.
- */ - set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); - btrfs_release_extent_buffer(eb); if (ret < 0) return ERR_PTR(ret); @@ -3428,18 +3440,27 @@ static int release_extent_buffer(struct extent_buffer *eb) WARN_ON(atomic_read(&eb->refs) == 0); if (atomic_dec_and_test(&eb->refs)) { - if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { - struct btrfs_fs_info *fs_info = eb->fs_info; + struct btrfs_fs_info *fs_info = eb->fs_info; - spin_unlock(&eb->refs_lock); + spin_unlock(&eb->refs_lock); - spin_lock(&fs_info->buffer_lock); - radix_tree_delete(&fs_info->buffer_radix, - eb->start >> fs_info->sectorsize_bits); - spin_unlock(&fs_info->buffer_lock); - } else { - spin_unlock(&eb->refs_lock); - } + /* + * We're erasing, theoretically there will be no allocations, so + * just use GFP_ATOMIC. + * + * We use cmpxchg instead of erase because we do not know if + * this eb is actually in the tree or not, we could be cleaning + * up an eb that we allocated but never inserted into the tree. + * Thus use cmpxchg to remove it from the tree if it is there, + * or leave the other entry if this isn't in the tree. + * + * The documentation says that putting a NULL value is the same + * as erase as long as XA_FLAGS_ALLOC is not set, which it isn't + * in this case. + */ + xa_cmpxchg_irq(&fs_info->buffer_tree, + eb->start >> fs_info->sectorsize_bits, eb, NULL, + GFP_ATOMIC); btrfs_leak_debug_del_eb(eb); /* Should be safe to release folios at this point. */ @@ -3508,8 +3529,8 @@ static void btree_clear_folio_dirty_tag(struct folio *folio) ASSERT(folio_test_locked(folio)); xa_lock_irq(&folio->mapping->i_pages); if (!folio_test_dirty(folio)) - __xa_clear_mark(&folio->mapping->i_pages, - folio_index(folio), PAGECACHE_TAG_DIRTY); + __xa_clear_mark(&folio->mapping->i_pages, folio->index, + PAGECACHE_TAG_DIRTY); xa_unlock_irq(&folio->mapping->i_pages); } @@ -3540,6 +3561,7 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) return; + buffer_tree_clear_mark(eb, PAGECACHE_TAG_DIRTY); percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, fs_info->dirty_metadata_batch); @@ -3588,6 +3610,7 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) folio_lock(eb->folios[0]); for (int i = 0; i < num_extent_folios(eb); i++) btrfs_meta_folio_set_dirty(eb->folios[i], eb); + buffer_tree_set_mark(eb, PAGECACHE_TAG_DIRTY); if (subpage) folio_unlock(eb->folios[0]); percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, @@ -3647,12 +3670,10 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio) btrfs_validate_extent_buffer(eb, &bbio->parent_check) < 0) uptodate = false; - if (uptodate) { + if (uptodate) set_extent_buffer_uptodate(eb); - } else { + else clear_extent_buffer_uptodate(eb); - set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); - } clear_extent_buffer_reading(eb); free_extent_buffer(eb); @@ -3691,7 +3712,6 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, return 0; } - clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = 0; check_buffer_tree_ref(eb); atomic_inc(&eb->refs); @@ -3737,7 +3757,7 @@ static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, btrfs_warn(eb->fs_info, "access to eb bytenr %llu len %u out of range start %lu len %lu", eb->start, eb->len, start, len); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); return true; } @@ -4273,71 +4293,17 @@ void memmove_extent_buffer(const struct extent_buffer *dst, } } -#define GANG_LOOKUP_SIZE 16 
-static struct extent_buffer *get_next_extent_buffer( - const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr) -{ - struct extent_buffer *gang[GANG_LOOKUP_SIZE]; - struct extent_buffer *found = NULL; - u64 folio_start = folio_pos(folio); - u64 cur = folio_start; - - ASSERT(in_range(bytenr, folio_start, PAGE_SIZE)); - lockdep_assert_held(&fs_info->buffer_lock); - - while (cur < folio_start + PAGE_SIZE) { - int ret; - int i; - - ret = radix_tree_gang_lookup(&fs_info->buffer_radix, - (void **)gang, cur >> fs_info->sectorsize_bits, - min_t(unsigned int, GANG_LOOKUP_SIZE, - PAGE_SIZE / fs_info->nodesize)); - if (ret == 0) - goto out; - for (i = 0; i < ret; i++) { - /* Already beyond page end */ - if (gang[i]->start >= folio_start + PAGE_SIZE) - goto out; - /* Found one */ - if (gang[i]->start >= bytenr) { - found = gang[i]; - goto out; - } - } - cur = gang[ret - 1]->start + gang[ret - 1]->len; - } -out: - return found; -} - static int try_release_subpage_extent_buffer(struct folio *folio) { struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); - u64 cur = folio_pos(folio); - const u64 end = cur + PAGE_SIZE; + struct extent_buffer *eb; + unsigned long start = (folio_pos(folio) >> fs_info->sectorsize_bits); + unsigned long index = start; + unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1; int ret; - while (cur < end) { - struct extent_buffer *eb = NULL; - - /* - * Unlike try_release_extent_buffer() which uses folio private - * to grab buffer, for subpage case we rely on radix tree, thus - * we need to ensure radix tree consistency. - * - * We also want an atomic snapshot of the radix tree, thus go - * with spinlock rather than RCU. - */ - spin_lock(&fs_info->buffer_lock); - eb = get_next_extent_buffer(fs_info, folio, cur); - if (!eb) { - /* No more eb in the page range after or at cur */ - spin_unlock(&fs_info->buffer_lock); - break; - } - cur = eb->start + eb->len; - + xa_lock_irq(&fs_info->buffer_tree); + xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) { /* * The same as try_release_extent_buffer(), to ensure the eb * won't disappear out from under us. @@ -4345,10 +4311,9 @@ static int try_release_subpage_extent_buffer(struct folio *folio) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); - spin_unlock(&fs_info->buffer_lock); - break; + continue; } - spin_unlock(&fs_info->buffer_lock); + xa_unlock_irq(&fs_info->buffer_tree); /* * If tree ref isn't set then we know the ref on this eb is a @@ -4366,7 +4331,10 @@ static int try_release_subpage_extent_buffer(struct folio *folio) * release_extent_buffer() will release the refs_lock. */ release_extent_buffer(eb); + xa_lock_irq(&fs_info->buffer_tree); } + xa_unlock_irq(&fs_info->buffer_tree); + /* * Finally to check if we have cleared folio private, as if we have * released all ebs in the page, the folio private should be cleared now. 
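The extent_io.c hunks above all repeat one idiom: where the old code needed radix_tree_preload(), a spinlock-protected radix_tree_insert() and a separate find_extent_buffer() fallback on -EEXIST, the xarray version folds the insert and the lookup into a single __xa_cmpxchg() under the xarray lock, retrying when it races with an entry whose refcount has already dropped to zero (the two `again:` retry blocks above). Below is a minimal, self-contained sketch of that insert-or-take-a-reference pattern in kernel C; `struct node` and `insert_or_get()` are illustrative placeholders rather than the btrfs types, and the sketch assumes entries are always erased from the xarray before they are freed.

#include <linux/xarray.h>
#include <linux/atomic.h>
#include <linux/err.h>
#include <linux/gfp.h>

struct node {
	atomic_t refs;		/* assumed: the last reference erases the entry, then frees it */
};

static struct node *insert_or_get(struct xarray *xa, unsigned long index,
				  struct node *new)
{
	struct node *cur;

again:
	xa_lock_irq(xa);
	/* Publish @new only if the slot is empty, otherwise get the occupant. */
	cur = __xa_cmpxchg(xa, index, NULL, new, GFP_NOFS);
	if (xa_is_err(cur)) {
		xa_unlock_irq(xa);
		return ERR_PTR(xa_err(cur));
	}
	if (cur) {
		if (!atomic_inc_not_zero(&cur->refs)) {
			/* The occupant is being freed, wait for it to go away. */
			xa_unlock_irq(xa);
			goto again;
		}
		xa_unlock_irq(xa);
		return cur;	/* caller must drop its now-unused @new */
	}
	xa_unlock_irq(xa);
	return new;		/* @new is now visible to other lookups */
}

Note there is no preload step: the GFP flags are passed straight to __xa_cmpxchg(), and the racing-insert case needs no second lookup because the cmpxchg returns the existing entry.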
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2e261892c7bc..e36e8d6a00bc 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -38,16 +38,10 @@ struct btrfs_tree_parent_check; enum { EXTENT_BUFFER_UPTODATE, EXTENT_BUFFER_DIRTY, - EXTENT_BUFFER_CORRUPT, - /* this got triggered by readahead */ - EXTENT_BUFFER_READAHEAD, EXTENT_BUFFER_TREE_REF, EXTENT_BUFFER_STALE, EXTENT_BUFFER_WRITEBACK, - /* read IO error */ - EXTENT_BUFFER_READ_ERR, EXTENT_BUFFER_UNMAPPED, - EXTENT_BUFFER_IN_TREE, /* write IO error */ EXTENT_BUFFER_WRITE_ERR, /* Indicate the extent buffer is written zeroed out (for zoned) */ @@ -79,7 +73,7 @@ enum { * single word in a bitmap may straddle two pages in the extent buffer. */ #define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE) -#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1) +#define BYTE_MASK ((1U << BITS_PER_BYTE) - 1) #define BITMAP_FIRST_BYTE_MASK(start) \ ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK) #define BITMAP_LAST_BYTE_MASK(nbits) \ @@ -246,6 +240,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); +void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); void btrfs_readahead(struct readahead_control *rac); int set_folio_extent_mapped(struct folio *folio); void clear_folio_extent_mapped(struct folio *folio); @@ -298,6 +293,8 @@ static inline int __pure num_extent_pages(const struct extent_buffer *eb) */ static inline int __pure num_extent_folios(const struct extent_buffer *eb) { + if (!eb->folios[0]) + return 0; if (folio_order(eb->folios[0])) return 1; return num_extent_pages(eb); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 7f46abbd6311..02bfdb976e40 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -13,7 +13,7 @@ static struct kmem_cache *extent_map_cache; -int __init extent_map_init(void) +int __init btrfs_extent_map_init(void) { extent_map_cache = kmem_cache_create("btrfs_extent_map", sizeof(struct extent_map), 0, 0, NULL); @@ -22,7 +22,7 @@ int __init extent_map_init(void) return 0; } -void __cold extent_map_exit(void) +void __cold btrfs_extent_map_exit(void) { kmem_cache_destroy(extent_map_cache); } @@ -31,7 +31,7 @@ void __cold extent_map_exit(void) * Initialize the extent tree @tree. Should be called for each new inode or * other user of the extent_map interface. */ -void extent_map_tree_init(struct extent_map_tree *tree) +void btrfs_extent_map_tree_init(struct extent_map_tree *tree) { tree->root = RB_ROOT; INIT_LIST_HEAD(&tree->modified_extents); @@ -42,7 +42,7 @@ void extent_map_tree_init(struct extent_map_tree *tree) * Allocate a new extent_map structure. The new structure is returned with a * reference count of one and needs to be freed using free_extent_map() */ -struct extent_map *alloc_extent_map(void) +struct extent_map *btrfs_alloc_extent_map(void) { struct extent_map *em; em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); @@ -58,12 +58,12 @@ struct extent_map *alloc_extent_map(void) * Drop the reference out on @em by one and free the structure if the reference * count hits zero. 
*/ -void free_extent_map(struct extent_map *em) +void btrfs_free_extent_map(struct extent_map *em) { if (!em) return; if (refcount_dec_and_test(&em->refs)) { - WARN_ON(extent_map_in_tree(em)); + WARN_ON(btrfs_extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); kmem_cache_free(extent_map_cache, em); } @@ -102,19 +102,19 @@ static int tree_insert(struct rb_root *root, struct extent_map *em) if (em->start < entry->start) p = &(*p)->rb_left; - else if (em->start >= extent_map_end(entry)) + else if (em->start >= btrfs_extent_map_end(entry)) p = &(*p)->rb_right; else return -EEXIST; } orig_parent = parent; - while (parent && em->start >= extent_map_end(entry)) { + while (parent && em->start >= btrfs_extent_map_end(entry)) { parent = rb_next(parent); entry = rb_entry(parent, struct extent_map, rb_node); } if (parent) - if (end > entry->start && em->start < extent_map_end(entry)) + if (end > entry->start && em->start < btrfs_extent_map_end(entry)) return -EEXIST; parent = orig_parent; @@ -124,7 +124,7 @@ static int tree_insert(struct rb_root *root, struct extent_map *em) entry = rb_entry(parent, struct extent_map, rb_node); } if (parent) - if (end > entry->start && em->start < extent_map_end(entry)) + if (end > entry->start && em->start < btrfs_extent_map_end(entry)) return -EEXIST; rb_link_node(&em->rb_node, orig_parent, p); @@ -136,8 +136,8 @@ static int tree_insert(struct rb_root *root, struct extent_map *em) * Search through the tree for an extent_map with a given offset. If it can't * be found, try to find some neighboring extents */ -static struct rb_node *__tree_search(struct rb_root *root, u64 offset, - struct rb_node **prev_or_next_ret) +static struct rb_node *tree_search(struct rb_root *root, u64 offset, + struct rb_node **prev_or_next_ret) { struct rb_node *n = root->rb_node; struct rb_node *prev = NULL; @@ -154,14 +154,14 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, if (offset < entry->start) n = n->rb_left; - else if (offset >= extent_map_end(entry)) + else if (offset >= btrfs_extent_map_end(entry)) n = n->rb_right; else return n; } orig_prev = prev; - while (prev && offset >= extent_map_end(prev_entry)) { + while (prev && offset >= btrfs_extent_map_end(prev_entry)) { prev = rb_next(prev); prev_entry = rb_entry(prev, struct extent_map, rb_node); } @@ -188,14 +188,14 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, static inline u64 extent_map_block_len(const struct extent_map *em) { - if (extent_map_is_compressed(em)) + if (btrfs_extent_map_is_compressed(em)) return em->disk_num_bytes; return em->len; } static inline u64 extent_map_block_end(const struct extent_map *em) { - const u64 block_start = extent_map_block_start(em); + const u64 block_start = btrfs_extent_map_block_start(em); const u64 block_end = block_start + extent_map_block_len(em); if (block_end < block_start) @@ -210,7 +210,7 @@ static bool can_merge_extent_map(const struct extent_map *em) return false; /* Don't merge compressed extents, we need to know their actual size. */ - if (extent_map_is_compressed(em)) + if (btrfs_extent_map_is_compressed(em)) return false; if (em->flags & EXTENT_FLAG_LOGGING) @@ -230,7 +230,7 @@ static bool can_merge_extent_map(const struct extent_map *em) /* Check to see if two extent_map structs are adjacent and safe to merge. 
*/ static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next) { - if (extent_map_end(prev) != next->start) + if (btrfs_extent_map_end(prev) != next->start) return false; /* @@ -242,7 +242,7 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma return false; if (next->disk_bytenr < EXTENT_MAP_LAST_BYTE - 1) - return extent_map_block_start(next) == extent_map_block_end(prev); + return btrfs_extent_map_block_start(next) == extent_map_block_end(prev); /* HOLES and INLINE extents. */ return next->disk_bytenr == prev->disk_bytenr; @@ -270,8 +270,8 @@ static void merge_ondisk_extents(const struct extent_map *prev, const struct ext u64 new_offset; /* @prev and @next should not be compressed. */ - ASSERT(!extent_map_is_compressed(prev)); - ASSERT(!extent_map_is_compressed(next)); + ASSERT(!btrfs_extent_map_is_compressed(prev)); + ASSERT(!btrfs_extent_map_is_compressed(next)); /* * There are two different cases where @prev and @next can be merged. @@ -327,9 +327,9 @@ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map if (em->offset + em->len > em->ram_bytes) dump_extent_map(fs_info, "ram_bytes too small", em); if (em->offset + em->len > em->disk_num_bytes && - !extent_map_is_compressed(em)) + !btrfs_extent_map_is_compressed(em)) dump_extent_map(fs_info, "disk_num_bytes too small", em); - if (!extent_map_is_compressed(em) && + if (!btrfs_extent_map_is_compressed(em) && em->ram_bytes != em->disk_num_bytes) dump_extent_map(fs_info, "ram_bytes mismatch with disk_num_bytes for non-compressed em", @@ -361,8 +361,8 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) if (em->start != 0) { rb = rb_prev(&em->rb_node); - if (rb) - merge = rb_entry(rb, struct extent_map, rb_node); + merge = rb_entry_safe(rb, struct extent_map, rb_node); + if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) { em->start = merge->start; em->len += merge->len; @@ -374,13 +374,13 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) validate_extent_map(fs_info, em); remove_em(inode, merge); - free_extent_map(merge); + btrfs_free_extent_map(merge); } } rb = rb_next(&em->rb_node); - if (rb) - merge = rb_entry(rb, struct extent_map, rb_node); + merge = rb_entry_safe(rb, struct extent_map, rb_node); + if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) { em->len += merge->len; if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) @@ -389,7 +389,7 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) em->generation = max(em->generation, merge->generation); em->flags |= EXTENT_FLAG_MERGED; remove_em(inode, merge); - free_extent_map(merge); + btrfs_free_extent_map(merge); } } @@ -409,7 +409,7 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) * -ENOENT when the extent is not found in the tree * -EUCLEAN if the found extent does not match the expected start */ -int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) +int btrfs_unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map_tree *tree = &inode->extent_tree; @@ -417,7 +417,7 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) struct extent_map *em; write_lock(&tree->lock); - em = lookup_extent_mapping(tree, start, len); + em = btrfs_lookup_extent_mapping(tree, start, len); if (WARN_ON(!em)) { btrfs_warn(fs_info, @@ -444,17 
+444,17 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) out: write_unlock(&tree->lock); - free_extent_map(em); + btrfs_free_extent_map(em); return ret; } -void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em) +void btrfs_clear_em_logging(struct btrfs_inode *inode, struct extent_map *em) { lockdep_assert_held_write(&inode->extent_tree.lock); em->flags &= ~EXTENT_FLAG_LOGGING; - if (extent_map_in_tree(em)) + if (btrfs_extent_map_in_tree(em)) try_merge_map(inode, em); } @@ -508,16 +508,15 @@ static int add_extent_mapping(struct btrfs_inode *inode, return 0; } -static struct extent_map * -__lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len, int strict) +static struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len, int strict) { struct extent_map *em; struct rb_node *rb_node; struct rb_node *prev_or_next = NULL; u64 end = range_end(start, len); - rb_node = __tree_search(&tree->root, start, &prev_or_next); + rb_node = tree_search(&tree->root, start, &prev_or_next); if (!rb_node) { if (prev_or_next) rb_node = prev_or_next; @@ -527,7 +526,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree, em = rb_entry(rb_node, struct extent_map, rb_node); - if (strict && !(end > em->start && start < extent_map_end(em))) + if (strict && !(end > em->start && start < btrfs_extent_map_end(em))) return NULL; refcount_inc(&em->refs); @@ -546,10 +545,10 @@ __lookup_extent_mapping(struct extent_map_tree *tree, * intersect, so check the object returned carefully to make sure that no * additional lookups are needed. */ -struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len) +struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) { - return __lookup_extent_mapping(tree, start, len, 1); + return lookup_extent_mapping(tree, start, len, 1); } /* @@ -564,10 +563,10 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, * * If one can't be found, any nearby extent may be returned */ -struct extent_map *search_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len) +struct extent_map *btrfs_search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) { - return __lookup_extent_mapping(tree, start, len, 0); + return lookup_extent_mapping(tree, start, len, 0); } /* @@ -579,7 +578,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, * Remove @em from the extent tree of @inode. No reference counts are dropped, * and no checks are done to see if the range is in use. 
*/ -void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em) +void btrfs_remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em) { struct extent_map_tree *tree = &inode->extent_tree; @@ -605,7 +604,7 @@ static void replace_extent_mapping(struct btrfs_inode *inode, validate_extent_map(fs_info, new); WARN_ON(cur->flags & EXTENT_FLAG_PINNED); - ASSERT(extent_map_in_tree(cur)); + ASSERT(btrfs_extent_map_in_tree(cur)); if (!(cur->flags & EXTENT_FLAG_LOGGING)) list_del_init(&cur->list); rb_replace_node(&cur->rb_node, &new->rb_node, &tree->root); @@ -651,7 +650,7 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode, u64 end; u64 start_diff; - if (map_start < em->start || map_start >= extent_map_end(em)) + if (map_start < em->start || map_start >= btrfs_extent_map_end(em)) return -EINVAL; if (existing->start > map_start) { @@ -662,10 +661,10 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode, next = next_extent_map(prev); } - start = prev ? extent_map_end(prev) : em->start; + start = prev ? btrfs_extent_map_end(prev) : em->start; start = max_t(u64, start, em->start); - end = next ? next->start : extent_map_end(em); - end = min_t(u64, end, extent_map_end(em)); + end = next ? next->start : btrfs_extent_map_end(em); + end = min_t(u64, end, btrfs_extent_map_end(em)); start_diff = start - em->start; em->start = start; em->len = end - start; @@ -716,7 +715,7 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode, if (ret == -EEXIST) { struct extent_map *existing; - existing = search_extent_mapping(&inode->extent_tree, start, len); + existing = btrfs_search_extent_mapping(&inode->extent_tree, start, len); trace_btrfs_handle_em_exist(fs_info, existing, em, start, len); @@ -725,8 +724,8 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode, * extent causing the -EEXIST. */ if (start >= existing->start && - start < extent_map_end(existing)) { - free_extent_map(em); + start < btrfs_extent_map_end(existing)) { + btrfs_free_extent_map(em); *em_in = existing; ret = 0; } else { @@ -739,14 +738,14 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode, */ ret = merge_extent_mapping(inode, existing, em, start); if (WARN_ON(ret)) { - free_extent_map(em); + btrfs_free_extent_map(em); *em_in = NULL; btrfs_warn(fs_info, "extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu", - existing->start, extent_map_end(existing), + existing->start, btrfs_extent_map_end(existing), orig_start, orig_start + orig_len, start); } - free_extent_map(existing); + btrfs_free_extent_map(existing); } } @@ -772,8 +771,8 @@ static void drop_all_extent_maps_fast(struct btrfs_inode *inode) em = rb_entry(node, struct extent_map, rb_node); em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING); - remove_extent_mapping(inode, em); - free_extent_map(em); + btrfs_remove_extent_mapping(inode, em); + btrfs_free_extent_map(em); if (cond_resched_rwlock_write(&tree->lock)) node = rb_first(&tree->root); @@ -826,15 +825,15 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, * range ends after our range (and they might be the same extent map), * because we need to split those two extent maps at the boundaries. 
*/ - split = alloc_extent_map(); - split2 = alloc_extent_map(); + split = btrfs_alloc_extent_map(); + split2 = btrfs_alloc_extent_map(); write_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); + em = btrfs_lookup_extent_mapping(em_tree, start, len); while (em) { /* extent_map_end() returns exclusive value (last byte + 1). */ - const u64 em_end = extent_map_end(em); + const u64 em_end = btrfs_extent_map_end(em); struct extent_map *next_em = NULL; u64 gen; unsigned long flags; @@ -898,7 +897,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, split->generation = gen; split->flags = flags; replace_extent_mapping(inode, em, split, modified); - free_extent_map(split); + btrfs_free_extent_map(split); split = split2; split2 = NULL; } @@ -925,7 +924,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, split->ram_bytes = split->len; } - if (extent_map_in_tree(em)) { + if (btrfs_extent_map_in_tree(em)) { replace_extent_mapping(inode, em, split, modified); } else { int ret; @@ -936,11 +935,11 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, if (WARN_ON(ret != 0) && modified) btrfs_set_inode_full_sync(inode); } - free_extent_map(split); + btrfs_free_extent_map(split); split = NULL; } remove_em: - if (extent_map_in_tree(em)) { + if (btrfs_extent_map_in_tree(em)) { /* * If the extent map is still in the tree it means that * either of the following is true: @@ -965,25 +964,25 @@ remove_em: ASSERT(!split); btrfs_set_inode_full_sync(inode); } - remove_extent_mapping(inode, em); + btrfs_remove_extent_mapping(inode, em); } /* * Once for the tree reference (we replaced or removed the * extent map from the tree). */ - free_extent_map(em); + btrfs_free_extent_map(em); next: /* Once for us (for our lookup reference). */ - free_extent_map(em); + btrfs_free_extent_map(em); em = next_em; } write_unlock(&em_tree->lock); - free_extent_map(split); - free_extent_map(split2); + btrfs_free_extent_map(split); + btrfs_free_extent_map(split2); } /* @@ -1007,7 +1006,7 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode, struct extent_map_tree *tree = &inode->extent_tree; int ret; - ASSERT(!extent_map_in_tree(new_em)); + ASSERT(!btrfs_extent_map_in_tree(new_em)); /* * The caller has locked an appropriate file range in the inode's io @@ -1033,8 +1032,8 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode, * * This function is used when an ordered_extent needs to be split. 
*/ -int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, - u64 new_logical) +int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, + u64 new_logical) { struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; @@ -1046,25 +1045,25 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, ASSERT(pre != 0); ASSERT(pre < len); - split_pre = alloc_extent_map(); + split_pre = btrfs_alloc_extent_map(); if (!split_pre) return -ENOMEM; - split_mid = alloc_extent_map(); + split_mid = btrfs_alloc_extent_map(); if (!split_mid) { ret = -ENOMEM; goto out_free_pre; } - lock_extent(&inode->io_tree, start, start + len - 1, NULL); + btrfs_lock_extent(&inode->io_tree, start, start + len - 1, NULL); write_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); + em = btrfs_lookup_extent_mapping(em_tree, start, len); if (!em) { ret = -EIO; goto out_unlock; } ASSERT(em->len == len); - ASSERT(!extent_map_is_compressed(em)); + ASSERT(!btrfs_extent_map_is_compressed(em)); ASSERT(em->disk_bytenr < EXTENT_MAP_LAST_BYTE); ASSERT(em->flags & EXTENT_FLAG_PINNED); ASSERT(!(em->flags & EXTENT_FLAG_LOGGING)); @@ -1093,7 +1092,7 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, /* Insert the middle extent_map. */ split_mid->start = em->start + pre; split_mid->len = em->len - pre; - split_mid->disk_bytenr = extent_map_block_start(em) + pre; + split_mid->disk_bytenr = btrfs_extent_map_block_start(em) + pre; split_mid->disk_num_bytes = split_mid->len; split_mid->offset = 0; split_mid->ram_bytes = split_mid->len; @@ -1102,16 +1101,16 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, add_extent_mapping(inode, split_mid, 1); /* Once for us */ - free_extent_map(em); + btrfs_free_extent_map(em); /* Once for the tree */ - free_extent_map(em); + btrfs_free_extent_map(em); out_unlock: write_unlock(&em_tree->lock); - unlock_extent(&inode->io_tree, start, start + len - 1, NULL); - free_extent_map(split_mid); + btrfs_unlock_extent(&inode->io_tree, start, start + len - 1, NULL); + btrfs_free_extent_map(split_mid); out_free_pre: - free_extent_map(split_pre); + btrfs_free_extent_map(split_pre); return ret; } @@ -1168,10 +1167,10 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c if (!list_empty(&em->list) && em->generation >= cur_fs_gen) btrfs_set_inode_full_sync(inode); - remove_extent_mapping(inode, em); + btrfs_remove_extent_mapping(inode, em); trace_btrfs_extent_map_shrinker_remove_em(inode, em); /* Drop the reference for the tree. 
*/ - free_extent_map(em); + btrfs_free_extent_map(em); nr_dropped++; next: if (ctx->scanned >= ctx->nr_to_scan) diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index cd123b266b64..d4b81ee4d97b 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -108,8 +108,8 @@ struct extent_map_tree { struct btrfs_inode; -static inline void extent_map_set_compression(struct extent_map *em, - enum btrfs_compression_type type) +static inline void btrfs_extent_map_set_compression(struct extent_map *em, + enum btrfs_compression_type type) { if (type == BTRFS_COMPRESS_ZLIB) em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; @@ -119,7 +119,8 @@ static inline void extent_map_set_compression(struct extent_map *em, em->flags |= EXTENT_FLAG_COMPRESS_ZSTD; } -static inline enum btrfs_compression_type extent_map_compression(const struct extent_map *em) +static inline enum btrfs_compression_type btrfs_extent_map_compression( + const struct extent_map *em) { if (em->flags & EXTENT_FLAG_COMPRESS_ZLIB) return BTRFS_COMPRESS_ZLIB; @@ -137,50 +138,50 @@ static inline enum btrfs_compression_type extent_map_compression(const struct ex * More efficient way to determine if extent is compressed, instead of using * 'extent_map_compression() != BTRFS_COMPRESS_NONE'. */ -static inline bool extent_map_is_compressed(const struct extent_map *em) +static inline bool btrfs_extent_map_is_compressed(const struct extent_map *em) { return (em->flags & (EXTENT_FLAG_COMPRESS_ZLIB | EXTENT_FLAG_COMPRESS_LZO | EXTENT_FLAG_COMPRESS_ZSTD)) != 0; } -static inline int extent_map_in_tree(const struct extent_map *em) +static inline int btrfs_extent_map_in_tree(const struct extent_map *em) { return !RB_EMPTY_NODE(&em->rb_node); } -static inline u64 extent_map_block_start(const struct extent_map *em) +static inline u64 btrfs_extent_map_block_start(const struct extent_map *em) { if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { - if (extent_map_is_compressed(em)) + if (btrfs_extent_map_is_compressed(em)) return em->disk_bytenr; return em->disk_bytenr + em->offset; } return em->disk_bytenr; } -static inline u64 extent_map_end(const struct extent_map *em) +static inline u64 btrfs_extent_map_end(const struct extent_map *em) { if (em->start + em->len < em->start) return (u64)-1; return em->start + em->len; } -void extent_map_tree_init(struct extent_map_tree *tree); -struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len); -void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em); -int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, - u64 new_logical); - -struct extent_map *alloc_extent_map(void); -void free_extent_map(struct extent_map *em); -int __init extent_map_init(void); -void __cold extent_map_exit(void); -int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen); -void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em); -struct extent_map *search_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len); +void btrfs_extent_map_tree_init(struct extent_map_tree *tree); +struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); +void btrfs_remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em); +int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, + u64 new_logical); + +struct extent_map *btrfs_alloc_extent_map(void); +void btrfs_free_extent_map(struct extent_map *em); +int __init btrfs_extent_map_init(void); +void __cold 
btrfs_extent_map_exit(void); +int btrfs_unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen); +void btrfs_clear_em_logging(struct btrfs_inode *inode, struct extent_map *em); +struct extent_map *btrfs_search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); int btrfs_add_extent_mapping(struct btrfs_inode *inode, struct extent_map **em_in, u64 start, u64 len); void btrfs_drop_extent_map_range(struct btrfs_inode *inode, diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c index b80c07ad8c5e..43bf0979fd53 100644 --- a/fs/btrfs/fiemap.c +++ b/fs/btrfs/fiemap.c @@ -634,7 +634,7 @@ static int extent_fiemap(struct btrfs_inode *inode, const u64 ino = btrfs_ino(inode); struct extent_state *cached_state = NULL; struct extent_state *delalloc_cached_state = NULL; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct fiemap_cache cache = { 0 }; struct btrfs_backref_share_check_ctx *backref_ctx; u64 last_extent_end = 0; @@ -661,7 +661,7 @@ restart: range_end = round_up(start + len, sectorsize); prev_extent_end = range_start; - lock_extent(&inode->io_tree, range_start, range_end, &cached_state); + btrfs_lock_extent(&inode->io_tree, range_start, range_end, &cached_state); ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); if (ret < 0) @@ -841,7 +841,7 @@ check_eof_delalloc: } out_unlock: - unlock_extent(&inode->io_tree, range_start, range_end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, range_start, range_end, &cached_state); if (ret == BTRFS_FIEMAP_FLUSH_CACHE) { btrfs_release_path(path); @@ -871,10 +871,9 @@ out_unlock: ret = emit_last_fiemap_cache(fieinfo, &cache); out: - free_extent_state(delalloc_cached_state); + btrfs_free_extent_state(delalloc_cached_state); kfree(cache.entries); btrfs_free_backref_share_ctx(backref_ctx); - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 344b4db487a0..54d523d4f421 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -46,7 +46,7 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size) { u64 start, end, i_size; - int ret; + bool found; spin_lock(&inode->lock); i_size = new_i_size ?: i_size_read(&inode->vfs_inode); @@ -55,9 +55,9 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz goto out_unlock; } - ret = find_contiguous_extent_bit(inode->file_extent_tree, 0, &start, - &end, EXTENT_DIRTY); - if (!ret && start == 0) + found = btrfs_find_contiguous_extent_bit(inode->file_extent_tree, 0, &start, + &end, EXTENT_DIRTY); + if (found && start == 0) i_size = min(i_size, end + 1); else i_size = 0; @@ -91,8 +91,8 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize)); - return set_extent_bit(inode->file_extent_tree, start, start + len - 1, - EXTENT_DIRTY, NULL); + return btrfs_set_extent_bit(inode->file_extent_tree, start, start + len - 1, + EXTENT_DIRTY, NULL); } /* @@ -121,8 +121,8 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) || len == (u64)-1); - return clear_extent_bit(inode->file_extent_tree, start, - start + len - 1, EXTENT_DIRTY, NULL); + return btrfs_clear_extent_bit(inode->file_extent_tree, start, + start + len - 1, EXTENT_DIRTY, NULL); } static size_t bytes_to_csum_size(const struct btrfs_fs_info *fs_info, u32 bytes) @@ -336,7 +336,7 @@ out: * * Return: BLK_STS_RESOURCE if 
allocating memory fails, BLK_STS_OK otherwise. */ -blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) +int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -347,12 +347,12 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) u32 orig_len = bio->bi_iter.bi_size; u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; - blk_status_t ret = BLK_STS_OK; + int ret = 0; u32 bio_offset = 0; if ((inode->flags & BTRFS_INODE_NODATASUM) || test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state)) - return BLK_STS_OK; + return 0; /* * This function is only called for read bio. @@ -369,12 +369,12 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) ASSERT(bio_op(bio) == REQ_OP_READ); path = btrfs_alloc_path(); if (!path) - return BLK_STS_RESOURCE; + return -ENOMEM; if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); if (!bbio->csum) - return BLK_STS_RESOURCE; + return -ENOMEM; } else { bbio->csum = bbio->csum_inline; } @@ -406,7 +406,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) count = search_csum_tree(fs_info, path, cur_disk_bytenr, orig_len - bio_offset, csum_dst); if (count < 0) { - ret = errno_to_blk_status(count); + ret = count; if (bbio->csum != bbio->csum_inline) kfree(bbio->csum); bbio->csum = NULL; @@ -430,9 +430,9 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) if (btrfs_root_id(inode->root) == BTRFS_DATA_RELOC_TREE_OBJECTID) { u64 file_offset = bbio->file_offset + bio_offset; - set_extent_bit(&inode->io_tree, file_offset, - file_offset + sectorsize - 1, - EXTENT_NODATASUM, NULL); + btrfs_set_extent_bit(&inode->io_tree, file_offset, + file_offset + sectorsize - 1, + EXTENT_NODATASUM, NULL); } else { btrfs_warn_rl(fs_info, "csum hole found for disk bytenr range [%llu, %llu)", @@ -735,7 +735,7 @@ fail: /* * Calculate checksums of the data contained inside a bio. */ -blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) +int btrfs_csum_one_bio(struct btrfs_bio *bbio) { struct btrfs_ordered_extent *ordered = bbio->ordered; struct btrfs_inode *inode = bbio->inode; @@ -757,7 +757,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) memalloc_nofs_restore(nofs_flag); if (!sums) - return BLK_STS_RESOURCE; + return -ENOMEM; sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); @@ -794,11 +794,11 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) * record the updated logical address on Zone Append completion. * Allocate just the structure with an empty sums array here for that case. 
*/ -blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio) +int btrfs_alloc_dummy_sum(struct btrfs_bio *bbio) { bbio->sums = kmalloc(sizeof(*bbio->sums), GFP_NOFS); if (!bbio->sums) - return BLK_STS_RESOURCE; + return -ENOMEM; bbio->sums->len = bbio->bio.bi_iter.bi_size; bbio->sums->logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; btrfs_add_ordered_sum(bbio->ordered, bbio->sums); @@ -1048,7 +1048,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key file_key; struct btrfs_key found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_csum_item *item; struct btrfs_csum_item *item_end; struct extent_buffer *leaf = NULL; @@ -1259,7 +1259,6 @@ found: goto again; } out: - btrfs_free_path(path); return ret; } @@ -1297,7 +1296,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, em->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); em->offset = btrfs_file_extent_offset(leaf, fi); if (compress_type != BTRFS_COMPRESS_NONE) { - extent_map_set_compression(em, compress_type); + btrfs_extent_map_set_compression(em, compress_type); } else { /* * Older kernels can create regular non-hole data @@ -1317,7 +1316,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, em->start = 0; em->len = fs_info->sectorsize; em->offset = 0; - extent_map_set_compression(em, compress_type); + btrfs_extent_map_set_compression(em, compress_type); } else { btrfs_err(fs_info, "unknown file extent item type %d, inode %llu, offset %llu, " diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 6181a70ec3ef..63216c43676d 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -53,7 +53,7 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); -blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio); +int btrfs_lookup_bio_sums(struct btrfs_bio *bbio); int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, u64 num_bytes); @@ -64,8 +64,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); -blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio); -blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio); +int btrfs_csum_one_bio(struct btrfs_bio *bbio); +int btrfs_alloc_dummy_sum(struct btrfs_bio *bbio); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, bool nowait); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 262a707d8990..8ce6f45f45e0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -98,9 +98,9 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos * The pages may have already been dirty, clear out old accounting so * we can set things up properly */ - clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - cached); + btrfs_clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + cached); ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, extra_bits, cached); @@ -508,20 +508,19 @@ out: return ret; } -static int extent_mergeable(struct extent_buffer *leaf, int slot, - u64 objectid, u64 bytenr, u64 orig_offset, - 
u64 *start, u64 *end) +static bool extent_mergeable(struct extent_buffer *leaf, int slot, u64 objectid, + u64 bytenr, u64 orig_offset, u64 *start, u64 *end) { struct btrfs_file_extent_item *fi; struct btrfs_key key; u64 extent_end; if (slot < 0 || slot >= btrfs_header_nritems(leaf)) - return 0; + return false; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY) - return 0; + return false; fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || @@ -530,15 +529,15 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot, btrfs_file_extent_compression(leaf, fi) || btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) - return 0; + return false; extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); if ((*start && *start != key.offset) || (*end && *end != extent_end)) - return 0; + return false; *start = key.offset; *end = extent_end; - return 1; + return true; } /* @@ -553,7 +552,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; struct extent_buffer *leaf; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_file_extent_item *fi; struct btrfs_ref ref = { 0 }; struct btrfs_key key; @@ -791,7 +790,6 @@ again: } } out: - btrfs_free_path(path); return ret; } @@ -800,7 +798,7 @@ out: * On success return a locked folio and 0 */ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos, - u64 len, bool force_uptodate) + u64 len) { u64 clamp_start = max_t(u64, pos, folio_pos(folio)); u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio)); @@ -810,8 +808,7 @@ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 if (folio_test_uptodate(folio)) return 0; - if (!force_uptodate && - IS_ALIGNED(clamp_start, blocksize) && + if (IS_ALIGNED(clamp_start, blocksize) && IS_ALIGNED(clamp_end, blocksize)) return 0; @@ -858,32 +855,27 @@ static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) */ static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret, loff_t pos, size_t write_bytes, - bool force_uptodate, bool nowait) + bool nowait) { unsigned long index = pos >> PAGE_SHIFT; gfp_t mask = get_prepare_gfp_flags(inode, nowait); - fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN); + fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) | + fgf_set_order(write_bytes); struct folio *folio; int ret = 0; again: folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask); - if (IS_ERR(folio)) { - if (nowait) - ret = -EAGAIN; - else - ret = PTR_ERR(folio); - return ret; - } - /* Only support page sized folio yet. */ - ASSERT(folio_order(folio) == 0); + if (IS_ERR(folio)) + return PTR_ERR(folio); + ret = set_folio_extent_mapped(folio); if (ret < 0) { folio_unlock(folio); folio_put(folio); return ret; } - ret = prepare_uptodate_folio(inode, folio, pos, write_bytes, force_uptodate); + ret = prepare_uptodate_folio(inode, folio, pos, write_bytes); if (ret) { /* The folio is already unlocked. 
*/ folio_put(folio); @@ -924,14 +916,15 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, struct btrfs_ordered_extent *ordered; if (nowait) { - if (!try_lock_extent(&inode->io_tree, start_pos, last_pos, - cached_state)) { + if (!btrfs_try_lock_extent(&inode->io_tree, start_pos, + last_pos, cached_state)) { folio_unlock(folio); folio_put(folio); return -EAGAIN; } } else { - lock_extent(&inode->io_tree, start_pos, last_pos, cached_state); + btrfs_lock_extent(&inode->io_tree, start_pos, last_pos, + cached_state); } ordered = btrfs_lookup_ordered_range(inode, start_pos, @@ -939,8 +932,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, if (ordered && ordered->file_offset + ordered->num_bytes > start_pos && ordered->file_offset <= last_pos) { - unlock_extent(&inode->io_tree, start_pos, last_pos, - cached_state); + btrfs_unlock_extent(&inode->io_tree, start_pos, last_pos, + cached_state); folio_unlock(folio); folio_put(folio); btrfs_start_ordered_extent(ordered); @@ -1020,7 +1013,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, else *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); - unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); return ret; } @@ -1077,241 +1070,306 @@ int btrfs_write_check(struct kiocb *iocb, size_t count) return 0; } -ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) +static void release_space(struct btrfs_inode *inode, struct extent_changeset *data_reserved, + u64 start, u64 len, bool only_release_metadata) { - struct file *file = iocb->ki_filp; - loff_t pos; - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct extent_changeset *data_reserved = NULL; - u64 release_bytes = 0; - u64 lockstart; - u64 lockend; - size_t num_written = 0; - ssize_t ret; - loff_t old_isize; - unsigned int ilock_flags = 0; - const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); - unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); - bool only_release_metadata = false; - - if (nowait) - ilock_flags |= BTRFS_ILOCK_TRY; + if (len == 0) + return; - ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); - if (ret < 0) - return ret; + if (only_release_metadata) { + btrfs_check_nocow_unlock(inode); + btrfs_delalloc_release_metadata(inode, len, true); + } else { + const struct btrfs_fs_info *fs_info = inode->root->fs_info; - /* - * We can only trust the isize with inode lock held, or it can race with - * other buffered writes and cause incorrect call of - * pagecache_isize_extended() to overwrite existing data. - */ - old_isize = i_size_read(inode); + btrfs_delalloc_release_space(inode, data_reserved, + round_down(start, fs_info->sectorsize), + len, true); + } +} - ret = generic_write_checks(iocb, i); - if (ret <= 0) - goto out; +/* + * Reserve data and metadata space for this buffered write range. + * + * Return >0 for the number of bytes reserved, which is always block aligned. + * Return <0 for error. 
+ */ +static ssize_t reserve_space(struct btrfs_inode *inode, + struct extent_changeset **data_reserved, + u64 start, size_t *len, bool nowait, + bool *only_release_metadata) +{ + const struct btrfs_fs_info *fs_info = inode->root->fs_info; + const unsigned int block_offset = (start & (fs_info->sectorsize - 1)); + size_t reserve_bytes; + int ret; - ret = btrfs_write_check(iocb, ret); - if (ret < 0) - goto out; + ret = btrfs_check_data_free_space(inode, data_reserved, start, *len, nowait); + if (ret < 0) { + int can_nocow; - pos = iocb->ki_pos; - while (iov_iter_count(i) > 0) { - struct extent_state *cached_state = NULL; - size_t offset = offset_in_page(pos); - size_t sector_offset; - size_t write_bytes = min(iov_iter_count(i), PAGE_SIZE - offset); - size_t reserve_bytes; - size_t copied; - size_t dirty_sectors; - size_t num_sectors; - struct folio *folio = NULL; - int extents_locked; - bool force_page_uptodate = false; + if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) + return -EAGAIN; /* - * Fault pages before locking them in prepare_one_folio() - * to avoid recursive lock + * If we don't have to COW at the offset, reserve metadata only. + * write_bytes may get smaller than requested here. */ - if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) { - ret = -EFAULT; - break; - } + can_nocow = btrfs_check_nocow_lock(inode, start, len, nowait); + if (can_nocow < 0) + ret = can_nocow; + if (can_nocow > 0) + ret = 0; + if (ret) + return ret; + *only_release_metadata = true; + } - only_release_metadata = false; - sector_offset = pos & (fs_info->sectorsize - 1); + reserve_bytes = round_up(*len + block_offset, fs_info->sectorsize); + WARN_ON(reserve_bytes == 0); + ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes, + reserve_bytes, nowait); + if (ret) { + if (!*only_release_metadata) + btrfs_free_reserved_data_space(inode, *data_reserved, + start, *len); + else + btrfs_check_nocow_unlock(inode); - extent_changeset_release(data_reserved); - ret = btrfs_check_data_free_space(BTRFS_I(inode), - &data_reserved, pos, - write_bytes, nowait); - if (ret < 0) { - int can_nocow; + if (nowait && ret == -ENOSPC) + ret = -EAGAIN; + return ret; + } + return reserve_bytes; +} - if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) { - ret = -EAGAIN; - break; - } +/* Shrink the reserved data and metadata space from @reserved_len to @new_len. */ +static void shrink_reserved_space(struct btrfs_inode *inode, + struct extent_changeset *data_reserved, + u64 reserved_start, u64 reserved_len, + u64 new_len, bool only_release_metadata) +{ + const u64 diff = reserved_len - new_len; - /* - * If we don't have to COW at the offset, reserve - * metadata only. write_bytes may get smaller than - * requested here. 
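A note on the rounding above: reserve_space() reserves data and metadata for the write (falling back to metadata-only when the range can go NOCOW), and the reserved byte count is always widened to whole blocks by folding the unaligned head of the write into the length before rounding up. A rough, self-contained sketch of that math (userspace C; the 4K sector size, helper and variable names are assumptions for illustration, not kernel API):

#include <stdint.h>
#include <stdio.h>

#define SECTORSIZE 4096ull

/* Round x up to a multiple of the power-of-two a. */
static uint64_t round_up_u64(uint64_t x, uint64_t a)
{
	return (x + a - 1) & ~(a - 1);
}

int main(void)
{
	uint64_t start = 6000, len = 100;	/* unaligned 100 byte write */
	uint64_t block_offset = start & (SECTORSIZE - 1);
	uint64_t reserve_bytes = round_up_u64(len + block_offset, SECTORSIZE);

	/* A 100 byte write at offset 6000 still reserves one full 4K block. */
	printf("reserve %llu bytes\n", (unsigned long long)reserve_bytes);
	return 0;
}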
- */ - can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos, - &write_bytes, nowait); - if (can_nocow < 0) - ret = can_nocow; - if (can_nocow > 0) - ret = 0; - if (ret) - break; - only_release_metadata = true; - } + ASSERT(new_len <= reserved_len); + btrfs_delalloc_shrink_extents(inode, reserved_len, new_len); + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, diff, true); + else + btrfs_delalloc_release_space(inode, data_reserved, + reserved_start + new_len, diff, true); +} - reserve_bytes = round_up(write_bytes + sector_offset, - fs_info->sectorsize); - WARN_ON(reserve_bytes == 0); - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), - reserve_bytes, - reserve_bytes, nowait); - if (ret) { - if (!only_release_metadata) - btrfs_free_reserved_data_space(BTRFS_I(inode), - data_reserved, pos, - write_bytes); - else - btrfs_check_nocow_unlock(BTRFS_I(inode)); +/* Calculate the maximum amount of bytes we can write into one folio. */ +static size_t calc_write_bytes(const struct btrfs_inode *inode, + const struct iov_iter *iter, u64 start) +{ + const size_t max_folio_size = mapping_max_folio_size(inode->vfs_inode.i_mapping); - if (nowait && ret == -ENOSPC) - ret = -EAGAIN; - break; - } + return min(max_folio_size - (start & (max_folio_size - 1)), + iov_iter_count(iter)); +} + +/* + * Do the heavy-lifting work to copy one range into one folio of the page cache. + * + * Return > 0 in case we copied all bytes or just some of them. + * Return 0 if no bytes were copied, in which case the caller should retry. + * Return <0 on error. + */ +static int copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter, + struct extent_changeset **data_reserved, u64 start, + bool nowait) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_state *cached_state = NULL; + size_t write_bytes = calc_write_bytes(inode, iter, start); + size_t copied; + const u64 reserved_start = round_down(start, fs_info->sectorsize); + u64 reserved_len; + struct folio *folio = NULL; + int extents_locked; + u64 lockstart; + u64 lockend; + bool only_release_metadata = false; + const unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); + int ret; + + /* + * Fault all pages before locking them in prepare_one_folio() to avoid + * recursive lock. + */ + if (unlikely(fault_in_iov_iter_readable(iter, write_bytes))) + return -EFAULT; + extent_changeset_release(*data_reserved); + ret = reserve_space(inode, data_reserved, start, &write_bytes, nowait, + &only_release_metadata); + if (ret < 0) + return ret; + reserved_len = ret; + /* Write range must be inside the reserved range. 
*/ + ASSERT(reserved_start <= start); + ASSERT(start + write_bytes <= reserved_start + reserved_len); - release_bytes = reserve_bytes; again: - ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags); - if (ret) { - btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); - break; - } + ret = balance_dirty_pages_ratelimited_flags(inode->vfs_inode.i_mapping, + bdp_flags); + if (ret) { + btrfs_delalloc_release_extents(inode, reserved_len); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + return ret; + } - ret = prepare_one_folio(inode, &folio, pos, write_bytes, - force_page_uptodate, false); - if (ret) { - btrfs_delalloc_release_extents(BTRFS_I(inode), - reserve_bytes); - break; - } + ret = prepare_one_folio(&inode->vfs_inode, &folio, start, write_bytes, false); + if (ret) { + btrfs_delalloc_release_extents(inode, reserved_len); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + return ret; + } + + /* + * The reserved range goes beyond the current folio, shrink the reserved + * space to the folio boundary. + */ + if (reserved_start + reserved_len > folio_pos(folio) + folio_size(folio)) { + const u64 last_block = folio_pos(folio) + folio_size(folio); + + shrink_reserved_space(inode, *data_reserved, reserved_start, + reserved_len, last_block - reserved_start, + only_release_metadata); + write_bytes = last_block - start; + reserved_len = last_block - reserved_start; + } + + extents_locked = lock_and_cleanup_extent_if_need(inode, folio, start, + write_bytes, &lockstart, + &lockend, nowait, + &cached_state); + if (extents_locked < 0) { + if (!nowait && extents_locked == -EAGAIN) + goto again; - extents_locked = lock_and_cleanup_extent_if_need(BTRFS_I(inode), - folio, pos, write_bytes, &lockstart, - &lockend, nowait, &cached_state); - if (extents_locked < 0) { - if (!nowait && extents_locked == -EAGAIN) - goto again; + btrfs_delalloc_release_extents(inode, reserved_len); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + ret = extents_locked; + return ret; + } - btrfs_delalloc_release_extents(BTRFS_I(inode), - reserve_bytes); - ret = extents_locked; - break; - } + copied = copy_folio_from_iter_atomic(folio, offset_in_folio(folio, start), + write_bytes, iter); + flush_dcache_folio(folio); - copied = copy_folio_from_iter_atomic(folio, - offset_in_folio(folio, pos), write_bytes, i); - flush_dcache_folio(folio); + if (unlikely(copied < write_bytes)) { + u64 last_block; /* - * If we get a partial write, we can end up with partially - * uptodate page. Although if sector size < page size we can - * handle it, but if it's not sector aligned it can cause - * a lot of complexity, so make sure they don't happen by - * forcing retry this copy. + * The original write range doesn't need an uptodate folio as + * the range is block aligned. But now a short copy happened. + * We cannot handle it without an uptodate folio. + * + * So just revert the range and we will retry. */ - if (unlikely(copied < write_bytes)) { - if (!folio_test_uptodate(folio)) { - iov_iter_revert(i, copied); - copied = 0; - } + if (!folio_test_uptodate(folio)) { + iov_iter_revert(iter, copied); + copied = 0; } - num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes); - dirty_sectors = round_up(copied + sector_offset, - fs_info->sectorsize); - dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors); - + /* No copied bytes, unlock, release reserved space and exit. 
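Since fgf_set_order() can hand back a folio smaller than the reserved range, copy_one_range() above clamps both the copy length and the reservation to the folio boundary and leaves the remainder to the next loop iteration. A rough standalone sketch of the clamp, with illustrative values (the 16K folio and all names are assumptions, not kernel API):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t folio_pos = 0, folio_size = 16384;	/* folio covers [0, 16K) */
	uint64_t start = 12288, write_bytes = 8192;	/* write wants [12K, 20K) */
	uint64_t reserved_start = 12288, reserved_len = 8192;
	uint64_t folio_end = folio_pos + folio_size;

	if (reserved_start + reserved_len > folio_end) {
		/* Shrink copy length and reservation to the folio boundary. */
		write_bytes = folio_end - start;		/* 4K */
		reserved_len = folio_end - reserved_start;	/* 4K */
	}
	printf("copy %llu bytes now, keep %llu bytes reserved\n",
	       (unsigned long long)write_bytes,
	       (unsigned long long)reserved_len);
	return 0;	/* the remaining 4K is copied on the next iteration */
}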
*/ if (copied == 0) { - force_page_uptodate = true; - dirty_sectors = 0; - } else { - force_page_uptodate = false; + if (extents_locked) + btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, + &cached_state); + else + btrfs_free_extent_state(cached_state); + btrfs_delalloc_release_extents(inode, reserved_len); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + btrfs_drop_folio(fs_info, folio, start, copied); + return 0; } - if (num_sectors > dirty_sectors) { - /* release everything except the sectors we dirtied */ - release_bytes -= dirty_sectors << fs_info->sectorsize_bits; - if (only_release_metadata) { - btrfs_delalloc_release_metadata(BTRFS_I(inode), - release_bytes, true); - } else { - u64 release_start = round_up(pos + copied, - fs_info->sectorsize); - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, release_start, - release_bytes, true); - } - } + /* Release the reserved space beyond the last block. */ + last_block = round_up(start + copied, fs_info->sectorsize); + + shrink_reserved_space(inode, *data_reserved, reserved_start, + reserved_len, last_block - reserved_start, + only_release_metadata); + reserved_len = last_block - reserved_start; + } - release_bytes = round_up(copied + sector_offset, - fs_info->sectorsize); + ret = btrfs_dirty_folio(inode, folio, start, copied, &cached_state, + only_release_metadata); + /* + * If we have not locked the extent range, because the range's start + * offset is >= i_size, we might still have a non-NULL cached extent + * state, acquired while marking the extent range as delalloc through + * btrfs_dirty_folio(). Therefore free any possible cached extent state + * to avoid a memory leak. + */ + if (extents_locked) + btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + else + btrfs_free_extent_state(cached_state); - ret = btrfs_dirty_folio(BTRFS_I(inode), folio, pos, copied, - &cached_state, only_release_metadata); + btrfs_delalloc_release_extents(inode, reserved_len); + if (ret) { + btrfs_drop_folio(fs_info, folio, start, copied); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + return ret; + } + if (only_release_metadata) + btrfs_check_nocow_unlock(inode); - /* - * If we have not locked the extent range, because the range's - * start offset is >= i_size, we might still have a non-NULL - * cached extent state, acquired while marking the extent range - * as delalloc through btrfs_dirty_page(). Therefore free any - * possible cached extent state to avoid a memory leak.
- */ - if (extents_locked) - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state); - else - free_extent_state(cached_state); + btrfs_drop_folio(fs_info, folio, start, copied); + return copied; +} - btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); - if (ret) { - btrfs_drop_folio(fs_info, folio, pos, copied); - break; - } +ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + loff_t pos; + struct inode *inode = file_inode(file); + struct extent_changeset *data_reserved = NULL; + size_t num_written = 0; + ssize_t ret; + loff_t old_isize; + unsigned int ilock_flags = 0; + const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); - release_bytes = 0; - if (only_release_metadata) - btrfs_check_nocow_unlock(BTRFS_I(inode)); + if (nowait) + ilock_flags |= BTRFS_ILOCK_TRY; - btrfs_drop_folio(fs_info, folio, pos, copied); + ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); + if (ret < 0) + return ret; - cond_resched(); + /* + * We can only trust the isize with inode lock held, or it can race with + * other buffered writes and cause incorrect call of + * pagecache_isize_extended() to overwrite existing data. + */ + old_isize = i_size_read(inode); - pos += copied; - num_written += copied; - } + ret = generic_write_checks(iocb, iter); + if (ret <= 0) + goto out; - if (release_bytes) { - if (only_release_metadata) { - btrfs_check_nocow_unlock(BTRFS_I(inode)); - btrfs_delalloc_release_metadata(BTRFS_I(inode), - release_bytes, true); - } else { - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, - round_down(pos, fs_info->sectorsize), - release_bytes, true); - } + ret = btrfs_write_check(iocb, ret); + if (ret < 0) + goto out; + + pos = iocb->ki_pos; + while (iov_iter_count(iter) > 0) { + ret = copy_one_range(BTRFS_I(inode), iter, &data_reserved, pos, nowait); + if (ret < 0) + break; + pos += ret; + num_written += ret; + cond_resched(); } extent_changeset_free(data_reserved); @@ -1406,7 +1464,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp) if (private) { kfree(private->filldir_buf); - free_extent_state(private->llseek_cached_state); + btrfs_free_extent_state(private->llseek_cached_state); kfree(private); filp->private_data = NULL; } @@ -1783,16 +1841,12 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) unsigned long zero_start; loff_t size; size_t fsize = folio_size(folio); - vm_fault_t ret; - int ret2; - int reserved = 0; + int ret; u64 reserved_space; u64 page_start; u64 page_end; u64 end; - ASSERT(folio_order(folio) == 0); - reserved_space = fsize; sb_start_pagefault(inode->i_sb); @@ -1808,21 +1862,14 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. */ - ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - page_start, reserved_space); - if (!ret2) { - ret2 = file_update_time(vmf->vma->vm_file); - reserved = 1; - } - if (ret2) { - ret = vmf_error(ret2); - if (reserved) - goto out; + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, + page_start, reserved_space); + if (ret < 0) goto out_noreserve; - } - /* Make the VM retry the fault. 
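The rewritten btrfs_buffered_write() above is now a thin loop over copy_one_range(). A self-contained sketch of the return-value contract that loop depends on; copy_one() here is a stand-in for illustration only, not a kernel function:

#include <stddef.h>
#include <stdio.h>

/*
 * Contract mirrored from copy_one_range(): >0 is bytes of progress,
 * 0 means "nothing copied, retry at the same position" (e.g. a short
 * copy into a non-uptodate folio), <0 is an errno-style failure.
 */
static long copy_one(size_t pos, size_t remaining)
{
	size_t chunk = remaining > 4096 ? 4096 : remaining;

	(void)pos;
	return (long)chunk;	/* pretend the whole chunk was copied */
}

int main(void)
{
	size_t pos = 0, count = 10000;
	long written = 0;

	while (count > 0) {
		long ret = copy_one(pos, count);

		if (ret < 0)
			break;		/* report bytes written if any, else the error */
		pos += (size_t)ret;	/* ret == 0 simply loops and retries */
		written += ret;
		count -= (size_t)ret;
	}
	printf("wrote %ld bytes\n", written);
	return 0;
}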
*/ - ret = VM_FAULT_NOPAGE; + ret = file_update_time(vmf->vma->vm_file); + if (ret < 0) + goto out; again: down_read(&BTRFS_I(inode)->i_mmap_lock); folio_lock(folio); @@ -1835,11 +1882,10 @@ again: } folio_wait_writeback(folio); - lock_extent(io_tree, page_start, page_end, &cached_state); - ret2 = set_folio_extent_mapped(folio); - if (ret2 < 0) { - ret = vmf_error(ret2); - unlock_extent(io_tree, page_start, page_end, &cached_state); + btrfs_lock_extent(io_tree, page_start, page_end, &cached_state); + ret = set_folio_extent_mapped(folio); + if (ret < 0) { + btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); goto out_unlock; } @@ -1849,7 +1895,7 @@ again: */ ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, fsize); if (ordered) { - unlock_extent(io_tree, page_start, page_end, &cached_state); + btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); folio_unlock(folio); up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_start_ordered_extent(ordered); @@ -1857,12 +1903,12 @@ again: goto again; } - if (folio->index == ((size - 1) >> PAGE_SHIFT)) { + if (folio_contains(folio, (size - 1) >> PAGE_SHIFT)) { reserved_space = round_up(size - page_start, fs_info->sectorsize); if (reserved_space < fsize) { end = page_start + reserved_space - 1; btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, page_start, + data_reserved, end + 1, fsize - reserved_space, true); } } @@ -1874,15 +1920,14 @@ again: * clear any delalloc bits within this page range since we have to * reserve data&meta space before lock_page() (see above comments). */ - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, &cached_state); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, &cached_state); - ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, &cached_state); - if (ret2) { - unlock_extent(io_tree, page_start, page_end, &cached_state); - ret = VM_FAULT_SIGBUS; + if (ret < 0) { + btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); goto out_unlock; } @@ -1901,7 +1946,7 @@ again: btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); - unlock_extent(io_tree, page_start, page_end, &cached_state); + btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); @@ -1915,11 +1960,16 @@ out_unlock: out: btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, - reserved_space, (ret != 0)); + reserved_space, true); + extent_changeset_free(data_reserved); out_noreserve: sb_end_pagefault(inode->i_sb); - extent_changeset_free(data_reserved); - return ret; + + if (ret < 0) + return vmf_error(ret); + + /* Make the VM retry the fault. 
*/ + return VM_FAULT_NOPAGE; } static const struct vm_operations_struct btrfs_file_vm_ops = { @@ -1941,33 +1991,33 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) return 0; } -static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf, - int slot, u64 start, u64 end) +static bool hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf, + int slot, u64 start, u64 end) { struct btrfs_file_extent_item *fi; struct btrfs_key key; if (slot < 0 || slot >= btrfs_header_nritems(leaf)) - return 0; + return false; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) - return 0; + return false; fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) - return 0; + return false; if (btrfs_file_extent_disk_bytenr(leaf, fi)) - return 0; + return false; if (key.offset == end) - return 1; + return true; if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start) - return 1; - return 0; + return true; + return false; } static int fill_holes(struct btrfs_trans_handle *trans, @@ -2041,7 +2091,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); - hole_em = alloc_extent_map(); + hole_em = btrfs_alloc_extent_map(); if (!hole_em) { btrfs_drop_extent_map_range(inode, offset, end - 1, false); btrfs_set_inode_full_sync(inode); @@ -2055,7 +2105,7 @@ out: hole_em->generation = trans->transid; ret = btrfs_replace_extent_map_range(inode, hole_em, true); - free_extent_map(hole_em); + btrfs_free_extent_map(hole_em); if (ret) btrfs_set_inode_full_sync(inode); } @@ -2088,15 +2138,33 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) 0 : *start + *len - em->start - em->len; *start = em->start + em->len; } - free_extent_map(em); + btrfs_free_extent_map(em); return ret; } -static void btrfs_punch_hole_lock_range(struct inode *inode, - const u64 lockstart, - const u64 lockend, - struct extent_state **cached_state) +/* + * Check if there is no folio in the range. + * + * We cannot utilize filemap_range_has_page() in a filemap with large folios + * as we can hit the following false positive: + * + * start end + * | | + * |//|//|//|//| | | | | | | | |//|//| + * \ / \ / + * Folio A Folio B + * + * Large folios A and B cover the start and end indexes. + * In that case filemap_range_has_page() will always return true, but the above + * case is fine for btrfs_punch_hole_lock_range() usage. + * + * So here we only ensure that no other folio is in the range, excluding the + * head/tail large folio. + */ +static bool check_range_has_page(struct inode *inode, u64 start, u64 end) { + struct folio_batch fbatch; + bool ret = false; /* * For subpage case, if the range is not at page boundary, we could * have pages at the leading/tailing part of the range. * This could lead to dead loop since filemap_range_has_page() * will always return true. * So here we need to do extra page alignment for * filemap_range_has_page(). + * + * And do not decrease page_lockend right now, as it can be 0.
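A standalone sketch of the page alignment used by check_range_has_page(), showing why page_lockend is deliberately left un-decremented: the round-down can legitimately produce 0, and the same-or-adjacent-pages case then falls out of the comparison page_lockend <= page_lockstart instead of underflowing to (u64)-1. Userspace C, illustrative names only:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE_U64 4096ull

static uint64_t round_up_u64(uint64_t x, uint64_t a)
{
	return (x + a - 1) & ~(a - 1);
}

static uint64_t round_down_u64(uint64_t x, uint64_t a)
{
	return x & ~(a - 1);
}

int main(void)
{
	uint64_t start = 100, end = 2000;	/* range inside the first page */
	uint64_t page_lockstart = round_up_u64(start, PAGE_SIZE_U64);	/* 4096 */
	uint64_t page_lockend = round_down_u64(end + 1, PAGE_SIZE_U64);	/* 0 */

	/* Decrementing page_lockend here would underflow to (u64)-1. */
	if (page_lockend <= page_lockstart)
		printf("no fully covered page inside [%llu, %llu]\n",
		       (unsigned long long)start, (unsigned long long)end);
	return 0;
}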
*/ - const u64 page_lockstart = round_up(lockstart, PAGE_SIZE); - const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1; + const u64 page_lockstart = round_up(start, PAGE_SIZE); + const u64 page_lockend = round_down(end + 1, PAGE_SIZE); + const pgoff_t start_index = page_lockstart >> PAGE_SHIFT; + const pgoff_t end_index = (page_lockend - 1) >> PAGE_SHIFT; + pgoff_t tmp = start_index; + int found_folios; + + /* The same page or adjacent pages. */ + if (page_lockend <= page_lockstart) + return false; + + folio_batch_init(&fbatch); + found_folios = filemap_get_folios(inode->i_mapping, &tmp, end_index, &fbatch); + for (int i = 0; i < found_folios; i++) { + struct folio *folio = fbatch.folios[i]; + + /* A large folio begins before the start. Not a target. */ + if (folio->index < start_index) + continue; + /* A large folio extends beyond the end. Not a target. */ + if (folio->index + folio_nr_pages(folio) > end_index) + continue; + /* A folio doesn't cover the head/tail index. Found a target. */ + ret = true; + break; + } + folio_batch_release(&fbatch); + return ret; +} +static void btrfs_punch_hole_lock_range(struct inode *inode, + const u64 lockstart, const u64 lockend, + struct extent_state **cached_state) +{ while (1) { truncate_pagecache_range(inode, lockstart, lockend); - lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - cached_state); + btrfs_lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state); /* * We can't have ordered extents in the range, nor dirty/writeback * pages, because we have locked the inode's VFS lock in exclusive @@ -2123,12 +2224,11 @@ static void btrfs_punch_hole_lock_range(struct inode *inode, * locking the range check if we have pages in the range, and if * we do, unlock the range and retry. */ - if (!filemap_range_has_page(inode->i_mapping, page_lockstart, - page_lockend)) + if (!check_range_has_page(inode, lockstart, lockend)) break; - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state); } btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend); @@ -2501,7 +2601,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) u64 lockend; u64 tail_start; u64 tail_len; - u64 orig_start = offset; + const u64 orig_start = offset; + const u64 orig_end = offset + len - 1; int ret = 0; bool same_block; u64 ino_size; @@ -2533,18 +2634,14 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset)) == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)); /* - * We needn't truncate any block which is beyond the end of the file - * because we are sure there is no data there. - */ - /* * Only do this if we are in the same block and we aren't doing the * entire block. 
*/ if (same_block && len < fs_info->sectorsize) { if (offset < ino_size) { truncated_block = true; - ret = btrfs_truncate_block(BTRFS_I(inode), offset, len, - 0); + ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, + orig_start, orig_end); } else { ret = 0; } @@ -2554,7 +2651,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) /* zero back part of the first block */ if (offset < ino_size) { truncated_block = true; - ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), offset, orig_start, orig_end); if (ret) { btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; @@ -2591,8 +2688,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) if (tail_start + tail_len < ino_size) { truncated_block = true; ret = btrfs_truncate_block(BTRFS_I(inode), - tail_start + tail_len, - 0, 1); + tail_start + tail_len - 1, + orig_start, orig_end); if (ret) goto out_only_mutex; } @@ -2626,8 +2723,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out: - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); out_only_mutex: if (!updated_inode && truncated_block && !ret) { /* @@ -2745,7 +2842,7 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, else ret = RANGE_BOUNDARY_WRITTEN_EXTENT; - free_extent_map(em); + btrfs_free_extent_map(em); return ret; } @@ -2760,6 +2857,8 @@ static int btrfs_zero_range(struct inode *inode, int ret; u64 alloc_hint = 0; const u64 sectorsize = fs_info->sectorsize; + const u64 orig_start = offset; + const u64 orig_end = offset + len - 1; u64 alloc_start = round_down(offset, sectorsize); u64 alloc_end = round_up(offset + len, sectorsize); u64 bytes_to_reserve = 0; @@ -2789,7 +2888,7 @@ static int btrfs_zero_range(struct inode *inode, * do nothing except updating the inode's i_size if * needed. 
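After this change btrfs_truncate_block() takes a byte within the target block plus the bounds of the original operation (orig_start/orig_end); the punch-hole path uses it to zero only the partial head and tail blocks, while the whole blocks in between become a hole. A rough userspace sketch of that head/middle/tail split (4K blocks and all names assumed for illustration):

#include <stdint.h>
#include <stdio.h>

#define BLOCKSIZE 4096ull

static uint64_t round_up_u64(uint64_t x, uint64_t a)
{
	return (x + a - 1) & ~(a - 1);
}

static uint64_t round_down_u64(uint64_t x, uint64_t a)
{
	return x & ~(a - 1);
}

int main(void)
{
	uint64_t offset = 5000, len = 10000;	/* punch [5000, 15000) */
	uint64_t hole_start = round_up_u64(offset, BLOCKSIZE);		/* 8192 */
	uint64_t hole_end = round_down_u64(offset + len, BLOCKSIZE);	/* 12288 */

	if (hole_start >= hole_end) {
		/* Same-block case: nothing can be dropped, zero it all. */
		printf("zero [%llu, %llu) in place\n",
		       (unsigned long long)offset,
		       (unsigned long long)(offset + len));
		return 0;
	}
	printf("zero head [%llu, %llu), drop blocks [%llu, %llu), zero tail [%llu, %llu)\n",
	       (unsigned long long)offset, (unsigned long long)hole_start,
	       (unsigned long long)hole_start, (unsigned long long)hole_end,
	       (unsigned long long)hole_end, (unsigned long long)(offset + len));
	return 0;
}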
*/ - free_extent_map(em); + btrfs_free_extent_map(em); ret = btrfs_fallocate_update_isize(inode, offset + len, mode); goto out; @@ -2802,9 +2901,9 @@ static int btrfs_zero_range(struct inode *inode, ASSERT(IS_ALIGNED(alloc_start, sectorsize)); len = offset + len - alloc_start; offset = alloc_start; - alloc_hint = extent_map_block_start(em) + em->len; + alloc_hint = btrfs_extent_map_block_start(em) + em->len; } - free_extent_map(em); + btrfs_free_extent_map(em); if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { @@ -2815,22 +2914,22 @@ static int btrfs_zero_range(struct inode *inode, } if (em->flags & EXTENT_FLAG_PREALLOC) { - free_extent_map(em); + btrfs_free_extent_map(em); ret = btrfs_fallocate_update_isize(inode, offset + len, mode); goto out; } if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) { - free_extent_map(em); - ret = btrfs_truncate_block(BTRFS_I(inode), offset, len, - 0); + btrfs_free_extent_map(em); + ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, + orig_start, orig_end); if (!ret) ret = btrfs_fallocate_update_isize(inode, offset + len, mode); return ret; } - free_extent_map(em); + btrfs_free_extent_map(em); alloc_start = round_down(offset, sectorsize); alloc_end = alloc_start + sectorsize; goto reserve_space; @@ -2854,7 +2953,8 @@ static int btrfs_zero_range(struct inode *inode, alloc_start = round_down(offset, sectorsize); ret = 0; } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { - ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), offset, + orig_start, orig_end); if (ret) goto out; } else { @@ -2871,8 +2971,8 @@ static int btrfs_zero_range(struct inode *inode, alloc_end = round_up(offset + len, sectorsize); ret = 0; } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { - ret = btrfs_truncate_block(BTRFS_I(inode), offset + len, - 0, 1); + ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, + orig_start, orig_end); if (ret) goto out; } else { @@ -2897,16 +2997,16 @@ reserve_space: ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, alloc_start, bytes_to_reserve); if (ret) { - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state); goto out; } ret = btrfs_prealloc_file_range(inode, mode, alloc_start, alloc_end - alloc_start, fs_info->sectorsize, offset + len, &alloc_hint); - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); /* btrfs_prealloc_file_range releases reserved space on error */ if (ret) { space_reserved = false; @@ -2992,7 +3092,8 @@ static long btrfs_fallocate(struct file *file, int mode, * need to zero out the end of the block if i_size lands in the * middle of a block. 
*/ - ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, + inode->i_size, (u64)-1); if (ret) goto out; } @@ -3017,8 +3118,8 @@ static long btrfs_fallocate(struct file *file, int mode, } locked_end = alloc_end - 1; - lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state); + btrfs_lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state); btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end); @@ -3030,8 +3131,8 @@ static long btrfs_fallocate(struct file *file, int mode, ret = PTR_ERR(em); break; } - last_byte = min(extent_map_end(em), alloc_end); - actual_end = min_t(u64, extent_map_end(em), offset + len); + last_byte = min(btrfs_extent_map_end(em), alloc_end); + actual_end = min_t(u64, btrfs_extent_map_end(em), offset + len); last_byte = ALIGN(last_byte, blocksize); if (em->disk_bytenr == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && @@ -3040,19 +3141,19 @@ static long btrfs_fallocate(struct file *file, int mode, ret = add_falloc_range(&reserve_list, cur_offset, range_len); if (ret < 0) { - free_extent_map(em); + btrfs_free_extent_map(em); break; } ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, cur_offset, range_len); if (ret < 0) { - free_extent_map(em); + btrfs_free_extent_map(em); break; } qgroup_reserved += range_len; data_space_needed += range_len; } - free_extent_map(em); + btrfs_free_extent_map(em); cur_offset = last_byte; } @@ -3106,8 +3207,8 @@ static long btrfs_fallocate(struct file *file, int mode, */ ret = btrfs_fallocate_update_isize(inode, actual_end, mode); out_unlock: - unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state); out: btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); extent_changeset_free(data_reserved); @@ -3141,10 +3242,10 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end if (inode->delalloc_bytes > 0) { spin_unlock(&inode->lock); *delalloc_start_ret = start; - delalloc_len = count_range_bits(&inode->io_tree, - delalloc_start_ret, end, - len, EXTENT_DELALLOC, 1, - cached_state); + delalloc_len = btrfs_count_range_bits(&inode->io_tree, + delalloc_start_ret, end, + len, EXTENT_DELALLOC, 1, + cached_state); } else { spin_unlock(&inode->lock); } @@ -3453,7 +3554,7 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) last_extent_end = lockstart; - lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { @@ -3599,7 +3700,7 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) } out: - unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); btrfs_free_path(path); if (ret < 0) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 05e173311c1a..4b34ea1f01c2 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -308,8 +308,9 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, bool locked = false; if (block_group) { - struct btrfs_path *path = btrfs_alloc_path(); + BTRFS_PATH_AUTO_FREE(path); + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto fail; @@ -330,13 +331,12 @@ int 
btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_CLEAR; spin_unlock(&block_group->lock); - btrfs_free_path(path); } btrfs_i_size_write(inode, 0); truncate_pagecache(vfs_inode, 0); - lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); + btrfs_lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); /* @@ -348,7 +348,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); btrfs_inode_safe_disk_i_size_write(inode, control.last_size); - unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); + btrfs_unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); if (ret) goto fail; @@ -457,7 +457,7 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) mask); if (IS_ERR(folio)) { io_ctl_drop_pages(io_ctl); - return -ENOMEM; + return PTR_ERR(folio); } ret = set_folio_extent_mapped(folio); @@ -1080,9 +1080,8 @@ int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) { - cluster = list_entry(block_group->cluster_list.next, - struct btrfs_free_cluster, - block_group_list); + cluster = list_first_entry(&block_group->cluster_list, + struct btrfs_free_cluster, block_group_list); } if (!node && cluster) { @@ -1160,8 +1159,8 @@ update_cache_item(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DELALLOC, NULL); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DELALLOC, NULL); goto fail; } leaf = path->nodes[0]; @@ -1172,9 +1171,9 @@ update_cache_item(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || found_key.offset != offset) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, - inode->i_size - 1, EXTENT_DELALLOC, - NULL); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, + inode->i_size - 1, EXTENT_DELALLOC, + NULL); btrfs_release_path(path); goto fail; } @@ -1219,9 +1218,9 @@ static noinline_for_stack int write_pinned_extent_entries( start = block_group->start; while (start < block_group->start + block_group->length) { - if (!find_first_extent_bit(unpin, start, - &extent_start, &extent_end, - EXTENT_DIRTY, NULL)) + if (!btrfs_find_first_extent_bit(unpin, start, + &extent_start, &extent_end, + EXTENT_DIRTY, NULL)) return 0; /* This pinned extent is out of our range */ @@ -1267,8 +1266,8 @@ static int flush_dirty_cache(struct inode *inode) ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret) - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DELALLOC, NULL); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DELALLOC, NULL); return ret; } @@ -1288,8 +1287,8 @@ cleanup_write_cache_enospc(struct inode *inode, struct extent_state **cached_state) { io_ctl_drop_pages(io_ctl); - unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + cached_state); } static int __btrfs_wait_cache_io(struct btrfs_root *root, @@ -1414,8 +1413,8 @@ static int __btrfs_write_out_cache(struct inode *inode, if (ret) goto out_unlock; - 
lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - &cached_state); + btrfs_lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + &cached_state); io_ctl_set_generation(io_ctl, trans->transid); @@ -1475,8 +1474,8 @@ static int __btrfs_write_out_cache(struct inode *inode, io_ctl_drop_pages(io_ctl); io_ctl_free(io_ctl); - unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + &cached_state); /* * at this point the pages are under IO and we're happy, @@ -2342,9 +2341,8 @@ again: struct rb_node *node; struct btrfs_free_space *entry; - cluster = list_entry(block_group->cluster_list.next, - struct btrfs_free_cluster, - block_group_list); + cluster = list_first_entry(&block_group->cluster_list, + struct btrfs_free_cluster, block_group_list); spin_lock(&cluster->lock); node = rb_first(&cluster->root); if (!node) { diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 39c6b96a4c25..0c573d46639a 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -117,7 +117,7 @@ struct btrfs_free_space_info *search_free_space_info( if (ret != 0) { btrfs_warn(fs_info, "missing free space info for %llu", block_group->start); - ASSERT(0); + DEBUG_WARN(); return ERR_PTR(-ENOENT); } @@ -141,12 +141,12 @@ static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans, return ret; if (ret == 0) { - ASSERT(0); + DEBUG_WARN(); return -EIO; } if (p->slots[0] == 0) { - ASSERT(0); + DEBUG_WARN("no previous slot found"); return -EIO; } p->slots[0]--; @@ -223,6 +223,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, bitmap = alloc_bitmap(bitmap_size); if (!bitmap) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } @@ -235,8 +236,10 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } leaf = path->nodes[0]; nr = 0; @@ -271,14 +274,17 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } btrfs_release_path(path); } info = search_free_space_info(trans, block_group, path, 1); if (IS_ERR(info)) { ret = PTR_ERR(info); + btrfs_abort_transaction(trans, ret); goto out; } leaf = path->nodes[0]; @@ -293,8 +299,8 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, expected_extent_count); - ASSERT(0); ret = -EIO; + btrfs_abort_transaction(trans, ret); goto out; } @@ -315,8 +321,10 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, &key, data_size); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } leaf = path->nodes[0]; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); @@ -331,8 +339,6 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, ret = 0; out: kvfree(bitmap); - if (ret) - btrfs_abort_transaction(trans, ret); return ret; } @@ -358,6 +364,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, bitmap = alloc_bitmap(bitmap_size); if (!bitmap) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } @@ -370,8 +377,10 @@ int 
convert_free_space_to_extents(struct btrfs_trans_handle *trans, while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } leaf = path->nodes[0]; nr = 0; @@ -412,14 +421,17 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } btrfs_release_path(path); } info = search_free_space_info(trans, block_group, path, 1); if (IS_ERR(info)) { ret = PTR_ERR(info); + btrfs_abort_transaction(trans, ret); goto out; } leaf = path->nodes[0]; @@ -441,8 +453,10 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, key.offset = (end_bit - start_bit) * block_group->fs_info->sectorsize; ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } btrfs_release_path(path); extent_count++; @@ -455,16 +469,14 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, expected_extent_count); - ASSERT(0); ret = -EIO; + btrfs_abort_transaction(trans, ret); goto out; } ret = 0; out: kvfree(bitmap); - if (ret) - btrfs_abort_transaction(trans, ret); return ret; } @@ -838,13 +850,15 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } block_group = btrfs_lookup_block_group(trans->fs_info, start); if (!block_group) { - ASSERT(0); + DEBUG_WARN("no block group found for start=%llu", start); ret = -ENOENT; + btrfs_abort_transaction(trans, ret); goto out; } @@ -852,12 +866,12 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans, ret = __remove_from_free_space_tree(trans, block_group, path, start, size); mutex_unlock(&block_group->free_space_lock); + if (ret) + btrfs_abort_transaction(trans, ret); btrfs_put_block_group(block_group); out: btrfs_free_path(path); - if (ret) - btrfs_abort_transaction(trans, ret); return ret; } @@ -1031,25 +1045,27 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } block_group = btrfs_lookup_block_group(trans->fs_info, start); if (!block_group) { - ASSERT(0); + DEBUG_WARN("no block group found for start=%llu", start); ret = -ENOENT; + btrfs_abort_transaction(trans, ret); goto out; } mutex_lock(&block_group->free_space_lock); ret = __add_to_free_space_tree(trans, block_group, path, start, size); mutex_unlock(&block_group->free_space_lock); + if (ret) + btrfs_abort_transaction(trans, ret); btrfs_put_block_group(block_group); out: btrfs_free_path(path); - if (ret) - btrfs_abort_transaction(trans, ret); return ret; } @@ -1555,7 +1571,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, expected_extent_count); - ASSERT(0); + DEBUG_WARN(); ret = -EIO; goto out; } @@ -1619,7 +1635,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, expected_extent_count); - ASSERT(0); + DEBUG_WARN(); ret = -EIO; goto out; } diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 
bcca43046064..4394de12a767 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -300,6 +300,7 @@ enum { #define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) +#define BTRFS_WARNING_COMMIT_INTERVAL (300) #define BTRFS_DEFAULT_MAX_INLINE (2048) struct btrfs_dev_replace { @@ -471,6 +472,8 @@ struct btrfs_fs_info { struct btrfs_block_rsv delayed_block_rsv; /* Block reservation for delayed refs */ struct btrfs_block_rsv delayed_refs_rsv; + /* Block reservation for treelog tree */ + struct btrfs_block_rsv treelog_rsv; struct btrfs_block_rsv empty_block_rsv; @@ -776,10 +779,8 @@ struct btrfs_fs_info { struct btrfs_delayed_root *delayed_root; - /* Extent buffer radix tree */ - spinlock_t buffer_lock; /* Entries are eb->start / sectorsize */ - struct radix_tree_root buffer_radix; + struct xarray buffer_tree; /* Next backup root to be overwritten */ int backup_root_index; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 3530de0618c8..a61c3540d67b 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -109,7 +109,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, u64 inode_objectid, u64 ref_objectid, u64 *index) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_inode_extref *extref; struct extent_buffer *leaf; @@ -129,9 +129,9 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) - ret = -ENOENT; + return -ENOENT; if (ret < 0) - goto out; + return ret; /* * Sanity check - did we find the right item for this name? @@ -142,8 +142,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, ref_objectid, name); if (!extref) { btrfs_abort_transaction(trans, -ENOENT); - ret = -ENOENT; - goto out; + return -ENOENT; } leaf = path->nodes[0]; @@ -152,12 +151,8 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, *index = btrfs_inode_extref_index(leaf, extref); if (del_len == item_size) { - /* - * Common case only one ref in the item, remove the - * whole item. - */ - ret = btrfs_del_item(trans, root, path); - goto out; + /* Common case only one ref in the item, remove the whole item. 
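The BTRFS_PATH_AUTO_FREE() conversions in this hunk replace a manual btrfs_free_path() on every exit path with scope-based cleanup. A minimal self-contained illustration of the underlying GCC/Clang cleanup attribute; the macro and names below are invented for the example and are not the kernel's:

#include <stdio.h>
#include <stdlib.h>

/* Called automatically when the annotated variable leaves scope. */
static void free_buf(char **p)
{
	free(*p);	/* free(NULL) is a no-op, so early returns are safe */
}

#define AUTO_FREE_BUF(name) \
	char *name __attribute__((cleanup(free_buf))) = NULL

int main(void)
{
	AUTO_FREE_BUF(buf);

	buf = malloc(32);
	if (!buf)
		return 1;	/* no leak: cleanup still runs on this path */
	snprintf(buf, 32, "hello");
	printf("%s\n", buf);
	return 0;		/* free_buf(&buf) runs automatically here */
}

The win is the same as in the converted btrfs functions: every early return drops the "goto out; btrfs_free_path(path);" boilerplate.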
*/ + return btrfs_del_item(trans, root, path); } ptr = (unsigned long)extref; @@ -168,9 +163,6 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, btrfs_truncate_item(trans, path, item_size - del_len, 1); -out: - btrfs_free_path(path); - return ret; } @@ -260,7 +252,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, int ret; int ins_len = name->len + sizeof(*extref); unsigned long ptr; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *leaf; @@ -279,13 +271,13 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, path->slots[0], ref_objectid, name)) - goto out; + return ret; btrfs_extend_item(trans, path, ins_len); ret = 0; } if (ret < 0) - goto out; + return ret; leaf = path->nodes[0]; ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char); @@ -298,9 +290,8 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, ptr = (unsigned long)&extref->name; write_extent_buffer(path->nodes[0], name->name, ptr, name->len); -out: - btrfs_free_path(path); - return ret; + + return 0; } /* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cc67d1a2d611..c0c778243bf1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -686,12 +686,12 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, if (!can_cow_file_range_inline(inode, offset, size, compressed_size)) return 1; - lock_extent(&inode->io_tree, offset, end, &cached); + btrfs_lock_extent(&inode->io_tree, offset, end, &cached); ret = __cow_file_range_inline(inode, size, compressed_size, compress_type, compressed_folio, update_i_size); if (ret > 0) { - unlock_extent(&inode->io_tree, offset, end, &cached); + btrfs_unlock_extent(&inode->io_tree, offset, end, &cached); return ret; } @@ -777,26 +777,9 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, struct btrfs_fs_info *fs_info = inode->root->fs_info; if (!btrfs_inode_can_compress(inode)) { - WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), - KERN_ERR "BTRFS: unexpected compression for ino %llu\n", - btrfs_ino(inode)); + DEBUG_WARN("BTRFS: unexpected compression for ino %llu", btrfs_ino(inode)); return 0; } - /* - * Only enable sector perfect compression for experimental builds. - * - * This is a big feature change for subpage cases, and can hit - * different corner cases, so only limit this feature for - * experimental build for now. - * - * ETA for moving this out of experimental builds is 6.15. - */ - if (fs_info->sectorsize < PAGE_SIZE && - !IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { - if (!PAGE_ALIGNED(start) || - !PAGE_ALIGNED(end + 1)) - return 0; - } /* force compress */ if (btrfs_test_opt(fs_info, FORCE_COMPRESS)) @@ -1109,6 +1092,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, struct extent_state *cached = NULL; struct extent_map *em; int ret = 0; + bool free_pages = false; u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; @@ -1129,7 +1113,10 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, } if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { + ASSERT(!async_extent->folios); + ASSERT(async_extent->nr_folios == 0); submit_uncompressed_range(inode, async_extent, locked_folio); + free_pages = true; goto done; } @@ -1145,10 +1132,11 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, * fall back to uncompressed. 
*/ submit_uncompressed_range(inode, async_extent, locked_folio); + free_pages = true; goto done; } - lock_extent(io_tree, start, end, &cached); + btrfs_lock_extent(io_tree, start, end, &cached); /* Here we're doing allocation and writeback of the compressed pages */ file_extent.disk_bytenr = ins.objectid; @@ -1163,10 +1151,10 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, ret = PTR_ERR(em); goto out_free_reserve; } - free_extent_map(em); + btrfs_free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, - 1 << BTRFS_ORDERED_COMPRESSED); + 1U << BTRFS_ORDERED_COMPRESSED); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); @@ -1186,12 +1174,14 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, done: if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); + if (free_pages) + free_async_extent_pages(async_extent); kfree(async_extent); return; out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); mapping_set_error(inode->vfs_inode.i_mapping, -EIO); extent_clear_unlock_delalloc(inode, start, end, NULL, &cached, @@ -1218,7 +1208,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, u64 alloc_hint = 0; read_lock(&em_tree->lock); - em = search_extent_mapping(em_tree, start, num_bytes); + em = btrfs_search_extent_mapping(em_tree, start, num_bytes); if (em) { /* * if block start isn't an actual block number then find the @@ -1226,15 +1216,15 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * block is also bogus then just don't worry about it. */ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { - free_extent_map(em); - em = search_extent_mapping(em_tree, 0, 0); + btrfs_free_extent_map(em); + em = btrfs_search_extent_mapping(em_tree, 0, 0); if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE) - alloc_hint = extent_map_block_start(em); + alloc_hint = btrfs_extent_map_block_start(em); if (em) - free_extent_map(em); + btrfs_free_extent_map(em); } else { - alloc_hint = extent_map_block_start(em); - free_extent_map(em); + alloc_hint = btrfs_extent_map_block_start(em); + btrfs_free_extent_map(em); } } read_unlock(&em_tree->lock); @@ -1397,24 +1387,24 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * Locked range will be released either during error clean up or * after the whole range is finished. 
*/ - lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, - &cached); + btrfs_lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, + &cached); em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_REGULAR); if (IS_ERR(em)) { - unlock_extent(&inode->io_tree, start, - start + cur_alloc_size - 1, &cached); + btrfs_unlock_extent(&inode->io_tree, start, + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(em); goto out_reserve; } - free_extent_map(em); + btrfs_free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, - 1 << BTRFS_ORDERED_REGULAR); + 1U << BTRFS_ORDERED_REGULAR); if (IS_ERR(ordered)) { - unlock_extent(&inode->io_tree, start, - start + cur_alloc_size - 1, &cached); + btrfs_unlock_extent(&inode->io_tree, start, + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(ordered); goto out_drop_extent_cache; } @@ -1469,7 +1459,7 @@ out_drop_extent_cache: btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); out_unlock: /* * Now, we have three regions to clean up: @@ -1578,8 +1568,8 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_ PAGE_SHIFT; while (!list_empty(&async_chunk->extents)) { - async_extent = list_entry(async_chunk->extents.next, - struct async_extent, list); + async_extent = list_first_entry(&async_chunk->extents, + struct async_extent, list); list_del(&async_extent->list); submit_one_async_extent(async_chunk, async_extent, &alloc_hint); } @@ -1749,9 +1739,9 @@ static int fallback_to_cow(struct btrfs_inode *inode, * group that contains that extent to RO mode and therefore force COW * when starting writeback. 
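Several hunks in this patch switch flag masks from 1 << bit to 1U << bit (e.g. the BTRFS_ORDERED_* type bits above). A standalone note on why the unsigned form is preferred, assuming a 32-bit int:

#include <stdio.h>

int main(void)
{
	/*
	 * Shifting a signed 1 into the sign bit (1 << 31) is undefined
	 * behaviour in C; the unsigned form is well defined for any bit
	 * position 0..31, so flag masks are built with 1U << bit even
	 * when today's bit positions are small.
	 */
	unsigned int ok = 1U << 31;	/* well defined: 0x80000000 */

	/* int bad = 1 << 31; -- undefined behaviour with 32-bit int */
	printf("0x%x\n", ok);
	return 0;
}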
*/ - lock_extent(io_tree, start, end, &cached_state); - count = count_range_bits(io_tree, &range_start, end, range_bytes, - EXTENT_NORESERVE, 0, NULL); + btrfs_lock_extent(io_tree, start, end, &cached_state); + count = btrfs_count_range_bits(io_tree, &range_start, end, range_bytes, + EXTENT_NORESERVE, 0, NULL); if (count > 0 || is_space_ino || is_reloc_ino) { u64 bytes = count; struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1765,10 +1755,9 @@ static int fallback_to_cow(struct btrfs_inode *inode, spin_unlock(&sinfo->lock); if (count > 0) - clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, - NULL); + btrfs_clear_extent_bits(io_tree, start, end, EXTENT_NORESERVE); } - unlock_extent(io_tree, start, end, &cached_state); + btrfs_unlock_extent(io_tree, start, end, &cached_state); /* * Don't try to create inline extents, as a mix of inline extent that @@ -1976,7 +1965,7 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio u64 end = file_pos + len - 1; int ret = 0; - lock_extent(&inode->io_tree, file_pos, end, cached); + btrfs_lock_extent(&inode->io_tree, file_pos, end, cached); if (is_prealloc) { struct extent_map *em; @@ -1984,20 +1973,20 @@ static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent, BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { - unlock_extent(&inode->io_tree, file_pos, end, cached); + btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached); return PTR_ERR(em); } - free_extent_map(em); + btrfs_free_extent_map(em); } ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent, is_prealloc - ? (1 << BTRFS_ORDERED_PREALLOC) - : (1 << BTRFS_ORDERED_NOCOW)); + ? (1U << BTRFS_ORDERED_PREALLOC) + : (1U << BTRFS_ORDERED_NOCOW)); if (IS_ERR(ordered)) { if (is_prealloc) btrfs_drop_extent_map_range(inode, file_pos, end, false); - unlock_extent(&inode->io_tree, file_pos, end, cached); + btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached); return PTR_ERR(ordered); } @@ -2129,12 +2118,13 @@ next_slot: /* * If the found extent starts after requested offset, then - * adjust extent_end to be right before this extent begins + * adjust cur_offset to be right before this extent begins. 
*/ if (found_key.offset > cur_offset) { - extent_end = found_key.offset; - extent_type = 0; - goto must_cow; + if (cow_start == (u64)-1) + cow_start = cur_offset; + cur_offset = found_key.offset; + goto next_slot; } /* @@ -2295,7 +2285,7 @@ error: if (cur_offset < end) { struct extent_state *cached = NULL; - lock_extent(&inode->io_tree, cur_offset, end, &cached); + btrfs_lock_extent(&inode->io_tree, cur_offset, end, &cached); extent_clear_unlock_delalloc(inode, cur_offset, end, locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC | @@ -2317,7 +2307,7 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) { if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) { if (inode->defrag_bytes && - test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG)) + btrfs_test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG)) return false; return true; } @@ -2606,7 +2596,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, !btrfs_is_free_space_inode(inode) && !(state->state & EXTENT_NORESERVE) && (bits & EXTENT_CLEAR_DATA_RESV)) - btrfs_free_reserved_data_space_noquota(fs_info, len); + btrfs_free_reserved_data_space_noquota(inode, len); percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, fs_info->delalloc_batch); @@ -2690,12 +2680,12 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, if (em_len > search_len) em_len = search_len; - ret = set_extent_bit(&inode->io_tree, search_start, - search_start + em_len - 1, - EXTENT_DELALLOC_NEW, cached_state); + ret = btrfs_set_extent_bit(&inode->io_tree, search_start, + search_start + em_len - 1, + EXTENT_DELALLOC_NEW, cached_state); next: - search_start = extent_map_end(em); - free_extent_map(em); + search_start = btrfs_extent_map_end(em); + btrfs_free_extent_map(em); if (ret) return ret; } @@ -2725,8 +2715,8 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, return ret; } - return set_extent_bit(&inode->io_tree, start, end, - EXTENT_DELALLOC | extra_bits, cached_state); + return btrfs_set_extent_bit(&inode->io_tree, start, end, + EXTENT_DELALLOC | extra_bits, cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ @@ -2801,7 +2791,7 @@ again: if (ret) goto out_page; - lock_extent(&inode->io_tree, page_start, page_end, &cached_state); + btrfs_lock_extent(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ if (folio_test_ordered(folio)) @@ -2809,8 +2799,8 @@ again: ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); if (ordered) { - unlock_extent(&inode->io_tree, page_start, page_end, - &cached_state); + btrfs_unlock_extent(&inode->io_tree, page_start, page_end, + &cached_state); folio_unlock(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); @@ -2836,7 +2826,7 @@ out_reserved: if (free_delalloc_space) btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); - unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); out_page: if (ret) { /* @@ -2888,7 +2878,7 @@ int btrfs_writepage_cow_fixup(struct folio *folio) * We should not hit such out-of-band dirty folios anymore. 
*/ if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); btrfs_err_rl(fs_info, "root %lld ino %llu folio %llu is marked dirty without notifying the fs", BTRFS_I(inode)->root->root_key.objectid, @@ -2937,7 +2927,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; const u64 sectorsize = root->fs_info->sectorsize; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key ins; u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); @@ -3019,8 +3009,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, file_pos - offset, qgroup_reserved, &ins); out: - btrfs_free_path(path); - return ret; } @@ -3136,8 +3124,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) * depending on their current state). */ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { - clear_bits |= EXTENT_LOCKED; - lock_extent(io_tree, start, end, &cached_state); + clear_bits |= EXTENT_LOCKED | EXTENT_FINISHING_ORDERED; + btrfs_lock_extent_bits(io_tree, start, end, + EXTENT_LOCKED | EXTENT_FINISHING_ORDERED, + &cached_state); } if (freespace_inode) @@ -3201,8 +3191,8 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } - ret = unpin_extent_cache(inode, ordered_extent->file_offset, - ordered_extent->num_bytes, trans->transid); + ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset, + ordered_extent->num_bytes, trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; @@ -3221,9 +3211,9 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) */ if ((clear_bits & EXTENT_DELALLOC_NEW) && !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) - clear_extent_bit(&inode->io_tree, start, end, - EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, - &cached_state); + btrfs_clear_extent_bit(&inode->io_tree, start, end, + EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, + &cached_state); btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, inode); @@ -3232,15 +3222,13 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } out: - clear_extent_bit(&inode->io_tree, start, end, clear_bits, - &cached_state); + btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, + &cached_state); if (trans) btrfs_end_transaction(trans); if (ret || truncated) { - u64 unwritten_start = start; - /* * If we failed to finish this ordered extent for any reason we * need to make sure BTRFS_ORDERED_IOERR is set on the ordered @@ -3252,10 +3240,6 @@ out: if (ret) btrfs_mark_ordered_extent_error(ordered_extent); - if (truncated) - unwritten_start += logical_len; - clear_extent_uptodate(io_tree, unwritten_start, end, NULL); - /* * Drop extent maps for the part of the extent we didn't write. * @@ -3270,9 +3254,15 @@ out: * we don't mess with the extent map tree in the NOCOW case, but * for now simply skip this if we are the free space inode. 
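insert_reserved_file_extent() above is one of many functions in this series switching from a bare struct btrfs_path pointer, freed with btrfs_free_path() at an exit label, to BTRFS_PATH_AUTO_FREE(), which frees the path when it leaves scope. This builds on the kernel's scope-based cleanup helpers from <linux/cleanup.h>, which in turn rely on the compiler's cleanup attribute. A user-space sketch of the underlying mechanism, with a hypothetical path_buf type standing in for struct btrfs_path:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for struct btrfs_path. */
struct path_buf {
	char *data;
};

/* Called automatically with a pointer to the going-out-of-scope variable. */
static void path_buf_free(struct path_buf **p)
{
	if (*p) {
		free((*p)->data);
		free(*p);
		printf("path freed on scope exit\n");
	}
}

/* Rough analogue of BTRFS_PATH_AUTO_FREE(name). */
#define PATH_AUTO_FREE(name) \
	struct path_buf *name __attribute__((cleanup(path_buf_free))) = NULL

static int use_path(void)
{
	PATH_AUTO_FREE(path);

	path = calloc(1, sizeof(*path));
	if (!path)
		return -1;

	path->data = malloc(64);
	if (!path->data)
		return -1;	/* no "goto out" needed, cleanup still runs */

	return 0;		/* ...and it runs on success too */
}

int main(void)
{
	return use_path() ? 1 : 0;
}

The effect is visible in the hunks above: error paths simply return, and the old out: label with its btrfs_free_path() call disappears.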
*/ - if (!btrfs_is_free_space_inode(inode)) + if (!btrfs_is_free_space_inode(inode)) { + u64 unwritten_start = start; + + if (truncated) + unwritten_start += logical_len; + btrfs_drop_extent_map_range(inode, unwritten_start, end, false); + } /* * If the ordered extent had an IOERR or something else went @@ -3299,7 +3289,7 @@ out: NULL); btrfs_free_reserved_extent(fs_info, ordered_extent->disk_bytenr, - ordered_extent->disk_num_bytes, 1); + ordered_extent->disk_num_bytes, true); /* * Actually free the qgroup rsv which was released when * the ordered extent was created. @@ -3336,20 +3326,16 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) /* * Verify the checksum for a single sector without any extra action that depend * on the type of I/O. + * + * @kaddr must be a properly kmapped address. */ -int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, - u32 pgoff, u8 *csum, const u8 * const csum_expected) +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum, + const u8 * const csum_expected) { SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - char *kaddr; - - ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE); shash->tfm = fs_info->csum_shash; - - kaddr = kmap_local_page(page) + pgoff; crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); - kunmap_local(kaddr); if (memcmp(csum, csum_expected, fs_info->csum_size)) return -EIO; @@ -3378,6 +3364,7 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u64 end = file_offset + bv->bv_len - 1; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; + void *kaddr; ASSERT(bv->bv_len == fs_info->sectorsize); @@ -3385,19 +3372,22 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, return true; if (btrfs_is_data_reloc_root(inode->root) && - test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, - NULL)) { + btrfs_test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, + NULL)) { /* Skip the range without csum for data reloc inode */ - clear_extent_bits(&inode->io_tree, file_offset, end, - EXTENT_NODATASUM); + btrfs_clear_extent_bits(&inode->io_tree, file_offset, end, + EXTENT_NODATASUM); return true; } csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * fs_info->csum_size; - if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, - csum_expected)) + kaddr = bvec_kmap_local(bv); + if (btrfs_check_sector_csum(fs_info, kaddr, csum, csum_expected)) { + kunmap_local(kaddr); goto zeroit; + } + kunmap_local(kaddr); return true; zeroit: @@ -3544,7 +3534,7 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans, int btrfs_orphan_cleanup(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key key, found_key; struct btrfs_trans_handle *trans; @@ -3734,19 +3724,22 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) out: if (ret) btrfs_err(fs_info, "could not do orphan cleanup %d", ret); - btrfs_free_path(path); return ret; } /* - * very simple check to peek ahead in the leaf looking for xattrs. If we - * don't find any xattrs, we know there can't be any acls. + * Look ahead in the leaf for xattrs. If we don't find any then we know there + * can't be any ACLs. 
* - * slot is the slot the inode is in, objectid is the objectid of the inode + * @leaf: the extent buffer leaf to search + * @slot: the slot the inode is in + * @objectid: the objectid of the inode + * + * Return true if there may be an xattr/ACL, false otherwise. */ -static noinline int acls_after_inode_item(struct extent_buffer *leaf, - int slot, u64 objectid, - int *first_xattr_slot) +static noinline bool acls_after_inode_item(struct extent_buffer *leaf, + int slot, u64 objectid, + int *first_xattr_slot) { u32 nritems = btrfs_header_nritems(leaf); struct btrfs_key found_key; @@ -3766,45 +3759,50 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, while (slot < nritems) { btrfs_item_key_to_cpu(leaf, &found_key, slot); - /* we found a different objectid, there must not be acls */ + /* We found a different objectid, there must be no ACLs. */ if (found_key.objectid != objectid) - return 0; + return false; - /* we found an xattr, assume we've got an acl */ + /* We found an xattr, assume we've got an ACL. */ if (found_key.type == BTRFS_XATTR_ITEM_KEY) { if (*first_xattr_slot == -1) *first_xattr_slot = slot; if (found_key.offset == xattr_access || found_key.offset == xattr_default) - return 1; + return true; } /* - * we found a key greater than an xattr key, there can't - * be any acls later on + * We found a key greater than an xattr key, there can't be any + * ACLs later on. */ if (found_key.type > BTRFS_XATTR_ITEM_KEY) - return 0; + return false; slot++; scanned++; /* - * it goes inode, inode backrefs, xattrs, extents, - * so if there are a ton of hard links to an inode there can - * be a lot of backrefs. Don't waste time searching too hard, - * this is just an optimization + * The item order goes like: + * - inode + * - inode backrefs + * - xattrs + * - extents, + * + * so if there are lots of hard links to an inode there can be + * a lot of backrefs. Don't waste time searching too hard, + * this is just an optimization. */ if (scanned >= 8) break; } - /* we hit the end of the leaf before we found an xattr or - * something larger than an xattr. We have to assume the inode - * has acls + /* + * We hit the end of the leaf before we found an xattr or something + * larger than an xattr. We have to assume the inode has ACLs. */ if (*first_xattr_slot == -1) *first_xattr_slot = slot; - return 1; + return true; } static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) @@ -3824,7 +3822,8 @@ static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) if (!inode->file_extent_tree) return -ENOMEM; - extent_io_tree_init(fs_info, inode->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); + btrfs_extent_io_tree_init(fs_info, inode->file_extent_tree, + IO_TREE_INODE_FILE_EXTENT); /* Lockdep class is set only for the file extent tree.
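The early exit in acls_after_inode_item() works because items of one inode are stored in key order (objectid, type, offset) and the relevant item types sort as inode item < inode refs < xattrs < file extents, so the first key with a higher type or a different objectid proves no xattr can follow. A user-space model of that ordering (the type constants match the on-disk values; everything else is simplified):

#include <stdio.h>
#include <stdint.h>

/* On-disk item type values (abridged). */
#define BTRFS_INODE_ITEM_KEY	1
#define BTRFS_INODE_REF_KEY	12
#define BTRFS_XATTR_ITEM_KEY	24
#define BTRFS_EXTENT_DATA_KEY	108

struct key {
	uint64_t objectid;
	uint8_t type;
	uint64_t offset;
};

/* Same ordering the in-kernel btrfs_comp_cpu_keys() implements. */
static int comp_keys(const struct key *a, const struct key *b)
{
	if (a->objectid != b->objectid)
		return a->objectid < b->objectid ? -1 : 1;
	if (a->type != b->type)
		return a->type < b->type ? -1 : 1;
	if (a->offset != b->offset)
		return a->offset < b->offset ? -1 : 1;
	return 0;
}

int main(void)
{
	struct key inode  = { 257, BTRFS_INODE_ITEM_KEY, 0 };
	struct key xattr  = { 257, BTRFS_XATTR_ITEM_KEY, 0x12345 };
	struct key extent = { 257, BTRFS_EXTENT_DATA_KEY, 0 };

	/* All xattrs of an inode sort after its inode item... */
	printf("%d\n", comp_keys(&inode, &xattr));	/* -1 */
	/* ...and before any of its file extents. */
	printf("%d\n", comp_keys(&xattr, &extent));	/* -1 */
	return 0;
}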
*/ lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class); @@ -4127,7 +4126,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { struct btrfs_inode_item *inode_item; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key key; int ret; @@ -4141,7 +4140,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, if (ret) { if (ret > 0) ret = -ENOENT; - goto failed; + return ret; } leaf = path->nodes[0]; @@ -4150,10 +4149,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); btrfs_set_inode_last_trans(trans, inode); - ret = 0; -failed: - btrfs_free_path(path); - return ret; + return 0; } /* @@ -4487,7 +4483,7 @@ out: static noinline int may_destroy_subvol(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *di; struct btrfs_key key; struct fscrypt_str name = FSTR_INIT("default", 7); @@ -4509,7 +4505,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) btrfs_err(fs_info, "deleting default subvolume %llu is not allowed", key.objectid); - goto out; + return ret; } btrfs_release_path(path); } @@ -4520,14 +4516,13 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. */ - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } ret = 0; @@ -4537,8 +4532,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY) ret = -ENOTEMPTY; } -out: - btrfs_free_path(path); + return ret; } @@ -4781,20 +4775,80 @@ out_notrans: return ret; } +static bool is_inside_block(u64 bytenr, u64 blockstart, u32 blocksize) +{ + ASSERT(IS_ALIGNED(blockstart, blocksize), "blockstart=%llu blocksize=%u", + blockstart, blocksize); + + if (blockstart <= bytenr && bytenr <= blockstart + blocksize - 1) + return true; + return false; +} + +static int truncate_block_zero_beyond_eof(struct btrfs_inode *inode, u64 start) +{ + const pgoff_t index = (start >> PAGE_SHIFT); + struct address_space *mapping = inode->vfs_inode.i_mapping; + struct folio *folio; + u64 zero_start; + u64 zero_end; + int ret = 0; + +again: + folio = filemap_lock_folio(mapping, index); + /* No folio present. */ + if (IS_ERR(folio)) + return 0; + + if (!folio_test_uptodate(folio)) { + ret = btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; + } + if (!folio_test_uptodate(folio)) { + ret = -EIO; + goto out_unlock; + } + } + folio_wait_writeback(folio); + + /* + * We do not need to lock extents nor wait for OE, as it's already + * beyond EOF. + */ + + zero_start = max_t(u64, folio_pos(folio), start); + zero_end = folio_pos(folio) + folio_size(folio) - 1; + folio_zero_range(folio, zero_start - folio_pos(folio), + zero_end - zero_start + 1); + +out_unlock: + folio_unlock(folio); + folio_put(folio); + return ret; +} + /* - * Read, zero a chunk and write a block. + * Handle the truncation of a fs block. 
* - * @inode - inode that we're zeroing - * @from - the offset to start zeroing - * @len - the length to zero, 0 to zero the entire range respective to the - * offset - * @front - zero up to the offset instead of from the offset on + * @inode - inode that we're zeroing + * @offset - the file offset of the block to truncate + * The value must be inside [@start, @end], and the function will do + * extra checks if the block that covers @offset needs to be zeroed. + * @start - the start file offset of the range we want to zero + * @end - the end (inclusive) file offset of the range we want to zero. * - * This will find the block for the "from" offset and cow the block and zero the - * part we want to zero. This is used with truncate and hole punching. + * If the range is not block aligned, read out the folio that covers @offset, + * and if needed zero blocks that are inside the folio and covered by [@start, @end]. + * If @start or @end + 1 lands inside a block, that block will be marked dirty + * for writeback. + * + * This is utilized by hole punch, zero range, file expansion. */ -int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, - int front) +int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; @@ -4804,20 +4858,56 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, struct extent_changeset *data_reserved = NULL; bool only_release_metadata = false; u32 blocksize = fs_info->sectorsize; - pgoff_t index = from >> PAGE_SHIFT; - unsigned offset = from & (blocksize - 1); + pgoff_t index = (offset >> PAGE_SHIFT); struct folio *folio; gfp_t mask = btrfs_alloc_write_mask(mapping); size_t write_bytes = blocksize; int ret = 0; + const bool in_head_block = is_inside_block(offset, round_down(start, blocksize), + blocksize); + const bool in_tail_block = is_inside_block(offset, round_down(end, blocksize), + blocksize); + bool need_truncate_head = false; + bool need_truncate_tail = false; + u64 zero_start; + u64 zero_end; u64 block_start; u64 block_end; - if (IS_ALIGNED(offset, blocksize) && - (!len || IS_ALIGNED(len, blocksize))) + /* @offset should be inside the range. */ + ASSERT(start <= offset && offset <= end, "offset=%llu start=%llu end=%llu", + offset, start, end); + + /* The range is aligned at both ends. */ + if (IS_ALIGNED(start, blocksize) && IS_ALIGNED(end + 1, blocksize)) { + /* + * For block size < page size case, we may have polluted blocks + * beyond EOF. So we also need to zero them out. + */ + if (end == (u64)-1 && blocksize < PAGE_SIZE) + ret = truncate_block_zero_beyond_eof(inode, start); goto out; + } - block_start = round_down(from, blocksize); + /* + * @offset may be inside neither the head nor the tail block. In that + * case we don't need to do anything. + */ + if (!in_head_block && !in_tail_block) + goto out; + + /* + * Skip the truncation if the range in the target block is already aligned. + * The seemingly complex check will also handle the same block case.
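A worked example of the new calling convention, assuming a 4K block size: punching a hole over [6144, 14335] gives a head block [4096, 8191] and a tail block [12288, 16383]. The call whose @offset falls in the head block zeroes [6144, 8191] (start is unaligned); the call for the tail block zeroes [12288, 14335] (end + 1 = 14336 is unaligned); the fully covered middle block needs no zeroing as the hole punch drops it entirely. A compile-checkable sketch of just this decision logic (macros reimplemented for user space, power-of-two sizes assumed):

#include <stdio.h>
#include <stdint.h>

#define BLOCKSIZE	4096ULL
#define IS_ALIGNED(x, a)	(((x) & ((a) - 1)) == 0)
#define round_down(x, a)	((x) & ~((a) - 1))

/* Mirrors the head/tail decision of btrfs_truncate_block(), simplified. */
static void decide(uint64_t offset, uint64_t start, uint64_t end)
{
	uint64_t block_start = round_down(offset, BLOCKSIZE);
	uint64_t block_end = block_start + BLOCKSIZE - 1;
	int in_head = round_down(start, BLOCKSIZE) == block_start;
	int in_tail = round_down(end, BLOCKSIZE) == block_start;
	int trunc_head = in_head && !IS_ALIGNED(start, BLOCKSIZE);
	int trunc_tail = in_tail && !IS_ALIGNED(end + 1, BLOCKSIZE);
	uint64_t zero_start, zero_end;

	if (!trunc_head && !trunc_tail) {
		printf("offset %llu: nothing to zero\n",
		       (unsigned long long)offset);
		return;
	}
	zero_start = block_start > start ? block_start : start;
	zero_end = block_end < end ? block_end : end;
	printf("offset %llu: zero [%llu, %llu]\n",
	       (unsigned long long)offset,
	       (unsigned long long)zero_start,
	       (unsigned long long)zero_end);
}

int main(void)
{
	/* Hole punch of [6144, 14335] with 4K blocks. */
	decide(6144, 6144, 14335);	/* head block: zero [6144, 8191] */
	decide(14335, 6144, 14335);	/* tail block: zero [12288, 14335] */
	decide(8192, 6144, 14335);	/* middle block: nothing to zero */
	return 0;
}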
+ */ + if (in_head_block && !IS_ALIGNED(start, blocksize)) + need_truncate_head = true; + if (in_tail_block && !IS_ALIGNED(end + 1, blocksize)) + need_truncate_tail = true; + if (!need_truncate_head && !need_truncate_tail) + goto out; + + block_start = round_down(offset, blocksize); block_end = block_start + blocksize - 1; ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, @@ -4841,10 +4931,13 @@ again: folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); if (IS_ERR(folio)) { - btrfs_delalloc_release_space(inode, data_reserved, block_start, - blocksize, true); + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, blocksize, true); + else + btrfs_delalloc_release_space(inode, data_reserved, + block_start, blocksize, true); btrfs_delalloc_release_extents(inode, blocksize); - ret = -ENOMEM; + ret = PTR_ERR(folio); goto out; } @@ -4874,11 +4967,11 @@ again: folio_wait_writeback(folio); - lock_extent(io_tree, block_start, block_end, &cached_state); + btrfs_lock_extent(io_tree, block_start, block_end, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { - unlock_extent(io_tree, block_start, block_end, &cached_state); + btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); folio_unlock(folio); folio_put(folio); btrfs_start_ordered_extent(ordered); @@ -4886,37 +4979,46 @@ again: goto again; } - clear_extent_bit(&inode->io_tree, block_start, block_end, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - &cached_state); + btrfs_clear_extent_bit(&inode->io_tree, block_start, block_end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + &cached_state); ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, &cached_state); if (ret) { - unlock_extent(io_tree, block_start, block_end, &cached_state); + btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); goto out_unlock; } - if (offset != blocksize) { - if (!len) - len = blocksize - offset; - if (front) - folio_zero_range(folio, block_start - folio_pos(folio), - offset); - else - folio_zero_range(folio, - (block_start - folio_pos(folio)) + offset, - len); + if (end == (u64)-1) { + /* + * We're truncating beyond EOF, the remaining blocks normally are + * already holes thus no need to zero again, but it's possible for + * fs block size < page size cases to have memory mapped writes + * to pollute ranges beyond EOF. + * + * In that case although such polluted blocks beyond EOF will + * not reach disk, it still affects our page caches. 
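For a concrete instance of the pollution described above (numbers invented for illustration): with 4K blocks on a 64K-page machine and i_size = 10K, the folio covering [0, 64K) stays in the page cache past EOF, and a memory-mapped store at offset 20K may dirty the block at [20K, 24K) even though that block lies beyond EOF and will never be written back. When truncating from EOF upward (end == (u64)-1), zero_start resolves to max(folio_pos, 10K) = 10K and zero_end to the folio's last byte, so the polluted blocks are zeroed in the page cache instead of keeping stale data.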
+ */ + zero_start = max_t(u64, folio_pos(folio), start); + zero_end = min_t(u64, folio_pos(folio) + folio_size(folio) - 1, + end); + } else { + zero_start = max_t(u64, block_start, start); + zero_end = min_t(u64, block_end, end); } + folio_zero_range(folio, zero_start - folio_pos(folio), + zero_end - zero_start + 1); + btrfs_folio_clear_checked(fs_info, folio, block_start, block_end + 1 - block_start); btrfs_folio_set_dirty(fs_info, folio, block_start, block_end + 1 - block_start); - unlock_extent(io_tree, block_start, block_end, &cached_state); + btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) - set_extent_bit(&inode->io_tree, block_start, block_end, - EXTENT_NORESERVE, NULL); + btrfs_set_extent_bit(&inode->io_tree, block_start, block_end, + EXTENT_NORESERVE, NULL); out_unlock: if (ret) { @@ -5009,7 +5111,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) * rest of the block before we expand the i_size, otherwise we could * expose stale data. */ - ret = btrfs_truncate_block(inode, oldsize, 0, 0); + ret = btrfs_truncate_block(inode, oldsize, oldsize, -1); if (ret) return ret; @@ -5026,7 +5128,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) em = NULL; break; } - last_byte = min(extent_map_end(em), block_end); + last_byte = min(btrfs_extent_map_end(em), block_end); last_byte = ALIGN(last_byte, fs_info->sectorsize); hole_size = last_byte - cur_offset; @@ -5042,7 +5144,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) if (ret) break; - hole_em = alloc_extent_map(); + hole_em = btrfs_alloc_extent_map(); if (!hole_em) { btrfs_drop_extent_map_range(inode, cur_offset, cur_offset + hole_size - 1, @@ -5059,7 +5161,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) hole_em->generation = btrfs_get_fs_generation(fs_info); ret = btrfs_replace_extent_map_range(inode, hole_em, true); - free_extent_map(hole_em); + btrfs_free_extent_map(hole_em); } else { ret = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); @@ -5067,14 +5169,14 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) break; } next: - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; cur_offset = last_byte; if (cur_offset >= block_end) break; } - free_extent_map(em); - unlock_extent(io_tree, hole_start, block_end - 1, &cached_state); + btrfs_free_extent_map(em); + btrfs_unlock_extent(io_tree, hole_start, block_end - 1, &cached_state); return ret; } @@ -5258,7 +5360,7 @@ static void evict_inode_truncate_pages(struct inode *inode) state_flags = state->state; spin_unlock(&io_tree->lock); - lock_extent(io_tree, start, end, &cached_state); + btrfs_lock_extent(io_tree, start, end, &cached_state); /* * If still has DELALLOC flag, the extent didn't reach disk, @@ -5272,9 +5374,9 @@ static void evict_inode_truncate_pages(struct inode *inode) btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, end - start + 1, NULL); - clear_extent_bit(io_tree, start, end, - EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, - &cached_state); + btrfs_clear_extent_bit(io_tree, start, end, + EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, + &cached_state); cond_resched(); spin_lock(&io_tree->lock); @@ -5460,7 +5562,7 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type) { struct btrfs_dir_item *di; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = 
dir->root; int ret = 0; struct fscrypt_name fname; @@ -5471,7 +5573,7 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); if (ret < 0) - goto out; + return ret; /* * fscrypt_setup_filename() should never return a positive value, but * gcc on sparc/parisc thinks it can, so assert that doesn't happen. @@ -5500,7 +5602,6 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, *type = btrfs_dir_ftype(path->nodes[0], di); out: fscrypt_free_filename(&fname); - btrfs_free_path(path); return ret; } @@ -5515,7 +5616,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, struct btrfs_key *location, struct btrfs_root **sub_root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *new_root; struct btrfs_root_ref *ref; struct extent_buffer *leaf; @@ -5571,7 +5672,6 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, location->offset = 0; err = 0; out: - btrfs_free_path(path); fscrypt_free_filename(&fname); return err; } @@ -5681,8 +5781,10 @@ struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root) return inode; path = btrfs_alloc_path(); - if (!path) + if (!path) { + iget_failed(&inode->vfs_inode); return ERR_PTR(-ENOMEM); + } ret = btrfs_read_locked_inode(inode, path); btrfs_free_path(path); @@ -5848,7 +5950,7 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; struct btrfs_key key, found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; int ret; @@ -5862,15 +5964,14 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; /* FIXME: we should be able to handle this */ if (ret == 0) - goto out; - ret = 0; + return ret; if (path->slots[0] == 0) { inode->index_cnt = BTRFS_DIR_START_INDEX; - goto out; + return 0; } path->slots[0]--; @@ -5881,13 +5982,12 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) if (found_key.objectid != btrfs_ino(inode) || found_key.type != BTRFS_DIR_INDEX_KEY) { inode->index_cnt = BTRFS_DIR_START_INDEX; - goto out; + return 0; } inode->index_cnt = found_key.offset + 1; -out: - btrfs_free_path(path); - return ret; + + return 0; } static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index) @@ -5990,7 +6090,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); void *addr; LIST_HEAD(ins_list); LIST_HEAD(del_list); @@ -6103,7 +6203,6 @@ nopos: err: if (put) btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list); - btrfs_free_path(path); return ret; } @@ -6893,18 +6992,18 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct extent_map_tree *em_tree = &inode->extent_tree; read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); + em = btrfs_lookup_extent_mapping(em_tree, start, len); read_unlock(&em_tree->lock); if (em) { if (em->start > start || em->start + em->len <= start) - free_extent_map(em); + btrfs_free_extent_map(em); else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio) - free_extent_map(em); + btrfs_free_extent_map(em); else goto out; } - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { ret = -ENOMEM; goto out; @@ 
-7041,7 +7140,7 @@ not_found: insert: ret = 0; btrfs_release_path(path); - if (em->start > start || extent_map_end(em) <= start) { + if (em->start > start || btrfs_extent_map_end(em) <= start) { btrfs_err(fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", em->start, em->len, start, len); @@ -7058,7 +7157,7 @@ out: trace_btrfs_get_extent(root, inode, em); if (ret) { - free_extent_map(em); + btrfs_free_extent_map(em); return ERR_PTR(ret); } return em; @@ -7102,7 +7201,7 @@ noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct can_nocow_file_extent_args nocow_args = { 0 }; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; struct extent_buffer *leaf; struct extent_io_tree *io_tree = &inode->io_tree; @@ -7118,13 +7217,12 @@ noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), offset, 0); if (ret < 0) - goto out; + return ret; if (ret == 1) { if (path->slots[0] == 0) { - /* can't find the item, must cow */ - ret = 0; - goto out; + /* Can't find the item, must COW. */ + return 0; } path->slots[0]--; } @@ -7133,17 +7231,17 @@ noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) { - /* not our file or wrong item type, must cow */ - goto out; + /* Not our file or wrong item type, must COW. */ + return 0; } if (key.offset > offset) { - /* Wrong offset, must cow */ - goto out; + /* Wrong offset, must COW. */ + return 0; } if (btrfs_file_extent_end(path) <= offset) - goto out; + return 0; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); found_type = btrfs_file_extent_type(leaf, fi); @@ -7158,15 +7256,13 @@ noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, if (ret != 1) { /* Treat errors as not being able to NOCOW. 
*/ - ret = 0; - goto out; + return 0; } - ret = 0; if (btrfs_extent_readonly(fs_info, nocow_args.file_extent.disk_bytenr + nocow_args.file_extent.offset)) - goto out; + return 0; if (!(inode->flags & BTRFS_INODE_NODATACOW) && found_type == BTRFS_FILE_EXTENT_PREALLOC) { @@ -7174,21 +7270,18 @@ noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, range_end = round_up(offset + nocow_args.file_extent.num_bytes, root->fs_info->sectorsize) - 1; - ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC); - if (ret) { - ret = -EAGAIN; - goto out; - } + ret = btrfs_test_range_bit_exists(io_tree, offset, range_end, + EXTENT_DELALLOC); + if (ret) + return -EAGAIN; } if (file_extent) memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent)); *len = nocow_args.file_extent.num_bytes; - ret = 1; -out: - btrfs_free_path(path); - return ret; + + return 1; } /* The callers of this must take lock_extent() */ @@ -7236,7 +7329,7 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, break; } - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) return ERR_PTR(-ENOMEM); @@ -7249,15 +7342,15 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, em->offset = file_extent->offset; em->flags |= EXTENT_FLAG_PINNED; if (type == BTRFS_ORDERED_COMPRESSED) - extent_map_set_compression(em, file_extent->compression); + btrfs_extent_map_set_compression(em, file_extent->compression); ret = btrfs_replace_extent_map_range(inode, em, true); if (ret) { - free_extent_map(em); + btrfs_free_extent_map(em); return ERR_PTR(ret); } - /* em got 2 refs now, callers needs to do free_extent_map once. */ + /* em got 2 refs now, callers need to call btrfs_free_extent_map() once. */ return em; } @@ -7384,7 +7477,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, } if (!inode_evicting) - lock_extent(tree, page_start, page_end, &cached_state); + btrfs_lock_extent(tree, page_start, page_end, &cached_state); cur = page_start; while (cur < page_end) { @@ -7440,10 +7533,10 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * btrfs_finish_ordered_io(). */ if (!inode_evicting) - clear_extent_bit(tree, cur, range_end, - EXTENT_DELALLOC | - EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, &cached_state); + btrfs_clear_extent_bit(tree, cur, range_end, + EXTENT_DELALLOC | + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, &cached_state); spin_lock_irq(&inode->ordered_tree_lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); @@ -7485,12 +7578,11 @@ next: * Since the IO will never happen for this page.
*/ btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL); - if (!inode_evicting) { - clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | - EXTENT_DELALLOC | EXTENT_UPTODATE | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG | - extra_flags, &cached_state); - } + if (!inode_evicting) + btrfs_clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG | extra_flags, + &cached_state); cur = range_end + 1; } /* @@ -7594,7 +7686,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); control.new_size = new_size; - lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); + btrfs_lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); /* * We want to drop from the next block forward in case this new * size is not block aligned since we will be keeping the last @@ -7609,7 +7701,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); btrfs_inode_safe_disk_i_size_write(inode, control.last_size); - unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); + btrfs_unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) @@ -7653,7 +7745,8 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); - ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0); + ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, + inode->vfs_inode.i_size, (u64)-1); if (ret) goto out; trans = btrfs_start_transaction(root, 1); @@ -7765,10 +7858,10 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->i_otime_nsec = 0; inode = &ei->vfs_inode; - extent_map_tree_init(&ei->extent_tree); + btrfs_extent_map_tree_init(&ei->extent_tree); /* This io tree sets the valid inode. 
*/ - extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); + btrfs_extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); ei->io_tree.inode = ei; ei->file_extent_tree = NULL; @@ -8545,7 +8638,7 @@ static int start_delalloc_inodes(struct btrfs_root *root, struct btrfs_inode *inode; struct inode *tmp_inode; - inode = list_entry(splice.next, struct btrfs_inode, delalloc_inodes); + inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes); list_move_tail(&inode->delalloc_inodes, &root->delalloc_inodes); @@ -8909,11 +9002,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_free_reserved_extent(fs_info, ins.objectid, - ins.offset, 0); + ins.offset, false); break; } - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset, cur_offset + ins.offset - 1, false); @@ -8931,7 +9024,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, em->generation = trans->transid; ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true); - free_extent_map(em); + btrfs_free_extent_map(em); next: num_bytes -= ins.offset; cur_offset += ins.offset; @@ -9103,7 +9196,7 @@ static ssize_t btrfs_encoded_read_inline( struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_file_extent_item *item; u64 ram_bytes; @@ -9113,10 +9206,8 @@ static ssize_t btrfs_encoded_read_inline( const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; path->nowait = nowait; @@ -9125,9 +9216,9 @@ static ssize_t btrfs_encoded_read_inline( if (ret) { if (ret > 0) { /* The extent item disappeared? 
*/ - ret = -EIO; + return -EIO; } - goto out; + return ret; } leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -9140,17 +9231,16 @@ static ssize_t btrfs_encoded_read_inline( ret = btrfs_encoded_io_compression_from_extent(fs_info, btrfs_file_extent_compression(leaf, item)); if (ret < 0) - goto out; + return ret; encoded->compression = ret; if (encoded->compression) { size_t inline_size; inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); - if (inline_size > count) { - ret = -ENOBUFS; - goto out; - } + if (inline_size > count) + return -ENOBUFS; + count = inline_size; encoded->unencoded_len = ram_bytes; encoded->unencoded_offset = iocb->ki_pos - extent_start; @@ -9162,13 +9252,12 @@ static ssize_t btrfs_encoded_read_inline( } tmp = kmalloc(count, GFP_NOFS); - if (!tmp) { - ret = -ENOMEM; - goto out; - } + if (!tmp) + return -ENOMEM; + read_extent_buffer(leaf, tmp, ptr, count); btrfs_release_path(path); - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; @@ -9176,8 +9265,7 @@ static ssize_t btrfs_encoded_read_inline( if (ret != count) ret = -EFAULT; kfree(tmp); -out: - btrfs_free_path(path); + return ret; } @@ -9317,7 +9405,7 @@ ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, if (ret) goto out; - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; @@ -9394,7 +9482,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, goto out_unlock_inode; } - if (!try_lock_extent(io_tree, start, lockend, cached_state)) { + if (!btrfs_try_lock_extent(io_tree, start, lockend, cached_state)) { ret = -EAGAIN; goto out_unlock_inode; } @@ -9403,7 +9491,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, lockend - start + 1); if (ordered) { btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); ret = -EAGAIN; goto out_unlock_inode; } @@ -9416,13 +9504,13 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, if (ret) goto out_unlock_inode; - lock_extent(io_tree, start, lockend, cached_state); + btrfs_lock_extent(io_tree, start, lockend, cached_state); ordered = btrfs_lookup_ordered_range(inode, start, lockend - start + 1); if (!ordered) break; btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); cond_resched(); } } @@ -9440,7 +9528,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, * For inline extents we get everything we need out of the * extent item. */ - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, cached_state, extent_start, @@ -9452,7 +9540,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, * We only want to return up to EOF even if the extent extends beyond * that. 
*/ - encoded->len = min_t(u64, extent_map_end(em), + encoded->len = min_t(u64, btrfs_extent_map_end(em), inode->vfs_inode.i_size) - iocb->ki_pos; if (em->disk_bytenr == EXTENT_MAP_HOLE || (em->flags & EXTENT_FLAG_PREALLOC)) { @@ -9460,7 +9548,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, count = min_t(u64, count, encoded->len); encoded->len = count; encoded->unencoded_len = count; - } else if (extent_map_is_compressed(em)) { + } else if (btrfs_extent_map_is_compressed(em)) { *disk_bytenr = em->disk_bytenr; /* * Bail if the buffer isn't large enough to return the whole @@ -9475,12 +9563,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, encoded->unencoded_len = em->ram_bytes; encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset); ret = btrfs_encoded_io_compression_from_extent(fs_info, - extent_map_compression(em)); + btrfs_extent_map_compression(em)); if (ret < 0) goto out_em; encoded->compression = ret; } else { - *disk_bytenr = extent_map_block_start(em) + (start - em->start); + *disk_bytenr = btrfs_extent_map_block_start(em) + (start - em->start); if (encoded->len > count) encoded->len = count; /* @@ -9493,11 +9581,11 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, encoded->unencoded_len = count; *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize); } - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; if (*disk_bytenr == EXTENT_MAP_HOLE) { - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); unlocked = true; ret = iov_iter_zero(count, iter); @@ -9509,11 +9597,11 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, } out_em: - free_extent_map(em); + btrfs_free_extent_map(em); out_unlock_extent: /* Leave inode and extent locked if we need to do a read. 
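As a numeric illustration of the clamping above (values invented for the example): with i_size = 10000, iocb->ki_pos = 8192 and an extent map spanning [8192, 16384), encoded->len = min(16384, 10000) - 8192 = 1808, so the part of the extent beyond EOF is never reported back. A hole or prealloc extent degenerates to zeroed data of that clamped length, while a compressed extent returns the whole unencoded extent (unencoded_len = ram_bytes, with the offset math shown above), or fails with -ENOBUFS when the caller's buffer cannot hold it.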
*/ if (!unlocked && ret != -EIOCBQUEUED) - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); out_unlock_inode: if (!unlocked && ret != -EIOCBQUEUED) btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); @@ -9660,14 +9748,14 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, end >> PAGE_SHIFT); if (ret) goto out_folios; - lock_extent(io_tree, start, end, &cached_state); + btrfs_lock_extent(io_tree, start, end, &cached_state); ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); if (!ordered && !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end)) break; if (ordered) btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, end, &cached_state); + btrfs_unlock_extent(io_tree, start, end, &cached_state); cond_resched(); } @@ -9717,11 +9805,11 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, ret = PTR_ERR(em); goto out_free_reserved; } - free_extent_map(em); + btrfs_free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, - (1 << BTRFS_ORDERED_ENCODED) | - (1 << BTRFS_ORDERED_COMPRESSED)); + (1U << BTRFS_ORDERED_ENCODED) | + (1U << BTRFS_ORDERED_COMPRESSED)); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); @@ -9732,7 +9820,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (start + encoded->len > inode->vfs_inode.i_size) i_size_write(&inode->vfs_inode, start + encoded->len); - unlock_extent(io_tree, start, end, &cached_state); + btrfs_unlock_extent(io_tree, start, end, &cached_state); btrfs_delalloc_release_extents(inode, num_bytes); @@ -9742,7 +9830,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, out_free_reserved: btrfs_dec_block_group_reservations(fs_info, ins.objectid); - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); out_delalloc_release: btrfs_delalloc_release_extents(inode, num_bytes); btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); @@ -9755,9 +9843,9 @@ out_free_data_space: * bytes_may_use. 
*/ if (!extent_reserved) - btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); + btrfs_free_reserved_data_space_noquota(inode, disk_num_bytes); out_unlock: - unlock_extent(io_tree, start, end, &cached_state); + btrfs_unlock_extent(io_tree, start, end, &cached_state); out_folios: for (i = 0; i < nr_folios; i++) { if (folios[i]) @@ -10022,7 +10110,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); - lock_extent(io_tree, 0, isize - 1, &cached_state); + btrfs_lock_extent(io_tree, 0, isize - 1, &cached_state); while (prev_extent_end < isize) { struct btrfs_key key; struct extent_buffer *leaf; @@ -10200,7 +10288,7 @@ out: if (!IS_ERR_OR_NULL(map)) btrfs_free_chunk_map(map); - unlock_extent(io_tree, 0, isize - 1, &cached_state); + btrfs_unlock_extent(io_tree, 0, isize - 1, &cached_state); if (ret) btrfs_swap_deactivate(file); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 63aeacc54945..913acef3f0a9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -909,7 +909,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, if (error == -EINTR) return error; - dentry = lookup_one(idmap, name, parent->dentry, namelen); + dentry = lookup_one(idmap, &QSTR_LEN(name, namelen), parent->dentry); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_unlock; @@ -1446,8 +1446,8 @@ out: return ret; } -static noinline int key_in_sk(const struct btrfs_key *key, - const struct btrfs_ioctl_search_key *sk) +static noinline bool key_in_sk(const struct btrfs_key *key, + const struct btrfs_ioctl_search_key *sk) { struct btrfs_key test; int ret; @@ -1458,7 +1458,7 @@ static noinline int key_in_sk(const struct btrfs_key *key, ret = btrfs_comp_cpu_keys(key, &test); if (ret < 0) - return 0; + return false; test.objectid = sk->max_objectid; test.type = sk->max_type; @@ -1466,8 +1466,8 @@ static noinline int key_in_sk(const struct btrfs_key *key, ret = btrfs_comp_cpu_keys(key, &test); if (ret > 0) - return 0; - return 1; + return false; + return true; } static noinline int copy_to_sk(struct btrfs_path *path, @@ -2288,7 +2288,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL; struct mnt_idmap *idmap = file_mnt_idmap(file); char *subvol_name, *subvol_name_ptr = NULL; - int subvol_namelen; int ret = 0; bool destroy_parent = false; @@ -2411,10 +2410,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out; } - subvol_namelen = strlen(subvol_name); - if (strchr(subvol_name, '/') || - strncmp(subvol_name, "..", subvol_namelen) == 0) { + strcmp(subvol_name, "..") == 0) { ret = -EINVAL; goto free_subvol_name; } @@ -2427,7 +2424,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); if (ret == -EINTR) goto free_subvol_name; - dentry = lookup_one(idmap, subvol_name, parent, subvol_namelen); + dentry = lookup_one(idmap, &QSTR(subvol_name), parent); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto out_unlock_dir; @@ -4510,7 +4507,7 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, args.compression, &unlocked); if (!unlocked) { - unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); } } @@ -4699,7 +4696,7 @@ static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int iss ret = priv->count; out: - 
unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); + btrfs_unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); io_uring_cmd_done(cmd, ret, 0, issue_flags); @@ -4788,7 +4785,7 @@ static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter, return -EIOCBQUEUED; out_fail: - unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); kfree(priv); return ret; @@ -4913,7 +4910,7 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue (const char *)&data->args + copy_end_kernel, sizeof(data->args) - copy_end_kernel)) { if (ret == -EIOCBQUEUED) { - unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); } ret = -EFAULT; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 81e62b652e21..a3e6d9616e60 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -149,15 +149,15 @@ void btrfs_tree_read_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesti /* * Try-lock for read. * - * Return 1 if the rwlock has been taken, 0 otherwise + * Return true if the rwlock has been taken, false otherwise */ -int btrfs_try_tree_read_lock(struct extent_buffer *eb) +bool btrfs_try_tree_read_lock(struct extent_buffer *eb) { if (down_read_trylock(&eb->lock)) { trace_btrfs_try_tree_read_lock(eb); - return 1; + return true; } - return 0; + return false; } /* diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index c69e57ff804b..af29df98ac14 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -189,7 +189,7 @@ static inline void btrfs_tree_read_lock(struct extent_buffer *eb) } void btrfs_tree_read_unlock(struct extent_buffer *eb); -int btrfs_try_tree_read_lock(struct extent_buffer *eb); +bool btrfs_try_tree_read_lock(struct extent_buffer *eb); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index a45bc11f8665..d403641889ca 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -252,9 +252,8 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, /* Compress at most one sector of data each time */ in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off); ASSERT(in_len); - data_in = kmap_local_folio(folio_in, 0); - ret = lzo1x_1_compress(data_in + - offset_in_page(cur_in), in_len, + data_in = kmap_local_folio(folio_in, offset_in_folio(folio_in, cur_in)); + ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, &out_len, workspace->mem); kunmap_local(data_in); diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 08a9272399d2..6abf81bb00c2 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -170,15 +170,83 @@ do { \ #ifdef CONFIG_BTRFS_ASSERT -#define btrfs_assertfail(expr, file, line) ({ \ - pr_err("assertion failed: %s, in %s:%d\n", (expr), (file), (line)); \ - BUG(); \ -}) +__printf(1, 2) +static inline void verify_assert_printk_format(const char *fmt, ...) { + /* Stub to verify the assertion format string. */ +} + +/* Take the first token if any.
*/ +#define __FIRST_ARG(_, ...) _ +/* + * Skip the first token and return the rest, if it's empty the comma is dropped. + * As ##__VA_ARGS__ cannot be at the beginning of the macro, __VA_OPT__ is needed; + * it is supported since GCC 8 and Clang 12. + */ +#define __REST_ARGS(_, ...) __VA_OPT__(,) __VA_ARGS__ + +#if defined(CONFIG_CC_IS_CLANG) || GCC_VERSION >= 80000 +/* + * Assertion with optional printk() format. + * + * Accepted syntax: + * ASSERT(condition); + * ASSERT(condition, "string"); + * ASSERT(condition, "variable=%d", variable); + * + * How it works: + * - if there's no format string, ""[0] evaluates at compile time to 0 and the + * true branch is executed + * - any non-empty format string with the "" prefix evaluates to != 0 at + * compile time and the false branch is executed + * - stringified condition is printed as %s so we don't accidentally mix format + * strings (the % operator) + * - there can be only one printk() call, so the format strings and arguments are + * spliced together: + * DEFAULT_FMT [USER_FMT], DEFAULT_ARGS [, USER_ARGS] + * - comma between DEFAULT_ARGS and USER_ARGS is handled by preprocessor + * (requires __VA_OPT__ support) + * - otherwise we could use __VA_OPT__(,) __VA_ARGS__ for the 2nd+ argument of args. + */ +#define ASSERT(cond, args...) \ +do { \ + verify_assert_printk_format("check the format string" args); \ + if (!likely(cond)) { \ + if (("" __FIRST_ARG(args) [0]) == 0) { \ + pr_err("assertion failed: %s :: %ld, in %s:%d\n", \ + #cond, (long)(cond), __FILE__, __LINE__); \ + } else { \ + pr_err("assertion failed: %s :: %ld, in %s:%d (" __FIRST_ARG(args) ")\n", \ + #cond, (long)(cond), __FILE__, __LINE__ __REST_ARGS(args)); \ + } \ + BUG(); \ + } \ +} while (0) + +#else + +/* For GCC < 8.x only the simple output. */ + +#define ASSERT(cond, args...) \ +do { \ + verify_assert_printk_format("check the format string" args); \ + if (!likely(cond)) { \ + pr_err("assertion failed: %s :: %ld, in %s:%d\n", \ + #cond, (long)(cond), __FILE__, __LINE__); \ + BUG(); \ + } \ +} while (0) + +#endif + +#else +#define ASSERT(cond, args...) (void)(cond) +#endif -#define ASSERT(expr) \ - (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__)) +#ifdef CONFIG_BTRFS_DEBUG +/* Verbose warning only under debug build. */ +#define DEBUG_WARN(args...) WARN(1, KERN_ERR args) #else -#define ASSERT(expr) (void)(expr) +#define DEBUG_WARN(...) do {} while (0) #endif __printf(5, 6) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 03c945711003..9212ce110cde 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -153,25 +153,30 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( struct btrfs_ordered_extent *entry; int ret; u64 qgroup_rsv = 0; + const bool is_nocow = (flags & + ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC))); - if (flags & - ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) { - /* For nocow write, we can release the qgroup rsv right now */ + /* + * For a NOCOW write we can free the qgroup reserve right now. For a COW + * one we transfer the reserved space from the inode's iotree into the + * ordered extent by calling btrfs_qgroup_release_data() and tracking + * the qgroup reserved amount in the ordered extent, so that later after + * completing the ordered extent, when running the data delayed ref it + * creates, we free the reserved data with btrfs_qgroup_free_refroot().
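A user-space reduction of the messages.h ASSERT() rework above, showing how the format string splices into the default message (pr_err()/BUG()/likely() replaced by printf()/abort(); GNU C with __VA_OPT__ support required, i.e. GCC 8+ or Clang 12+ as the comment notes):

#include <stdio.h>
#include <stdlib.h>

/* Compile-time format check, as in the kernel stub. */
static void __attribute__((format(printf, 1, 2)))
verify_fmt(const char *fmt, ...)
{
	(void)fmt;
}

#define __FIRST_ARG(_, ...) _
#define __REST_ARGS(_, ...) __VA_OPT__(,) __VA_ARGS__

#define ASSERT(cond, args...)						\
do {									\
	verify_fmt("check the format string" args);			\
	if (!(cond)) {							\
		if (("" __FIRST_ARG(args) [0]) == 0)			\
			printf("assertion failed: %s :: %ld\n",		\
			       #cond, (long)(cond));			\
		else							\
			printf("assertion failed: %s :: %ld ("		\
			       __FIRST_ARG(args) ")\n",			\
			       #cond, (long)(cond) __REST_ARGS(args));	\
		abort();						\
	}								\
} while (0)

int main(void)
{
	int nr = 3;

	ASSERT(nr > 0);			/* passes, prints nothing */
	ASSERT(nr == 5, "nr=%d", nr);	/* fails with the extra detail */
	return 0;
}

Built with GCC 8+ or Clang 12+, the first call prints nothing and the second prints "assertion failed: nr == 5 :: 0 (nr=3)" before aborting.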
+ */ + if (is_nocow) ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv); - if (ret < 0) - return ERR_PTR(ret); - } else { - /* - * The ordered extent has reserved qgroup space, release now - * and pass the reserved number for qgroup_record to free. - */ + else ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv); - if (ret < 0) - return ERR_PTR(ret); - } + + if (ret < 0) + return ERR_PTR(ret); + entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); - if (!entry) - return ERR_PTR(-ENOMEM); + if (!entry) { + entry = ERR_PTR(-ENOMEM); + goto out; + } entry->file_offset = file_offset; entry->num_bytes = num_bytes; @@ -180,7 +185,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( entry->disk_num_bytes = disk_num_bytes; entry->offset = offset; entry->bytes_left = num_bytes; - entry->inode = BTRFS_I(igrab(&inode->vfs_inode)); + if (WARN_ON_ONCE(!igrab(&inode->vfs_inode))) { + kmem_cache_free(btrfs_ordered_extent_cache, entry); + entry = ERR_PTR(-ESTALE); + goto out; + } + entry->inode = inode; entry->compress_type = compress_type; entry->truncated_len = (u64)-1; entry->qgroup_rsv = qgroup_rsv; @@ -203,6 +213,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( btrfs_mod_outstanding_extents(inode, 1); spin_unlock(&inode->lock); +out: + if (IS_ERR(entry) && !is_nocow) + btrfs_qgroup_free_refroot(inode->root->fs_info, + btrfs_root_id(inode->root), + qgroup_rsv, BTRFS_QGROUP_RSV_DATA); + return entry; } @@ -253,7 +269,7 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry) * @disk_bytenr: Offset of extent on disk. * @disk_num_bytes: Size of extent on disk. * @offset: Offset into unencoded data where file data starts. - * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*). + * @flags: Flags specifying type of extent (1U << BTRFS_ORDERED_*). * @compress_type: Compression algorithm used for data. * * Most of these parameters correspond to &struct btrfs_file_extent_item. 
The @@ -607,23 +623,18 @@ out: */ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) { - struct list_head *cur; - struct btrfs_ordered_sum *sum; - trace_btrfs_ordered_extent_put(entry->inode, entry); if (refcount_dec_and_test(&entry->refs)) { + struct btrfs_ordered_sum *sum; + struct btrfs_ordered_sum *tmp; + ASSERT(list_empty(&entry->root_extent_list)); ASSERT(list_empty(&entry->log_list)); ASSERT(RB_EMPTY_NODE(&entry->rb_node)); - if (entry->inode) - btrfs_add_delayed_iput(entry->inode); - while (!list_empty(&entry->list)) { - cur = entry->list.next; - sum = list_entry(cur, struct btrfs_ordered_sum, list); - list_del(&sum->list); + btrfs_add_delayed_iput(entry->inode); + list_for_each_entry_safe(sum, tmp, &entry->list, list) kvfree(sum); - } kmem_cache_free(btrfs_ordered_extent_cache, entry); } } @@ -1173,7 +1184,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, cachedp = cached_state; while (1) { - lock_extent(&inode->io_tree, start, end, cachedp); + btrfs_lock_extent(&inode->io_tree, start, end, cachedp); ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); if (!ordered) { @@ -1186,7 +1197,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, refcount_dec(&cache->refs); break; } - unlock_extent(&inode->io_tree, start, end, cachedp); + btrfs_unlock_extent(&inode->io_tree, start, end, cachedp); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } @@ -1204,7 +1215,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, { struct btrfs_ordered_extent *ordered; - if (!try_lock_extent(&inode->io_tree, start, end, cached_state)) + if (!btrfs_try_lock_extent(&inode->io_tree, start, end, cached_state)) return false; ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); @@ -1212,7 +1223,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, return true; btrfs_put_ordered_extent(ordered); - unlock_extent(&inode->io_tree, start, end, cached_state); + btrfs_unlock_extent(&inode->io_tree, start, end, cached_state); return false; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index d6fa36674270..b3176edbde82 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -83,7 +83,7 @@ static void qgroup_rsv_add(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup, u64 num_bytes, enum btrfs_qgroup_rsv_type type) { - trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type); + trace_btrfs_qgroup_update_reserve(fs_info, qgroup, num_bytes, type); qgroup->rsv.values[type] += num_bytes; } @@ -91,7 +91,7 @@ static void qgroup_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup, u64 num_bytes, enum btrfs_qgroup_rsv_type type) { - trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type); + trace_btrfs_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type); if (qgroup->rsv.values[type] >= num_bytes) { qgroup->rsv.values[type] -= num_bytes; return; @@ -1823,7 +1823,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] || qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] || qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) { - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); btrfs_warn_rl(fs_info, "to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu", btrfs_qgroup_level(qgroup->qgroupid), @@ -1843,7 +1843,7 @@ int btrfs_remove_qgroup(struct 
btrfs_trans_handle *trans, u64 qgroupid) !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) { if (qgroup->rfer || qgroup->excl || qgroup->rfer_cmpr || qgroup->excl_cmpr) { - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); btrfs_warn_rl(fs_info, "to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu", btrfs_qgroup_level(qgroup->qgroupid), @@ -2837,8 +2837,8 @@ static void qgroup_update_counters(struct btrfs_fs_info *fs_info, cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); - trace_qgroup_update_counters(fs_info, qg, cur_old_count, - cur_new_count); + trace_btrfs_qgroup_update_counters(fs_info, qg, cur_old_count, + cur_new_count); /* Rfer update part */ if (cur_old_count == 0 && cur_new_count > 0) { @@ -3100,8 +3100,7 @@ cleanup: kfree(record); } - trace_qgroup_num_dirty_extents(fs_info, trans->transid, - num_dirty_extents); + trace_btrfs_qgroup_num_dirty_extents(fs_info, trans->transid, num_dirty_extents); return ret; } @@ -4129,8 +4128,8 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode, * Now the entry is in [start, start + len), revert the * EXTENT_QGROUP_RESERVED bit. */ - clear_ret = clear_extent_bits(&inode->io_tree, entry_start, - entry_end, EXTENT_QGROUP_RESERVED); + clear_ret = btrfs_clear_extent_bits(&inode->io_tree, entry_start, + entry_end, EXTENT_QGROUP_RESERVED); if (!ret && clear_ret < 0) ret = clear_ret; @@ -4232,8 +4231,9 @@ static int qgroup_reserve_data(struct btrfs_inode *inode, reserved = *reserved_ret; /* Record already reserved space */ orig_reserved = reserved->bytes_changed; - ret = set_record_extent_bits(&inode->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, reserved); + ret = btrfs_set_record_extent_bits(&inode->io_tree, start, + start + len - 1, EXTENT_QGROUP_RESERVED, + reserved); /* Newly reserved space */ to_reserve = reserved->bytes_changed - orig_reserved; @@ -4326,9 +4326,10 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode, * EXTENT_QGROUP_RESERVED, we won't double free. * So not need to rush. 
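The btrfs_clear_record_extent_bits() calls renamed throughout these hunks share one pattern: clear a bit range and record, via a changeset, how much actually changed, which the caller then accumulates into its freed/released totals. A minimal userspace analogue of that record-while-clearing idea (toy bitmap, illustrative names):

	#include <stdio.h>

	#define NBITS 64

	/* Clear [start, end] and report how many bits actually flipped,
	 * loosely mirroring changeset->bytes_changed. */
	static unsigned int clear_record_range(unsigned long long *map,
					       unsigned int start, unsigned int end)
	{
		unsigned int changed = 0;

		for (unsigned int i = start; i <= end && i < NBITS; i++) {
			if (*map & (1ULL << i)) {
				*map &= ~(1ULL << i);
				changed++;
			}
		}
		return changed;
	}

	int main(void)
	{
		unsigned long long map = 0xff0ULL;	/* bits 4..11 set */
		unsigned int freed = 0;

		freed += clear_record_range(&map, 0, 7);	/* only 4..7 were set */
		freed += clear_record_range(&map, 8, 15);	/* 8..11 */
		printf("freed=%u map=%#llx\n", freed, map);	/* freed=8 map=0 */
		return 0;
	}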
*/ - ret = clear_record_extent_bits(&inode->io_tree, free_start, - free_start + free_len - 1, - EXTENT_QGROUP_RESERVED, &changeset); + ret = btrfs_clear_record_extent_bits(&inode->io_tree, free_start, + free_start + free_len - 1, + EXTENT_QGROUP_RESERVED, + &changeset); if (ret < 0) goto out; freed += changeset.bytes_changed; @@ -4352,9 +4353,9 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, int ret; if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) { - return clear_record_extent_bits(&inode->io_tree, start, - start + len - 1, - EXTENT_QGROUP_RESERVED, NULL); + return btrfs_clear_record_extent_bits(&inode->io_tree, start, + start + len - 1, + EXTENT_QGROUP_RESERVED, NULL); } /* In release case, we shouldn't have @reserved */ @@ -4362,8 +4363,8 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, if (free && reserved) return qgroup_free_reserved_data(inode, reserved, start, len, released); extent_changeset_init(&changeset); - ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1, - EXTENT_QGROUP_RESERVED, &changeset); + ret = btrfs_clear_record_extent_bits(&inode->io_tree, start, start + len - 1, + EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) goto out; @@ -4472,7 +4473,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - trace_qgroup_meta_reserve(root, (s64)num_bytes, type); + trace_btrfs_qgroup_meta_reserve(root, (s64)num_bytes, type); ret = qgroup_reserve(root, num_bytes, enforce, type); if (ret < 0) return ret; @@ -4517,7 +4518,7 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) return; /* TODO: Update trace point to handle such free */ - trace_qgroup_meta_free_all_pertrans(root); + trace_btrfs_qgroup_meta_free_all_pertrans(root); /* Special value -1 means to free all reserved space */ btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), (u64)-1, BTRFS_QGROUP_RSV_META_PERTRANS); @@ -4539,7 +4540,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, */ num_bytes = sub_root_meta_rsv(root, num_bytes, type); BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - trace_qgroup_meta_reserve(root, -(s64)num_bytes, type); + trace_btrfs_qgroup_meta_reserve(root, -(s64)num_bytes, type); btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, type); } @@ -4593,7 +4594,7 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) /* Same as btrfs_qgroup_free_meta_prealloc() */ num_bytes = sub_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC); - trace_qgroup_meta_convert(root, num_bytes); + trace_btrfs_qgroup_meta_convert(root, num_bytes); qgroup_convert_meta(fs_info, btrfs_root_id(root), num_bytes); if (!sb_rdonly(fs_info->sb)) add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS); @@ -4611,8 +4612,8 @@ void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode) int ret; extent_changeset_init(&changeset); - ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1, - EXTENT_QGROUP_RESERVED, &changeset); + ret = btrfs_clear_record_extent_bits(&inode->io_tree, 0, (u64)-1, + EXTENT_QGROUP_RESERVED, &changeset); WARN_ON(ret < 0); if (WARN_ON(changeset.bytes_changed)) { @@ -4766,7 +4767,7 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, * Marking qgroup inconsistent should be enough * for end users. 
*/ - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN("duplicated but mismatched entry found"); ret = -EEXIST; } kfree(block); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index cdd373c27784..3ff2bedfb3a4 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -134,14 +134,17 @@ struct btrfs_stripe_hash_table { }; /* - * A bvec like structure to present a sector inside a page. - * - * Unlike bvec we don't need bvlen, as it's fixed to sectorsize. + * A structure to represent a sector inside a page; the length is fixed to + * sectorsize. */ struct sector_ptr { - struct page *page; - unsigned int pgoff:24; - unsigned int uptodate:8; + /* + * Blocks from the bio list can still be highmem. + * So here we use a physical address to represent the page and the offset inside it. + */ + phys_addr_t paddr; + bool has_paddr; + bool uptodate; }; static void rmw_rbio_work(struct work_struct *work); @@ -200,8 +203,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) struct btrfs_stripe_hash_table *x; struct btrfs_stripe_hash *cur; struct btrfs_stripe_hash *h; - int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; - int i; + unsigned int num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS; if (info->stripe_hash_table) return 0; @@ -222,7 +224,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) h = table->table; - for (i = 0; i < num_entries; i++) { + for (unsigned int i = 0; i < num_entries; i++) { cur = h + i; INIT_LIST_HEAD(&cur->hash_list); spin_lock_init(&cur->lock); @@ -233,6 +235,14 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) return 0; } +static void memcpy_sectors(const struct sector_ptr *dst, + const struct sector_ptr *src, u32 blocksize) +{ + memcpy_page(phys_to_page(dst->paddr), offset_in_page(dst->paddr), + phys_to_page(src->paddr), offset_in_page(src->paddr), + blocksize); +} + /* * caching an rbio means to copy anything from the * bio_sectors array into the stripe_pages array.
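With struct sector_ptr reduced to a single physical address, code that previously had to match a (page, offset) pair, such as find_stripe_sector() later in this diff, becomes a one-scalar compare. A userspace analogue of the same simplification (plain virtual addresses standing in for phys_addr_t, names illustrative):

	#include <stdint.h>
	#include <stdio.h>

	#define SECTORSIZE 4096u

	/* One scalar replaces the old { buffer, offset } pair. */
	struct sector_ptr {
		uintptr_t addr;
		int has_addr;
	};

	static struct sector_ptr *find_sector(struct sector_ptr *sectors, int n,
					      uintptr_t addr)
	{
		for (int i = 0; i < n; i++)
			if (sectors[i].has_addr && sectors[i].addr == addr)
				return &sectors[i];
		return NULL;
	}

	int main(void)
	{
		static unsigned char stripe[4 * SECTORSIZE];
		struct sector_ptr sectors[4];

		for (int i = 0; i < 4; i++) {
			sectors[i].addr = (uintptr_t)stripe + i * SECTORSIZE;
			sectors[i].has_addr = 1;
		}

		/* Lookup is a single compare instead of pair matching. */
		struct sector_ptr *s = find_sector(sectors, 4,
						   (uintptr_t)stripe + 2 * SECTORSIZE);
		printf("found sector %ld\n", s ? (long)(s - sectors) : -1L);
		return 0;
	}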
We @@ -253,7 +263,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) for (i = 0; i < rbio->nr_sectors; i++) { /* Some range not covered by bio (partial write), skip it */ - if (!rbio->bio_sectors[i].page) { + if (!rbio->bio_sectors[i].has_paddr) { /* * Even if the sector is not covered by bio, if it is * a data sector it should still be uptodate as it is @@ -264,12 +274,8 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) continue; } - ASSERT(rbio->stripe_sectors[i].page); - memcpy_page(rbio->stripe_sectors[i].page, - rbio->stripe_sectors[i].pgoff, - rbio->bio_sectors[i].page, - rbio->bio_sectors[i].pgoff, - rbio->bioc->fs_info->sectorsize); + memcpy_sectors(&rbio->stripe_sectors[i], &rbio->bio_sectors[i], + rbio->bioc->fs_info->sectorsize); rbio->stripe_sectors[i].uptodate = 1; } set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); @@ -326,8 +332,13 @@ static void index_stripe_sectors(struct btrfs_raid_bio *rbio) int page_index = offset >> PAGE_SHIFT; ASSERT(page_index < rbio->nr_pages); - rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index]; - rbio->stripe_sectors[i].pgoff = offset_in_page(offset); + if (!rbio->stripe_pages[page_index]) + continue; + + rbio->stripe_sectors[i].has_paddr = true; + rbio->stripe_sectors[i].paddr = + page_to_phys(rbio->stripe_pages[page_index]) + + offset_in_page(offset); } } @@ -507,9 +518,8 @@ static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) spin_lock(&table->cache_lock); while (!list_empty(&table->stripe_cache)) { - rbio = list_entry(table->stripe_cache.next, - struct btrfs_raid_bio, - stripe_cache); + rbio = list_first_entry(&table->stripe_cache, + struct btrfs_raid_bio, stripe_cache); __remove_rbio_from_cache(rbio); } spin_unlock(&table->cache_lock); @@ -567,9 +577,9 @@ static void cache_rbio(struct btrfs_raid_bio *rbio) if (table->cache_size > RBIO_CACHE_SIZE) { struct btrfs_raid_bio *found; - found = list_entry(table->stripe_cache.prev, - struct btrfs_raid_bio, - stripe_cache); + found = list_last_entry(&table->stripe_cache, + struct btrfs_raid_bio, + stripe_cache); if (found != rbio) __remove_rbio_from_cache(found); @@ -882,14 +892,14 @@ done_nolock: remove_rbio_from_cache(rbio); } -static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) +static void rbio_endio_bio_list(struct bio *cur, blk_status_t status) { struct bio *next; while (cur) { next = cur->bi_next; cur->bi_next = NULL; - cur->bi_status = err; + cur->bi_status = status; bio_endio(cur); cur = next; } @@ -899,7 +909,7 @@ static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) * this frees the rbio and runs through all the bios in the * bio_list and calls end_io on them */ -static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) +static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status) { struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *extra; @@ -928,9 +938,9 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) extra = bio_list_get(&rbio->bio_list); free_raid_bio(rbio); - rbio_endio_bio_list(cur, err); + rbio_endio_bio_list(cur, status); if (extra) - rbio_endio_bio_list(extra, err); + rbio_endio_bio_list(extra, status); } /* @@ -962,9 +972,9 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, spin_lock(&rbio->bio_list_lock); sector = &rbio->bio_sectors[index]; - if (sector->page || bio_list_only) { + if (sector->has_paddr || bio_list_only) { /* Don't return sector without a valid page pointer */ - if (!sector->page) + if 
(!sector->has_paddr) sector = NULL; spin_unlock(&rbio->bio_list_lock); return sector; @@ -1142,7 +1152,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, rbio, stripe_nr); ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors, rbio, sector_nr); - ASSERT(sector->page); + ASSERT(sector->has_paddr); stripe = &rbio->bioc->stripes[stripe_nr]; disk_start = stripe->physical + sector_nr * sectorsize; @@ -1173,8 +1183,8 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, */ if (last_end == disk_start && !last->bi_status && last->bi_bdev == stripe->dev->bdev) { - ret = bio_add_page(last, sector->page, sectorsize, - sector->pgoff); + ret = bio_add_page(last, phys_to_page(sector->paddr), + sectorsize, offset_in_page(sector->paddr)); if (ret == sectorsize) return 0; } @@ -1187,7 +1197,8 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; bio->bi_private = rbio; - __bio_add_page(bio, sector->page, sectorsize, sector->pgoff); + __bio_add_page(bio, phys_to_page(sector->paddr), sectorsize, + offset_in_page(sector->paddr)); bio_list_add(bio_list, bio); return 0; } @@ -1195,23 +1206,20 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - struct bio_vec bvec; - struct bvec_iter iter; + const u32 sectorsize_bits = rbio->bioc->fs_info->sectorsize_bits; + struct bvec_iter iter = bio->bi_iter; u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - rbio->bioc->full_stripe_logical; - bio_for_each_segment(bvec, bio, iter) { - u32 bvec_offset; - - for (bvec_offset = 0; bvec_offset < bvec.bv_len; - bvec_offset += sectorsize, offset += sectorsize) { - int index = offset / sectorsize; - struct sector_ptr *sector = &rbio->bio_sectors[index]; + while (iter.bi_size) { + unsigned int index = (offset >> sectorsize_bits); + struct sector_ptr *sector = &rbio->bio_sectors[index]; + struct bio_vec bv = bio_iter_iovec(bio, iter); - sector->page = bvec.bv_page; - sector->pgoff = bvec.bv_offset + bvec_offset; - ASSERT(sector->pgoff < PAGE_SIZE); - } + sector->has_paddr = true; + sector->paddr = bvec_phys(&bv); + bio_advance_iter_single(bio, &iter, sectorsize); + offset += sectorsize; } } @@ -1289,6 +1297,15 @@ static void assert_rbio(struct btrfs_raid_bio *rbio) ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio); } +static inline void *kmap_local_sector(const struct sector_ptr *sector) +{ + /* The sector pointer must have a page mapped to it. */ + ASSERT(sector->has_paddr); + + return kmap_local_page(phys_to_page(sector->paddr)) + + offset_in_page(sector->paddr); +} + /* Generate PQ for one vertical stripe. 
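generate_pq_vertical() collects one mapped sector from each data stripe plus the parity sector(s); in the RAID5 case the P sector is just the byte-wise XOR of the data sectors (RAID6 adds the Q syndrome via raid6_call). A tiny standalone illustration of that vertical parity (toy sizes, not btrfs code):

	#include <stdio.h>
	#include <string.h>

	#define SECTORSIZE 16	/* tiny blocks keep the demo printable */
	#define NR_DATA 3

	/* RAID5-style P parity: XOR one sector from each data stripe. */
	static void gen_p_vertical(unsigned char *data[NR_DATA],
				   unsigned char *parity)
	{
		memcpy(parity, data[0], SECTORSIZE);
		for (int s = 1; s < NR_DATA; s++)
			for (int i = 0; i < SECTORSIZE; i++)
				parity[i] ^= data[s][i];
	}

	int main(void)
	{
		unsigned char d0[SECTORSIZE] = "aaaaaaaaaaaaaaa";
		unsigned char d1[SECTORSIZE] = "bbbbbbbbbbbbbbb";
		unsigned char d2[SECTORSIZE] = "ccccccccccccccc";
		unsigned char *data[NR_DATA] = { d0, d1, d2 };
		unsigned char p[SECTORSIZE];

		gen_p_vertical(data, p);
		/* Any lost data sector equals the XOR of parity and the rest. */
		printf("p[0]=%#x\n", p[0]);	/* 'a' ^ 'b' ^ 'c' = 0x60 */
		return 0;
	}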
*/ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) { @@ -1301,14 +1318,13 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) /* First collect one sector from each data stripe */ for (stripe = 0; stripe < rbio->nr_data; stripe++) { sector = sector_in_rbio(rbio, stripe, sectornr, 0); - pointers[stripe] = kmap_local_page(sector->page) + - sector->pgoff; + pointers[stripe] = kmap_local_sector(sector); } /* Then add the parity stripe */ sector = rbio_pstripe_sector(rbio, sectornr); sector->uptodate = 1; - pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; + pointers[stripe++] = kmap_local_sector(sector); if (has_qstripe) { /* @@ -1317,8 +1333,7 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) */ sector = rbio_qstripe_sector(rbio, sectornr); sector->uptodate = 1; - pointers[stripe++] = kmap_local_page(sector->page) + - sector->pgoff; + pointers[stripe++] = kmap_local_sector(sector); assert_rbio(rbio); raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, @@ -1477,15 +1492,14 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) * stripe_pages[], thus we need to locate the sector. */ static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, - struct page *page, - unsigned int pgoff) + phys_addr_t paddr) { int i; for (i = 0; i < rbio->nr_sectors; i++) { struct sector_ptr *sector = &rbio->stripe_sectors[i]; - if (sector->page == page && sector->pgoff == pgoff) + if (sector->has_paddr && sector->paddr == paddr) return sector; } return NULL; @@ -1505,11 +1519,10 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) bio_for_each_segment_all(bvec, bio, iter_all) { struct sector_ptr *sector; - int pgoff; + phys_addr_t paddr = bvec_phys(bvec); - for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len; - pgoff += sectorsize) { - sector = find_stripe_sector(rbio, bvec->bv_page, pgoff); + for (u32 off = 0; off < bvec->bv_len; off += sectorsize) { + sector = find_stripe_sector(rbio, paddr + off); ASSERT(sector); if (sector) sector->uptodate = 1; @@ -1519,17 +1532,14 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) { - struct bio_vec *bv = bio_first_bvec_all(bio); + phys_addr_t bvec_paddr = bvec_phys(bio_first_bvec_all(bio)); int i; for (i = 0; i < rbio->nr_sectors; i++) { - struct sector_ptr *sector; - - sector = &rbio->stripe_sectors[i]; - if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + if (rbio->stripe_sectors[i].paddr == bvec_paddr) break; - sector = &rbio->bio_sectors[i]; - if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + if (rbio->bio_sectors[i].has_paddr && + rbio->bio_sectors[i].paddr == bvec_paddr) break; } ASSERT(i < rbio->nr_sectors); @@ -1575,11 +1585,11 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, return; bio_for_each_segment_all(bvec, bio, iter_all) { - int bv_offset; + void *kaddr; - for (bv_offset = bvec->bv_offset; - bv_offset < bvec->bv_offset + bvec->bv_len; - bv_offset += fs_info->sectorsize, total_sector_nr++) { + kaddr = bvec_kmap_local(bvec); + for (u32 off = 0; off < bvec->bv_len; + off += fs_info->sectorsize, total_sector_nr++) { u8 csum_buf[BTRFS_CSUM_SIZE]; u8 *expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size; @@ -1589,11 +1599,12 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, if 
(!test_bit(total_sector_nr, rbio->csum_bitmap)) continue; - ret = btrfs_check_sector_csum(fs_info, bvec->bv_page, - bv_offset, csum_buf, expected_csum); + ret = btrfs_check_sector_csum(fs_info, kaddr + off, + csum_buf, expected_csum); if (ret < 0) set_bit(total_sector_nr, rbio->error_bitmap); } + kunmap_local(kaddr); } } @@ -1689,8 +1700,8 @@ static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule) list_sort(NULL, &plug->rbio_list, plug_cmp); while (!list_empty(&plug->rbio_list)) { - cur = list_entry(plug->rbio_list.next, - struct btrfs_raid_bio, plug_list); + cur = list_first_entry(&plug->rbio_list, + struct btrfs_raid_bio, plug_list); list_del_init(&cur->plug_list); if (rbio_is_full(cur)) { @@ -1791,6 +1802,7 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, struct sector_ptr *sector; u8 csum_buf[BTRFS_CSUM_SIZE]; u8 *csum_expected; + void *kaddr; int ret; if (!rbio->csum_bitmap || !rbio->csum_buf) @@ -1809,13 +1821,12 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); } - ASSERT(sector->page); - csum_expected = rbio->csum_buf + (stripe_nr * rbio->stripe_nsectors + sector_nr) * fs_info->csum_size; - ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff, - csum_buf, csum_expected); + kaddr = kmap_local_sector(sector); + ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, csum_expected); + kunmap_local(kaddr); return ret; } @@ -1872,9 +1883,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, } else { sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); } - ASSERT(sector->page); - pointers[stripe_nr] = kmap_local_page(sector->page) + - sector->pgoff; + pointers[stripe_nr] = kmap_local_sector(sector); unmap_array[stripe_nr] = pointers[stripe_nr]; } @@ -2282,9 +2291,8 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) static void raid_wait_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - blk_status_t err = bio->bi_status; - if (err) + if (bio->bi_status) rbio_update_error_bitmap(rbio, bio); bio_put(bio); if (atomic_dec_and_test(&rbio->stripes_pending)) @@ -2326,7 +2334,7 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) * thus this rbio can not be cached one, as cached one must * have all its data sectors present and uptodate. 
*/ - if (!sector->page || !sector->uptodate) + if (!sector->has_paddr || !sector->uptodate) return true; } return false; @@ -2516,6 +2524,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) int stripe; int sectornr; bool has_qstripe; + struct page *page; struct sector_ptr p_sector = { 0 }; struct sector_ptr q_sector = { 0 }; struct bio_list bio_list; @@ -2547,29 +2556,33 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) */ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - p_sector.page = alloc_page(GFP_NOFS); - if (!p_sector.page) + page = alloc_page(GFP_NOFS); + if (!page) return -ENOMEM; - p_sector.pgoff = 0; + p_sector.has_paddr = true; + p_sector.paddr = page_to_phys(page); p_sector.uptodate = 1; + page = NULL; if (has_qstripe) { /* RAID6, allocate and map temp space for the Q stripe */ - q_sector.page = alloc_page(GFP_NOFS); - if (!q_sector.page) { - __free_page(p_sector.page); - p_sector.page = NULL; + page = alloc_page(GFP_NOFS); + if (!page) { + __free_page(phys_to_page(p_sector.paddr)); + p_sector.has_paddr = false; return -ENOMEM; } - q_sector.pgoff = 0; + q_sector.has_paddr = true; + q_sector.paddr = page_to_phys(page); q_sector.uptodate = 1; - pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page); + page = NULL; + pointers[rbio->real_stripes - 1] = kmap_local_sector(&q_sector); } bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* Map the parity stripe just once */ - pointers[nr_data] = kmap_local_page(p_sector.page); + pointers[nr_data] = kmap_local_sector(&p_sector); for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; @@ -2578,8 +2591,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) /* first collect one page from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { sector = sector_in_rbio(rbio, stripe, sectornr, 0); - pointers[stripe] = kmap_local_page(sector->page) + - sector->pgoff; + pointers[stripe] = kmap_local_sector(sector); } if (has_qstripe) { @@ -2595,7 +2607,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) /* Check scrubbing parity and repair it */ sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); - parity = kmap_local_page(sector->page) + sector->pgoff; + parity = kmap_local_sector(sector); if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0) memcpy(parity, pointers[rbio->scrubp], sectorsize); else @@ -2608,12 +2620,11 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) } kunmap_local(pointers[nr_data]); - __free_page(p_sector.page); - p_sector.page = NULL; - if (q_sector.page) { - kunmap_local(pointers[rbio->real_stripes - 1]); - __free_page(q_sector.page); - q_sector.page = NULL; + __free_page(phys_to_page(p_sector.paddr)); + p_sector.has_paddr = false; + if (q_sector.has_paddr) { + __free_page(phys_to_page(q_sector.paddr)); + q_sector.has_paddr = false; } /* diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 15c296cb4dac..62161beca559 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -87,7 +87,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, btrfs_alloc_write_mask(mapping)); if (IS_ERR(folio)) { - ret = -ENOMEM; + ret = PTR_ERR(folio); goto out_unlock; } @@ -95,9 +95,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode, if (ret < 0) goto out_unlock; - clear_extent_bit(&inode->io_tree, file_offset, range_end, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - NULL); + btrfs_clear_extent_bits(&inode->io_tree, 
file_offset, range_end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG); ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); if (ret) goto out_unlock; @@ -646,10 +645,10 @@ static int btrfs_extent_same_range(struct btrfs_inode *src, u64 loff, u64 len, * because we have already locked the inode's i_mmap_lock in exclusive * mode. */ - lock_extent(&dst->io_tree, dst_loff, end, &cached_state); + btrfs_lock_extent(&dst->io_tree, dst_loff, end, &cached_state); ret = btrfs_clone(&src->vfs_inode, &dst->vfs_inode, loff, len, ALIGN(len, bs), dst_loff, 1); - unlock_extent(&dst->io_tree, dst_loff, end, &cached_state); + btrfs_unlock_extent(&dst->io_tree, dst_loff, end, &cached_state); btrfs_btree_balance_dirty(fs_info); @@ -749,9 +748,9 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, * mode. */ end = destoff + len - 1; - lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); + btrfs_lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); - unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); /* * We may have copied an inline extent into a page of the destination diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f948f4f6431c..02086191630d 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -178,8 +178,9 @@ static void mark_block_processed(struct reloc_control *rc, in_range(node->bytenr, rc->block_group->start, rc->block_group->length)) { blocksize = rc->extent_root->fs_info->nodesize; - set_extent_bit(&rc->processed_blocks, node->bytenr, - node->bytenr + blocksize - 1, EXTENT_DIRTY, NULL); + btrfs_set_extent_bit(&rc->processed_blocks, node->bytenr, + node->bytenr + blocksize - 1, EXTENT_DIRTY, + NULL); } node->processed = 1; } @@ -195,8 +196,8 @@ static struct btrfs_backref_node *walk_up_backref( int idx = *index; while (!list_empty(&node->upper)) { - edge = list_entry(node->upper.next, - struct btrfs_backref_edge, list[LOWER]); + edge = list_first_entry(&node->upper, struct btrfs_backref_edge, + list[LOWER]); edges[idx++] = edge; node = edge->node[UPPER]; } @@ -222,8 +223,8 @@ static struct btrfs_backref_node *walk_down_backref( idx--; continue; } - edge = list_entry(edge->list[LOWER].next, - struct btrfs_backref_edge, list[LOWER]); + edge = list_first_entry(&edge->list[LOWER], struct btrfs_backref_edge, + list[LOWER]); edges[idx - 1] = edge; *index = idx; return edge->node[UPPER]; @@ -347,8 +348,8 @@ static bool handle_useless_nodes(struct reloc_control *rc, struct btrfs_backref_edge *edge; struct btrfs_backref_node *lower; - edge = list_entry(cur->lower.next, - struct btrfs_backref_edge, list[UPPER]); + edge = list_first_entry(&cur->lower, struct btrfs_backref_edge, + list[UPPER]); list_del(&edge->list[UPPER]); list_del(&edge->list[LOWER]); lower = edge->node[LOWER]; @@ -910,16 +911,16 @@ int replace_file_extents(struct btrfs_trans_handle *trans, /* Take mmap lock to serialize with reflinks. 
*/ if (!down_read_trylock(&inode->i_mmap_lock)) continue; - ret = try_lock_extent(&inode->io_tree, key.offset, - end, &cached_state); + ret = btrfs_try_lock_extent(&inode->io_tree, key.offset, + end, &cached_state); if (!ret) { up_read(&inode->i_mmap_lock); continue; } btrfs_drop_extent_map_range(inode, key.offset, end, true); - unlock_extent(&inode->io_tree, key.offset, end, - &cached_state); + btrfs_unlock_extent(&inode->io_tree, key.offset, end, + &cached_state); up_read(&inode->i_mmap_lock); } } @@ -1378,9 +1379,9 @@ static int invalidate_extent_cache(struct btrfs_root *root, } /* the lock_extent waits for read_folio to complete */ - lock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_lock_extent(&inode->io_tree, start, end, &cached_state); btrfs_drop_extent_map_range(inode, start, end, true); - unlock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); } return 0; } @@ -1697,8 +1698,8 @@ again: rc->merge_reloc_tree = true; while (!list_empty(&rc->reloc_roots)) { - reloc_root = list_entry(rc->reloc_roots.next, - struct btrfs_root, root_list); + reloc_root = list_first_entry(&rc->reloc_roots, + struct btrfs_root, root_list); list_del_init(&reloc_root->root_list); root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, @@ -1813,8 +1814,7 @@ again: while (!list_empty(&reloc_roots)) { found = 1; - reloc_root = list_entry(reloc_roots.next, - struct btrfs_root, root_list); + reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list); root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); @@ -1930,11 +1930,11 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, * reloc root without a corresponding root this could return ENOENT. 
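A recurring change in this series replaces ASSERT(0) and open-coded WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)) with DEBUG_WARN(), which also accepts an optional message. Its definition is not part of this diff; a plausible userspace approximation of such a debug-only warning macro (the kernel version would presumably sit on top of WARN()) could look like:

	#include <stdio.h>

	/* Compile with -DCONFIG_BTRFS_DEBUG to make the warnings fire. */
	#ifdef CONFIG_BTRFS_DEBUG
	#define DEBUG_WARN(fmt, ...) \
		fprintf(stderr, "debug warning: " fmt "\n", ##__VA_ARGS__)
	#else
	#define DEBUG_WARN(fmt, ...) do { } while (0)
	#endif

	int main(void)
	{
		DEBUG_WARN("unexpected reloc root found");
		DEBUG_WARN("error %ld reading root for reloc root", -2L);
		return 0;
	}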
*/ if (IS_ERR(root)) { - ASSERT(0); + DEBUG_WARN("error %ld reading root for reloc root", PTR_ERR(root)); return PTR_ERR(root); } if (root->reloc_root != reloc_root) { - ASSERT(0); + DEBUG_WARN("unexpected reloc root found"); btrfs_err(fs_info, "root %llu has two reloc roots associated with it", reloc_root->root_key.offset); @@ -2109,8 +2109,8 @@ static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc, if (list_empty(&next->upper)) break; - edge = list_entry(next->upper.next, - struct btrfs_backref_edge, list[LOWER]); + edge = list_first_entry(&next->upper, struct btrfs_backref_edge, + list[LOWER]); edges[index++] = edge; next = edge->node[UPPER]; } @@ -2356,8 +2356,8 @@ static int finish_pending_nodes(struct btrfs_trans_handle *trans, for (level = 0; level < BTRFS_MAX_LEVEL; level++) { while (!list_empty(&cache->pending[level])) { - node = list_entry(cache->pending[level].next, - struct btrfs_backref_node, list); + node = list_first_entry(&cache->pending[level], + struct btrfs_backref_node, list); list_move_tail(&node->list, &list); BUG_ON(!node->pending); @@ -2395,8 +2395,8 @@ static void update_processed_blocks(struct reloc_control *rc, if (list_empty(&next->upper)) break; - edge = list_entry(next->upper.next, - struct btrfs_backref_edge, list[LOWER]); + edge = list_first_entry(&next->upper, struct btrfs_backref_edge, + list[LOWER]); edges[index++] = edge; next = edge->node[UPPER]; } @@ -2408,8 +2408,8 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc) { u32 blocksize = rc->extent_root->fs_info->nodesize; - if (test_range_bit(&rc->processed_blocks, bytenr, - bytenr + blocksize - 1, EXTENT_DIRTY, NULL)) + if (btrfs_test_range_bit(&rc->processed_blocks, bytenr, + bytenr + blocksize - 1, EXTENT_DIRTY, NULL)) return 1; return 0; } @@ -2706,9 +2706,6 @@ static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control if (ret < 0) return ret; - clear_extent_bits(&inode->io_tree, i_size, - round_up(i_size, PAGE_SIZE) - 1, - EXTENT_UPTODATE); folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT); /* * If page is freed we don't need to do anything then, as we @@ -2738,21 +2735,21 @@ static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control else end = cluster->end - offset; - lock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_lock_extent(&inode->io_tree, start, end, &cached_state); num_bytes = end + 1 - start; ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); cur_offset = end + 1; - unlock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); if (ret) break; } btrfs_inode_unlock(inode, 0); if (cur_offset < prealloc_end) - btrfs_free_reserved_data_space_noquota(inode->root->fs_info, - prealloc_end + 1 - cur_offset); + btrfs_free_reserved_data_space_noquota(inode, + prealloc_end + 1 - cur_offset); return ret; } @@ -2766,7 +2763,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_contr u64 end = rc->cluster.end - offset; int ret = 0; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) return -ENOMEM; @@ -2777,10 +2774,10 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_contr em->ram_bytes = em->len; em->flags |= EXTENT_FLAG_PINNED; - lock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_lock_extent(&inode->io_tree, start, end, &cached_state); ret = btrfs_replace_extent_map_range(inode, em, false); - 
unlock_extent(&inode->io_tree, start, end, &cached_state); - free_extent_map(em); + btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_free_extent_map(em); return ret; } @@ -2902,15 +2899,15 @@ again: goto release_folio; /* Mark the range delalloc and dirty for later writeback */ - lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, - &cached_state); + btrfs_lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, + clamped_end, &cached_state); ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, clamped_end, 0, &cached_state); if (ret) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, - clamped_start, clamped_end, - EXTENT_LOCKED | EXTENT_BOUNDARY, - &cached_state); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, + clamped_start, clamped_end, + EXTENT_LOCKED | EXTENT_BOUNDARY, + &cached_state); btrfs_delalloc_release_metadata(BTRFS_I(inode), clamped_len, true); btrfs_delalloc_release_extents(BTRFS_I(inode), @@ -2932,12 +2929,12 @@ again: u64 boundary_end = boundary_start + fs_info->sectorsize - 1; - set_extent_bit(&BTRFS_I(inode)->io_tree, - boundary_start, boundary_end, - EXTENT_BOUNDARY, NULL); + btrfs_set_extent_bit(&BTRFS_I(inode)->io_tree, + boundary_start, boundary_end, + EXTENT_BOUNDARY, NULL); } - unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, - &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, + &cached_state); btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); cur += clamped_len; @@ -3435,9 +3432,9 @@ next: goto next; } - block_found = find_first_extent_bit(&rc->processed_blocks, - key.objectid, &start, &end, - EXTENT_DIRTY, NULL); + block_found = btrfs_find_first_extent_bit(&rc->processed_blocks, + key.objectid, &start, &end, + EXTENT_DIRTY, NULL); if (block_found && start <= key.objectid) { btrfs_release_path(path); @@ -3646,7 +3643,7 @@ restart: } btrfs_release_path(path); - clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); + btrfs_clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); if (trans) { btrfs_end_transaction_throttle(trans); @@ -3803,7 +3800,7 @@ out: if (ret) { if (inode) iput(&inode->vfs_inode); - inode = ERR_PTR(ret); + return ERR_PTR(ret); } return &inode->vfs_inode; } @@ -3862,7 +3859,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) btrfs_backref_init_cache(fs_info, &rc->backref_cache, true); rc->reloc_root_tree.rb_root = RB_ROOT; spin_lock_init(&rc->reloc_root_tree.lock); - extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); + btrfs_extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); return rc; } @@ -4185,8 +4182,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) rc->merge_reloc_tree = true; while (!list_empty(&reloc_roots)) { - reloc_root = list_entry(reloc_roots.next, - struct btrfs_root, root_list); + reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list); list_del(&reloc_root->root_list); if (btrfs_root_refs(&reloc_root->root_item) == 0) { @@ -4279,7 +4275,7 @@ int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered) while (!list_empty(&list)) { struct btrfs_ordered_sum *sums = - list_entry(list.next, struct btrfs_ordered_sum, list); + list_first_entry(&list, struct btrfs_ordered_sum, list); list_del_init(&sums->list); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2c5edcee9450..ce36fafc771e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -66,8 +66,6 @@ struct scrub_ctx; /* 
Represent one sector and its needed info to verify the content. */ struct scrub_sector_verification { - bool is_metadata; - union { /* * Csum pointer for data csum verification. Should point to a * csum inside scrub_stripe::csums. * * NULL if this data sector has no csum. */ @@ -100,6 +98,38 @@ enum scrub_stripe_flags { SCRUB_STRIPE_FLAG_NO_REPORT, }; +/* + * We have multiple bitmaps for one scrub_stripe. + * However each bitmap has at most (BTRFS_STRIPE_LEN / blocksize) bits, + * which is normally 16 and much smaller than BITS_PER_LONG (32 or 64). + * + * So to reduce memory usage for each scrub_stripe, we pack those bitmaps + * into a larger one. + * + * These enums record where each sub-bitmap is inside the larger one. + * Each sub-bitmap starts at bit scrub_bitmap_nr_##name * nr_sectors. + */ +enum { + /* Which blocks are covered by extent items. */ + scrub_bitmap_nr_has_extent = 0, + + /* Which blocks are metadata. */ + scrub_bitmap_nr_is_metadata, + + /* + * Which blocks have errors, including IO, csum, and metadata + * errors. + * This sub-bitmap is the OR result of the next few error-related + * sub-bitmaps. + */ + scrub_bitmap_nr_error, + scrub_bitmap_nr_io_error, + scrub_bitmap_nr_csum_error, + scrub_bitmap_nr_meta_error, + scrub_bitmap_nr_meta_gen_error, + scrub_bitmap_nr_last, +}; + #define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE) /* @@ -138,36 +168,15 @@ struct scrub_stripe { */ unsigned long state; - /* Indicate which sectors are covered by extent items. */ - unsigned long extent_sector_bitmap; - - /* - * The errors hit during the initial read of the stripe. - * - * Would be utilized for error reporting and repair. - * - * The remaining init_nr_* records the number of errors hit, only used - * by error reporting. - */ - unsigned long init_error_bitmap; - unsigned int init_nr_io_errors; - unsigned int init_nr_csum_errors; - unsigned int init_nr_meta_errors; + /* The large bitmap contains all the sub-bitmaps. */ + unsigned long bitmaps[BITS_TO_LONGS(scrub_bitmap_nr_last * + (BTRFS_STRIPE_LEN / BTRFS_MIN_BLOCKSIZE))]; /* - * The following error bitmaps are all for the current status. - * Every time we submit a new read, these bitmaps may be updated. - * - * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap; - * - * IO and csum errors can happen for both metadata and data. + * For writeback (repair or replace) error reporting. + * This one is protected by a spinlock, thus cannot be packed into + * the larger bitmap. */ - unsigned long error_bitmap; - unsigned long io_error_bitmap; - unsigned long csum_error_bitmap; - unsigned long meta_error_bitmap; - - /* For writeback (repair or replace) error reporting. */ unsigned long write_error_bitmap; /* Writeback can be concurrent, thus we need to protect the bitmap.
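The layout described above puts each sub-bitmap at a fixed offset of scrub_bitmap_nr_##name * nr_sectors bits, which is exactly what scrub_calc_start_bit() in the next hunk computes before delegating to the regular bitmap helpers. A compact standalone demo of the indexing (a subset of the enum, one 64-bit word assumed, not btrfs code):

	#include <stdio.h>

	#define NR_SECTORS 16u	/* e.g. BTRFS_STRIPE_LEN / 4K blocksize */

	/* Order of the packed sub-bitmaps (subset for the demo). */
	enum {
		nr_has_extent = 0,
		nr_is_metadata,
		nr_error,
		nr_last,
	};

	/* Sub-bitmap @nr occupies bits [nr * NR_SECTORS, (nr + 1) * NR_SECTORS). */
	static unsigned int start_bit(unsigned int nr, unsigned int block)
	{
		return nr * NR_SECTORS + block;
	}

	int main(void)
	{
		/* nr_last * NR_SECTORS = 48 bits, packed into one 64-bit word. */
		unsigned long long bits = 0;

		bits |= 1ULL << start_bit(nr_error, 5);	/* block 5 has an error */

		printf("error[5]=%d is_metadata[5]=%d\n",
		       !!(bits & (1ULL << start_bit(nr_error, 5))),
		       !!(bits & (1ULL << start_bit(nr_is_metadata, 5))));
		return 0;
	}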
*/ @@ -219,6 +228,90 @@ struct scrub_ctx { refcount_t refs; }; +#define scrub_calc_start_bit(stripe, name, block_nr) \ +({ \ + unsigned int __start_bit; \ + \ + ASSERT(block_nr < stripe->nr_sectors, \ + "nr_sectors=%u block_nr=%u", stripe->nr_sectors, block_nr); \ + __start_bit = scrub_bitmap_nr_##name * stripe->nr_sectors + block_nr; \ + __start_bit; \ +}) + +#define IMPLEMENT_SCRUB_BITMAP_OPS(name) \ +static inline void scrub_bitmap_set_##name(struct scrub_stripe *stripe, \ + unsigned int block_nr, \ + unsigned int nr_blocks) \ +{ \ + const unsigned int start_bit = scrub_calc_start_bit(stripe, \ + name, block_nr); \ + \ + bitmap_set(stripe->bitmaps, start_bit, nr_blocks); \ +} \ +static inline void scrub_bitmap_clear_##name(struct scrub_stripe *stripe, \ + unsigned int block_nr, \ + unsigned int nr_blocks) \ +{ \ + const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ + block_nr); \ + \ + bitmap_clear(stripe->bitmaps, start_bit, nr_blocks); \ +} \ +static inline bool scrub_bitmap_test_bit_##name(struct scrub_stripe *stripe, \ + unsigned int block_nr) \ +{ \ + const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ + block_nr); \ + \ + return test_bit(start_bit, stripe->bitmaps); \ +} \ +static inline void scrub_bitmap_set_bit_##name(struct scrub_stripe *stripe, \ + unsigned int block_nr) \ +{ \ + const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ + block_nr); \ + \ + set_bit(start_bit, stripe->bitmaps); \ +} \ +static inline void scrub_bitmap_clear_bit_##name(struct scrub_stripe *stripe, \ + unsigned int block_nr) \ +{ \ + const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ + block_nr); \ + \ + clear_bit(start_bit, stripe->bitmaps); \ +} \ +static inline unsigned long scrub_bitmap_read_##name(struct scrub_stripe *stripe) \ +{ \ + const unsigned int nr_blocks = stripe->nr_sectors; \ + \ + ASSERT(nr_blocks > 0 && nr_blocks <= BITS_PER_LONG, \ + "nr_blocks=%u BITS_PER_LONG=%u", \ + nr_blocks, BITS_PER_LONG); \ + \ + return bitmap_read(stripe->bitmaps, nr_blocks * scrub_bitmap_nr_##name, \ + stripe->nr_sectors); \ +} \ +static inline bool scrub_bitmap_empty_##name(struct scrub_stripe *stripe) \ +{ \ + unsigned long bitmap = scrub_bitmap_read_##name(stripe); \ + \ + return bitmap_empty(&bitmap, stripe->nr_sectors); \ +} \ +static inline unsigned int scrub_bitmap_weight_##name(struct scrub_stripe *stripe) \ +{ \ + unsigned long bitmap = scrub_bitmap_read_##name(stripe); \ + \ + return bitmap_weight(&bitmap, stripe->nr_sectors); \ +} +IMPLEMENT_SCRUB_BITMAP_OPS(has_extent); +IMPLEMENT_SCRUB_BITMAP_OPS(is_metadata); +IMPLEMENT_SCRUB_BITMAP_OPS(error); +IMPLEMENT_SCRUB_BITMAP_OPS(io_error); +IMPLEMENT_SCRUB_BITMAP_OPS(csum_error); +IMPLEMENT_SCRUB_BITMAP_OPS(meta_error); +IMPLEMENT_SCRUB_BITMAP_OPS(meta_gen_error); + struct scrub_warning { struct btrfs_path *path; u64 extent_item_size; @@ -228,6 +321,19 @@ struct scrub_warning { struct btrfs_device *dev; }; +struct scrub_error_records { + /* + * Bitmap recording which blocks hit errors (IO/csum/...) during the + * initial read. 
+ */ + unsigned long init_error_bitmap; + + unsigned int nr_io_errors; + unsigned int nr_csum_errors; + unsigned int nr_meta_errors; + unsigned int nr_meta_gen_errors; +}; + static void release_scrub_stripe(struct scrub_stripe *stripe) { if (!stripe) @@ -579,20 +685,15 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) return ret; } -static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr) +static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr) { - struct btrfs_fs_info *fs_info = stripe->bg->fs_info; - int page_index = (sector_nr << fs_info->sectorsize_bits) >> PAGE_SHIFT; + u32 offset = (sector_nr << stripe->bg->fs_info->sectorsize_bits); + const struct page *page = stripe->pages[offset >> PAGE_SHIFT]; - return stripe->pages[page_index]; -} - -static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe, - int sector_nr) -{ - struct btrfs_fs_info *fs_info = stripe->bg->fs_info; - - return offset_in_page(sector_nr << fs_info->sectorsize_bits); + /* stripe->pages[] is allocated by us and no highmem is allowed. */ + ASSERT(page); + ASSERT(!PageHighMem(page)); + return page_address(page) + offset_in_page(offset); } static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr) @@ -600,24 +701,22 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr struct btrfs_fs_info *fs_info = stripe->bg->fs_info; const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits); - const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr); - const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr); + void *first_kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); + struct btrfs_header *header = first_kaddr; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 on_disk_csum[BTRFS_CSUM_SIZE]; u8 calculated_csum[BTRFS_CSUM_SIZE]; - struct btrfs_header *header; /* * Here we don't have a good way to attach the pages (and subpages) * to a dummy extent buffer, thus we have to directly grab the members * from pages. 
*/ - header = (struct btrfs_header *)(page_address(first_page) + first_off); memcpy(on_disk_csum, header->csum, fs_info->csum_size); if (logical != btrfs_stack_header_bytenr(header)) { - bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); - bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad bytenr, has %llu want %llu", logical, stripe->mirror_num, @@ -626,8 +725,8 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr } if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != 0) { - bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); - bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad fsid, has %pU want %pU", logical, stripe->mirror_num, @@ -636,8 +735,8 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr } if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE) != 0) { - bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); - bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", logical, stripe->mirror_num, @@ -648,21 +747,18 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr /* Now check tree block csum. 
*/ shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); - crypto_shash_update(shash, page_address(first_page) + first_off + - BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE); + crypto_shash_update(shash, first_kaddr + BTRFS_CSUM_SIZE, + fs_info->sectorsize - BTRFS_CSUM_SIZE); for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) { - struct page *page = scrub_stripe_get_page(stripe, i); - unsigned int page_off = scrub_stripe_get_page_offset(stripe, i); - - crypto_shash_update(shash, page_address(page) + page_off, + crypto_shash_update(shash, scrub_stripe_get_kaddr(stripe, i), fs_info->sectorsize); } crypto_shash_final(shash, calculated_csum); if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) { - bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); - bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, logical, stripe->mirror_num, @@ -672,8 +768,8 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr } if (stripe->sectors[sector_nr].generation != btrfs_stack_header_generation(header)) { - bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); - bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_set_meta_gen_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad generation, has %llu want %llu", logical, stripe->mirror_num, @@ -681,9 +777,10 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr stripe->sectors[sector_nr].generation); return; } - bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree); - bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); - bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_clear_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_clear_csum_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_clear_meta_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_clear_meta_gen_error(stripe, sector_nr, sectors_per_tree); } static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) @@ -691,23 +788,22 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct scrub_sector_verification *sector = &stripe->sectors[sector_nr]; const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; - struct page *page = scrub_stripe_get_page(stripe, sector_nr); - unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr); + void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); u8 csum_buf[BTRFS_CSUM_SIZE]; int ret; ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors); /* Sector not utilized, skip it. */ - if (!test_bit(sector_nr, &stripe->extent_sector_bitmap)) + if (!scrub_bitmap_test_bit_has_extent(stripe, sector_nr)) return; /* IO error, no need to check. */ - if (test_bit(sector_nr, &stripe->io_error_bitmap)) + if (scrub_bitmap_test_bit_io_error(stripe, sector_nr)) return; /* Metadata, verify the full tree block. */ - if (sector->is_metadata) { + if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) { /* * Check if the tree block crosses the stripe boundary. 
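The metadata check above hashes the first sector starting right after the embedded checksum (BTRFS_CSUM_SIZE bytes in), then the remaining sectors of the tree block in full, and compares the result with the stored value. The same walk with a toy byte-sum standing in for crypto_shash (illustrative only):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define CSUM_SIZE 4
	#define SECTORSIZE 64
	#define SECTORS_PER_TREE 2

	/* Toy stand-in for the csum hash: a running byte sum. */
	static uint32_t csum_update(uint32_t sum, const uint8_t *buf, size_t len)
	{
		for (size_t i = 0; i < len; i++)
			sum += buf[i];
		return sum;
	}

	static uint32_t csum_tree_block(const uint8_t *block)
	{
		/* Skip the embedded csum field in the first sector. */
		uint32_t sum = csum_update(0, block + CSUM_SIZE,
					   SECTORSIZE - CSUM_SIZE);

		for (int s = 1; s < SECTORS_PER_TREE; s++)
			sum = csum_update(sum, block + s * SECTORSIZE, SECTORSIZE);
		return sum;
	}

	int main(void)
	{
		uint8_t block[SECTORS_PER_TREE * SECTORSIZE];
		uint32_t sum, stored;

		memset(block, 0xab, sizeof(block));
		sum = csum_tree_block(block);
		memcpy(block, &sum, CSUM_SIZE);	/* stamp it like the header csum */

		memcpy(&stored, block, CSUM_SIZE);
		printf("csum %s\n", csum_tree_block(block) == stored ? "ok" : "mismatch");
		return 0;
	}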
If * crossed the boundary, we cannot verify it but only give a @@ -733,17 +829,17 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) * cases without csum, we have no other choice but to trust it. */ if (!sector->csum) { - clear_bit(sector_nr, &stripe->error_bitmap); + scrub_bitmap_clear_bit_error(stripe, sector_nr); return; } - ret = btrfs_check_sector_csum(fs_info, page, pgoff, csum_buf, sector->csum); + ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, sector->csum); if (ret < 0) { - set_bit(sector_nr, &stripe->csum_error_bitmap); - set_bit(sector_nr, &stripe->error_bitmap); + scrub_bitmap_set_bit_csum_error(stripe, sector_nr); + scrub_bitmap_set_bit_error(stripe, sector_nr); } else { - clear_bit(sector_nr, &stripe->csum_error_bitmap); - clear_bit(sector_nr, &stripe->error_bitmap); + scrub_bitmap_clear_bit_csum_error(stripe, sector_nr); + scrub_bitmap_clear_bit_error(stripe, sector_nr); } } @@ -756,7 +852,7 @@ static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long b for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) { scrub_verify_one_sector(stripe, sector_nr); - if (stripe->sectors[sector_nr].is_metadata) + if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) sector_nr += sectors_per_tree - 1; } } @@ -766,8 +862,7 @@ static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first int i; for (i = 0; i < stripe->nr_sectors; i++) { - if (scrub_stripe_get_page(stripe, i) == first_bvec->bv_page && - scrub_stripe_get_page_offset(stripe, i) == first_bvec->bv_offset) + if (scrub_stripe_get_kaddr(stripe, i) == bvec_virt(first_bvec)) break; } ASSERT(i < stripe->nr_sectors); @@ -795,13 +890,13 @@ static void scrub_repair_read_endio(struct btrfs_bio *bbio) bio_size += bvec->bv_len; if (bbio->bio.bi_status) { - bitmap_set(&stripe->io_error_bitmap, sector_nr, - bio_size >> fs_info->sectorsize_bits); - bitmap_set(&stripe->error_bitmap, sector_nr, - bio_size >> fs_info->sectorsize_bits); + scrub_bitmap_set_io_error(stripe, sector_nr, + bio_size >> fs_info->sectorsize_bits); + scrub_bitmap_set_error(stripe, sector_nr, + bio_size >> fs_info->sectorsize_bits); } else { - bitmap_clear(&stripe->io_error_bitmap, sector_nr, - bio_size >> fs_info->sectorsize_bits); + scrub_bitmap_clear_io_error(stripe, sector_nr, + bio_size >> fs_info->sectorsize_bits); } bio_put(&bbio->bio); if (atomic_dec_and_test(&stripe->pending_io)) @@ -814,27 +909,39 @@ static int calc_next_mirror(int mirror, int num_copies) return (mirror + 1 > num_copies) ? 1 : mirror + 1; } +static void scrub_bio_add_sector(struct btrfs_bio *bbio, struct scrub_stripe *stripe, + int sector_nr) +{ + void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); + int ret; + + ret = bio_add_page(&bbio->bio, virt_to_page(kaddr), bbio->fs_info->sectorsize, + offset_in_page(kaddr)); + /* + * Caller should ensure the bbio has enough size. + * And we cannot use __bio_add_page(), which doesn't do any merge. + * + * Meanwhile for scrub_submit_initial_read() we fully rely on the merge + * to create the minimal amount of bio vectors, for fs block size < page + * size cases. 
+ */ + ASSERT(ret == bbio->fs_info->sectorsize); +} + static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, int mirror, int blocksize, bool wait) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; - const unsigned long old_error_bitmap = stripe->error_bitmap; + const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); int i; ASSERT(stripe->mirror_num >= 1); ASSERT(atomic_read(&stripe->pending_io) == 0); for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) { - struct page *page; - int pgoff; - int ret; - - page = scrub_stripe_get_page(stripe, i); - pgoff = scrub_stripe_get_page_offset(stripe, i); - /* The current sector cannot be merged, submit the bio. */ - if (bbio && ((i > 0 && !test_bit(i - 1, &stripe->error_bitmap)) || + if (bbio && ((i > 0 && !test_bit(i - 1, &old_error_bitmap)) || bbio->bio.bi_iter.bi_size >= blocksize)) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); @@ -851,8 +958,7 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, (i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT; } - ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); - ASSERT(ret == fs_info->sectorsize); + scrub_bio_add_sector(bbio, stripe, i); } if (bbio) { ASSERT(bbio->bio.bi_iter.bi_size); @@ -864,12 +970,15 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, } static void scrub_stripe_report_errors(struct scrub_ctx *sctx, - struct scrub_stripe *stripe) + struct scrub_stripe *stripe, + const struct scrub_error_records *errors) { static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_device *dev = NULL; + const unsigned long extent_bitmap = scrub_bitmap_read_has_extent(stripe); + const unsigned long error_bitmap = scrub_bitmap_read_error(stripe); u64 physical = 0; int nr_data_sectors = 0; int nr_meta_sectors = 0; @@ -886,7 +995,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio() * thus no need for dev/physical, error reporting still needs dev and physical. */ - if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) { + if (!bitmap_empty(&errors->init_error_bitmap, stripe->nr_sectors)) { u64 mapped_len = fs_info->sectorsize; struct btrfs_io_context *bioc = NULL; int stripe_index = stripe->mirror_num - 1; @@ -909,10 +1018,10 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, } skip: - for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) { + for_each_set_bit(sector_nr, &extent_bitmap, stripe->nr_sectors) { bool repaired = false; - if (stripe->sectors[sector_nr].is_metadata) { + if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) { nr_meta_sectors++; } else { nr_data_sectors++; @@ -920,14 +1029,14 @@ skip: nr_nodatacsum_sectors++; } - if (test_bit(sector_nr, &stripe->init_error_bitmap) && - !test_bit(sector_nr, &stripe->error_bitmap)) { + if (test_bit(sector_nr, &errors->init_error_bitmap) && + !test_bit(sector_nr, &error_bitmap)) { nr_repaired_sectors++; repaired = true; } /* Good sector from the beginning, nothing need to be done. 
*/ - if (!test_bit(sector_nr, &stripe->init_error_bitmap)) + if (!test_bit(sector_nr, &errors->init_error_bitmap)) continue; /* @@ -960,31 +1069,46 @@ skip: stripe->logical, stripe->mirror_num); } - if (test_bit(sector_nr, &stripe->io_error_bitmap)) + if (scrub_bitmap_test_bit_io_error(stripe, sector_nr)) if (__ratelimit(&rs) && dev) scrub_print_common_warning("i/o error", dev, false, stripe->logical, physical); - if (test_bit(sector_nr, &stripe->csum_error_bitmap)) + if (scrub_bitmap_test_bit_csum_error(stripe, sector_nr)) if (__ratelimit(&rs) && dev) scrub_print_common_warning("checksum error", dev, false, stripe->logical, physical); - if (test_bit(sector_nr, &stripe->meta_error_bitmap)) + if (scrub_bitmap_test_bit_meta_error(stripe, sector_nr)) if (__ratelimit(&rs) && dev) scrub_print_common_warning("header error", dev, false, stripe->logical, physical); + if (scrub_bitmap_test_bit_meta_gen_error(stripe, sector_nr)) + if (__ratelimit(&rs) && dev) + scrub_print_common_warning("generation error", dev, false, + stripe->logical, physical); } + /* Update the device stats. */ + for (int i = 0; i < errors->nr_io_errors; i++) + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS); + for (int i = 0; i < errors->nr_csum_errors; i++) + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); + /* Generation mismatch error is based on each metadata, not each block. */ + for (int i = 0; i < errors->nr_meta_gen_errors; + i += (fs_info->nodesize >> fs_info->sectorsize_bits)) + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_GENERATION_ERRS); + spin_lock(&sctx->stat_lock); sctx->stat.data_extents_scrubbed += stripe->nr_data_extents; sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents; sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits; sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits; sctx->stat.no_csum += nr_nodatacsum_sectors; - sctx->stat.read_errors += stripe->init_nr_io_errors; - sctx->stat.csum_errors += stripe->init_nr_csum_errors; - sctx->stat.verify_errors += stripe->init_nr_meta_errors; + sctx->stat.read_errors += errors->nr_io_errors; + sctx->stat.csum_errors += errors->nr_csum_errors; + sctx->stat.verify_errors += errors->nr_meta_errors + + errors->nr_meta_gen_errors; sctx->stat.uncorrectable_errors += - bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors); + bitmap_weight(&error_bitmap, stripe->nr_sectors); sctx->stat.corrected_errors += nr_repaired_sectors; spin_unlock(&sctx->stat_lock); } @@ -1010,26 +1134,26 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work); struct scrub_ctx *sctx = stripe->sctx; struct btrfs_fs_info *fs_info = sctx->fs_info; + struct scrub_error_records errors = { 0 }; int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, stripe->bg->length); unsigned long repaired; + unsigned long error; int mirror; int i; ASSERT(stripe->mirror_num > 0); wait_scrub_stripe_io(stripe); - scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap); + scrub_verify_one_stripe(stripe, scrub_bitmap_read_has_extent(stripe)); /* Save the initial failed bitmap for later repair and report usage. 
*/ - stripe->init_error_bitmap = stripe->error_bitmap; - stripe->init_nr_io_errors = bitmap_weight(&stripe->io_error_bitmap, - stripe->nr_sectors); - stripe->init_nr_csum_errors = bitmap_weight(&stripe->csum_error_bitmap, - stripe->nr_sectors); - stripe->init_nr_meta_errors = bitmap_weight(&stripe->meta_error_bitmap, - stripe->nr_sectors); - - if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) + errors.init_error_bitmap = scrub_bitmap_read_error(stripe); + errors.nr_io_errors = scrub_bitmap_weight_io_error(stripe); + errors.nr_csum_errors = scrub_bitmap_weight_csum_error(stripe); + errors.nr_meta_errors = scrub_bitmap_weight_meta_error(stripe); + errors.nr_meta_gen_errors = scrub_bitmap_weight_meta_gen_error(stripe); + + if (bitmap_empty(&errors.init_error_bitmap, stripe->nr_sectors)) goto out; /* @@ -1041,13 +1165,13 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) for (mirror = calc_next_mirror(stripe->mirror_num, num_copies); mirror != stripe->mirror_num; mirror = calc_next_mirror(mirror, num_copies)) { - const unsigned long old_error_bitmap = stripe->error_bitmap; + const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); scrub_stripe_submit_repair_read(stripe, mirror, BTRFS_STRIPE_LEN, false); wait_scrub_stripe_io(stripe); scrub_verify_one_stripe(stripe, old_error_bitmap); - if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) + if (scrub_bitmap_empty_error(stripe)) goto out; } @@ -1065,21 +1189,22 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) for (i = 0, mirror = stripe->mirror_num; i < num_copies; i++, mirror = calc_next_mirror(mirror, num_copies)) { - const unsigned long old_error_bitmap = stripe->error_bitmap; + const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); scrub_stripe_submit_repair_read(stripe, mirror, fs_info->sectorsize, true); wait_scrub_stripe_io(stripe); scrub_verify_one_stripe(stripe, old_error_bitmap); - if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) + if (scrub_bitmap_empty_error(stripe)) goto out; } out: + error = scrub_bitmap_read_error(stripe); /* * Submit the repaired sectors. For zoned case, we cannot do repair * in-place, but queue the bg to be relocated. 
*/ - bitmap_andnot(&repaired, &stripe->init_error_bitmap, &stripe->error_bitmap, + bitmap_andnot(&repaired, &errors.init_error_bitmap, &error, stripe->nr_sectors); if (!sctx->readonly && !bitmap_empty(&repaired, stripe->nr_sectors)) { if (btrfs_is_zoned(fs_info)) { @@ -1090,7 +1215,7 @@ out: } } - scrub_stripe_report_errors(sctx, stripe); + scrub_stripe_report_errors(sctx, stripe, &errors); set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state); wake_up(&stripe->repair_wait); } @@ -1110,10 +1235,10 @@ static void scrub_read_endio(struct btrfs_bio *bbio) num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits; if (bbio->bio.bi_status) { - bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors); - bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors); + scrub_bitmap_set_io_error(stripe, sector_nr, num_sectors); + scrub_bitmap_set_error(stripe, sector_nr, num_sectors); } else { - bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors); + scrub_bitmap_clear_io_error(stripe, sector_nr, num_sectors); } bio_put(&bbio->bio); if (atomic_dec_and_test(&stripe->pending_io)) { @@ -1142,6 +1267,9 @@ static void scrub_write_endio(struct btrfs_bio *bbio) bitmap_set(&stripe->write_error_bitmap, sector_nr, bio_size >> fs_info->sectorsize_bits); spin_unlock_irqrestore(&stripe->write_error_lock, flags); + for (int i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++) + btrfs_dev_stat_inc_and_print(stripe->dev, + BTRFS_DEV_STAT_WRITE_ERRS); } bio_put(&bbio->bio); @@ -1199,12 +1327,8 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str int sector_nr; for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) { - struct page *page = scrub_stripe_get_page(stripe, sector_nr); - unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr); - int ret; - /* We should only writeback sectors covered by an extent. */ - ASSERT(test_bit(sector_nr, &stripe->extent_sector_bitmap)); + ASSERT(scrub_bitmap_test_bit_has_extent(stripe, sector_nr)); /* Cannot merge with previous sector, submit the current one. 
*/ if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) { @@ -1218,8 +1342,7 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str (sector_nr << fs_info->sectorsize_bits)) >> SECTOR_SHIFT; } - ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); - ASSERT(ret == fs_info->sectorsize); + scrub_bio_add_sector(bbio, stripe, sector_nr); } if (bbio) scrub_submit_write_bio(sctx, stripe, bbio, dev_replace); @@ -1493,9 +1616,9 @@ static void fill_one_extent_info(struct btrfs_fs_info *fs_info, struct scrub_sector_verification *sector = &stripe->sectors[nr_sector]; - set_bit(nr_sector, &stripe->extent_sector_bitmap); + scrub_bitmap_set_bit_has_extent(stripe, nr_sector); if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - sector->is_metadata = true; + scrub_bitmap_set_bit_is_metadata(stripe, nr_sector); sector->generation = extent_gen; } } @@ -1503,15 +1626,8 @@ static void fill_one_extent_info(struct btrfs_fs_info *fs_info, static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe) { - stripe->extent_sector_bitmap = 0; - stripe->init_error_bitmap = 0; - stripe->init_nr_io_errors = 0; - stripe->init_nr_csum_errors = 0; - stripe->init_nr_meta_errors = 0; - stripe->error_bitmap = 0; - stripe->io_error_bitmap = 0; - stripe->csum_error_bitmap = 0; - stripe->meta_error_bitmap = 0; + ASSERT(stripe->nr_sectors); + bitmap_zero(stripe->bitmaps, scrub_bitmap_nr_last * stripe->nr_sectors); } /* @@ -1541,8 +1657,8 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, u64 extent_gen; int ret; - if (unlikely(!extent_root)) { - btrfs_err(fs_info, "no valid extent root for scrub"); + if (unlikely(!extent_root || !csum_root)) { + btrfs_err(fs_info, "no valid extent or csum root for scrub"); return -EUCLEAN; } memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * @@ -1646,7 +1762,6 @@ static void scrub_reset_stripe(struct scrub_stripe *stripe) stripe->state = 0; for (int i = 0; i < stripe->nr_sectors; i++) { - stripe->sectors[i].is_metadata = false; stripe->sectors[i].csum = NULL; stripe->sectors[i].generation = 0; } @@ -1665,24 +1780,21 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; + const unsigned long has_extent = scrub_bitmap_read_has_extent(stripe); u64 stripe_len = BTRFS_STRIPE_LEN; int mirror = stripe->mirror_num; int i; atomic_inc(&stripe->pending_io); - for_each_set_bit(i, &stripe->extent_sector_bitmap, stripe->nr_sectors) { - struct page *page = scrub_stripe_get_page(stripe, i); - unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i); - + for_each_set_bit(i, &has_extent, stripe->nr_sectors) { /* We're beyond the chunk boundary, no need to read anymore. */ if (i >= nr_sectors) break; /* The current sector cannot be merged, submit the bio. */ if (bbio && - ((i > 0 && - !test_bit(i - 1, &stripe->extent_sector_bitmap)) || + ((i > 0 && !test_bit(i - 1, &has_extent)) || bbio->bio.bi_iter.bi_size >= stripe_len)) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); @@ -1716,8 +1828,8 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) * the extent tree, then it's a preallocated * extent and not an error. 
*/ - set_bit(i, &stripe->io_error_bitmap); - set_bit(i, &stripe->error_bitmap); + scrub_bitmap_set_bit_io_error(stripe, i); + scrub_bitmap_set_bit_error(stripe, i); } continue; } @@ -1727,7 +1839,7 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; } - __bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); + scrub_bio_add_sector(bbio, stripe, i); } if (bbio) { @@ -1765,15 +1877,8 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; /* Read the whole range inside the chunk boundary. */ - for (unsigned int cur = 0; cur < nr_sectors; cur++) { - struct page *page = scrub_stripe_get_page(stripe, cur); - unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur); - int ret; - - ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); - /* We should have allocated enough bio vectors. */ - ASSERT(ret == fs_info->sectorsize); - } + for (unsigned int cur = 0; cur < nr_sectors; cur++) + scrub_bio_add_sector(bbio, stripe, cur); atomic_inc(&stripe->pending_io); /* @@ -1794,10 +1899,11 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, static bool stripe_has_metadata_error(struct scrub_stripe *stripe) { + const unsigned long error = scrub_bitmap_read_error(stripe); int i; - for_each_set_bit(i, &stripe->error_bitmap, stripe->nr_sectors) { - if (stripe->sectors[i].is_metadata) { + for_each_set_bit(i, &error, stripe->nr_sectors) { + if (scrub_bitmap_test_bit_is_metadata(stripe, i)) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; btrfs_err(fs_info, @@ -1872,13 +1978,16 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) } for (int i = 0; i < nr_stripes; i++) { unsigned long good; + unsigned long has_extent; + unsigned long error; stripe = &sctx->stripes[i]; ASSERT(stripe->dev == fs_info->dev_replace.srcdev); - bitmap_andnot(&good, &stripe->extent_sector_bitmap, - &stripe->error_bitmap, stripe->nr_sectors); + has_extent = scrub_bitmap_read_has_extent(stripe); + error = scrub_bitmap_read_error(stripe); + bitmap_andnot(&good, &has_extent, &error, stripe->nr_sectors); scrub_write_sectors(sctx, stripe, good, true); } } @@ -2012,7 +2121,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, /* Check if all data stripes are empty. */ for (int i = 0; i < data_stripes; i++) { stripe = &sctx->raid56_data_stripes[i]; - if (!bitmap_empty(&stripe->extent_sector_bitmap, stripe->nr_sectors)) { + if (!scrub_bitmap_empty_has_extent(stripe)) { all_empty = false; break; } @@ -2044,15 +2153,18 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, */ for (int i = 0; i < data_stripes; i++) { unsigned long error; + unsigned long has_extent; stripe = &sctx->raid56_data_stripes[i]; + error = scrub_bitmap_read_error(stripe); + has_extent = scrub_bitmap_read_has_extent(stripe); + /* * We should only check the errors where there is an extent, * as we may hit an empty data stripe while it's missing.
*/ - bitmap_and(&error, &stripe->error_bitmap, - &stripe->extent_sector_bitmap, stripe->nr_sectors); + bitmap_and(&error, &error, &has_extent, stripe->nr_sectors); if (!bitmap_empty(&error, stripe->nr_sectors)) { btrfs_err(fs_info, "unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", @@ -2061,8 +2173,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, ret = -EIO; goto out; } - bitmap_or(&extent_bitmap, &extent_bitmap, - &stripe->extent_sector_bitmap, stripe->nr_sectors); + bitmap_or(&extent_bitmap, &extent_bitmap, &has_extent, + stripe->nr_sectors); } /* Now we can check and regenerate the P/Q stripe. */ @@ -2770,17 +2882,11 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, struct page *page, u64 physical, u64 generation) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct bio_vec bvec; - struct bio bio; struct btrfs_super_block *sb = page_address(page); int ret; - bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ); - bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT; - __bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0); - ret = submit_bio_wait(&bio); - bio_uninit(&bio); - + ret = bdev_rw_virt(dev->bdev, physical >> SECTOR_SHIFT, sb, + BTRFS_SUPER_INFO_SIZE, REQ_OP_READ); if (ret < 0) return ret; ret = btrfs_check_super_csum(fs_info, sb); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 0c8c58c4f29b..2891ec4056c6 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -383,11 +383,11 @@ static void inconsistent_snapshot_error(struct send_ctx *sctx, result_string = "updated"; break; case BTRFS_COMPARE_TREE_SAME: - ASSERT(0); + DEBUG_WARN("no change between trees"); result_string = "unchanged"; break; default: - ASSERT(0); + DEBUG_WARN("unexpected comparison result %d", result); result_string = "unexpected"; } @@ -816,11 +816,8 @@ static int send_cmd(struct send_ctx *sctx) static int send_rename(struct send_ctx *sctx, struct fs_path *from, struct fs_path *to) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret; - btrfs_debug(fs_info, "send_rename %s -> %s", from->start, to->start); - ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME); if (ret < 0) return ret; @@ -840,11 +837,8 @@ tlv_put_failure: static int send_link(struct send_ctx *sctx, struct fs_path *path, struct fs_path *lnk) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret; - btrfs_debug(fs_info, "send_link %s -> %s", path->start, lnk->start); - ret = begin_cmd(sctx, BTRFS_SEND_C_LINK); if (ret < 0) return ret; @@ -863,11 +857,8 @@ tlv_put_failure: */ static int send_unlink(struct send_ctx *sctx, struct fs_path *path) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret; - btrfs_debug(fs_info, "send_unlink %s", path->start); - ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK); if (ret < 0) return ret; @@ -885,11 +876,8 @@ tlv_put_failure: */ static int send_rmdir(struct send_ctx *sctx, struct fs_path *path) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret; - btrfs_debug(fs_info, "send_rmdir %s", path->start); - ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR); if (ret < 0) return ret; @@ -1573,7 +1561,6 @@ static int find_extent_clone(struct send_ctx *sctx, struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret; int extent_type; - u64 logical; u64 disk_byte; u64 num_bytes; struct btrfs_file_extent_item *fi; @@ -1604,7 +1591,6 @@ static int find_extent_clone(struct send_ctx *sctx, compressed = btrfs_file_extent_compression(eb, fi); num_bytes = btrfs_file_extent_num_bytes(eb, fi); - logical = 
disk_byte + btrfs_file_extent_offset(eb, fi); /* * Setup the clone roots. @@ -1686,14 +1672,8 @@ static int find_extent_clone(struct send_ctx *sctx, } up_read(&fs_info->commit_root_sem); - btrfs_debug(fs_info, - "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu", - data_offset, ino, num_bytes, logical); - - if (!backref_ctx.found) { - btrfs_debug(fs_info, "no clones found"); + if (!backref_ctx.found) return -ENOENT; - } cur_clone_root = NULL; for (i = 0; i < sctx->clone_roots_cnt; i++) { @@ -2631,12 +2611,9 @@ static void free_path_for_command(const struct send_ctx *sctx, struct fs_path *p static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; - btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size); - p = get_path_for_command(sctx, ino, gen); if (IS_ERR(p)) return PTR_ERR(p); @@ -2658,12 +2635,9 @@ out: static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; - btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode); - p = get_path_for_command(sctx, ino, gen); if (IS_ERR(p)) return PTR_ERR(p); @@ -2685,15 +2659,12 @@ out: static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; if (sctx->proto < 2) return 0; - btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr); - p = get_path_for_command(sctx, ino, gen); if (IS_ERR(p)) return PTR_ERR(p); @@ -2715,13 +2686,9 @@ out: static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; - btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu", - ino, uid, gid); - p = get_path_for_command(sctx, ino, gen); if (IS_ERR(p)) return PTR_ERR(p); @@ -2744,7 +2711,6 @@ out: static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p = NULL; struct btrfs_inode_item *ii; @@ -2753,8 +2719,6 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) struct btrfs_key key; int slot; - btrfs_debug(fs_info, "send_utimes %llu", ino); - p = get_path_for_command(sctx, ino, gen); if (IS_ERR(p)) return PTR_ERR(p); @@ -2861,7 +2825,6 @@ static int trim_dir_utimes_cache(struct send_ctx *sctx) */ static int send_create_inode(struct send_ctx *sctx, u64 ino) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; int cmd; @@ -2870,8 +2833,6 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) u64 mode; u64 rdev; - btrfs_debug(fs_info, "send_create_inode %llu", ino); - p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3098,7 +3059,7 @@ static void __free_recorded_refs(struct list_head *head) struct recorded_ref *cur; while (!list_empty(head)) { - cur = list_entry(head->next, struct recorded_ref, list); + cur = list_first_entry(head, struct recorded_ref, list); recorded_ref_free(cur); } } @@ -4224,8 +4185,6 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) bool orphanized_dir = false; bool orphanized_ancestor = false; - btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino); - /* * This should never happen as the root dir always has the same ref * which is always '..' 
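The send.c hunks around this point repeat one mechanical cleanup: open-coded list_entry(head->next, type, member) becomes list_first_entry(head, type, member), as in __free_recorded_refs() above and the deleted_refs hunk that follows. A minimal standalone sketch of the drain-and-free idiom these call sites use; demo_ref and demo_free_all are hypothetical stand-ins for struct recorded_ref and its helpers:

#include <linux/list.h>
#include <linux/slab.h>

struct demo_ref {
	struct list_head list;	/* linked into the owner's ref list */
};

static void demo_free_all(struct list_head *head)
{
	struct demo_ref *cur;

	/* Pop entries off the front until the list is empty. */
	while (!list_empty(head)) {
		/* Equivalent to list_entry(head->next, struct demo_ref, list). */
		cur = list_first_entry(head, struct demo_ref, list);
		list_del(&cur->list);
		kfree(cur);
	}
}

list_first_entry() is defined in <linux/list.h> as list_entry() applied to head->next, so these conversions are behavior-neutral; they only make the first-element access explicit.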
@@ -4560,8 +4519,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) /* * We have a moved dir. Add the old parent to check_dirs */ - cur = list_entry(sctx->deleted_refs.next, struct recorded_ref, - list); + cur = list_first_entry(&sctx->deleted_refs, struct recorded_ref, list); ret = dup_ref(cur, &check_dirs); if (ret < 0) goto out; @@ -5263,10 +5221,9 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; - struct folio *folio; - pgoff_t index = offset >> PAGE_SHIFT; - pgoff_t last_index; - unsigned pg_offset = offset_in_page(offset); + u64 cur = offset; + const u64 end = offset + len; + const pgoff_t last_index = ((end - 1) >> PAGE_SHIFT); struct address_space *mapping = sctx->cur_inode->i_mapping; int ret; @@ -5274,13 +5231,12 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) if (ret) return ret; - last_index = (offset + len - 1) >> PAGE_SHIFT; + while (cur < end) { + pgoff_t index = (cur >> PAGE_SHIFT); + unsigned int cur_len; + unsigned int pg_offset; + struct folio *folio; - while (index <= last_index) { - unsigned cur_len = min_t(unsigned, len, - PAGE_SIZE - pg_offset); - -again: folio = filemap_lock_folio(mapping, index); if (IS_ERR(folio)) { page_cache_sync_readahead(mapping, @@ -5293,8 +5249,8 @@ again: break; } } - - WARN_ON(folio_order(folio)); + pg_offset = offset_in_folio(folio, cur); + cur_len = min_t(unsigned int, end - cur, folio_size(folio) - pg_offset); if (folio_test_readahead(folio)) page_cache_async_readahead(mapping, &sctx->ra, NULL, folio, @@ -5316,7 +5272,7 @@ again: if (folio->mapping != mapping) { folio_unlock(folio); folio_put(folio); - goto again; + continue; } } @@ -5324,9 +5280,7 @@ again: pg_offset, cur_len); folio_unlock(folio); folio_put(folio); - index++; - pg_offset = 0; - len -= cur_len; + cur += cur_len; sctx->send_size += cur_len; } @@ -5339,12 +5293,9 @@ again: */ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; - btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len); - p = get_cur_inode_path(sctx); if (IS_ERR(p)) return PTR_ERR(p); @@ -5377,11 +5328,6 @@ static int send_clone(struct send_ctx *sctx, struct fs_path *cur_inode_path; u64 gen; - btrfs_debug(sctx->send_root->fs_info, - "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu", - offset, len, btrfs_root_id(clone_root->root), - clone_root->ino, clone_root->offset); - cur_inode_path = get_cur_inode_path(sctx); if (IS_ERR(cur_inode_path)) return PTR_ERR(cur_inode_path); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index ff089e3e4103..d9087aa81b21 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -50,11 +50,11 @@ * num_bytes we want to reserve. 
* * ->reserve - * space_info->bytes_may_reserve += num_bytes + * space_info->bytes_may_use += num_bytes * * ->extent allocation * Call btrfs_add_reserved_bytes() which does - * space_info->bytes_may_reserve -= num_bytes + * space_info->bytes_may_use -= num_bytes * space_info->bytes_reserved += extent_bytes * * ->insert reference @@ -234,19 +234,11 @@ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, WRITE_ONCE(space_info->chunk_size, chunk_size); } -static int create_space_info(struct btrfs_fs_info *info, u64 flags) +static void init_space_info(struct btrfs_fs_info *info, + struct btrfs_space_info *space_info, u64 flags) { - - struct btrfs_space_info *space_info; - int i; - int ret; - - space_info = kzalloc(sizeof(*space_info), GFP_NOFS); - if (!space_info) - return -ENOMEM; - space_info->fs_info = info; - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++) INIT_LIST_HEAD(&space_info->block_groups[i]); init_rwsem(&space_info->groups_sem); spin_lock_init(&space_info->lock); @@ -257,9 +249,64 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) INIT_LIST_HEAD(&space_info->priority_tickets); space_info->clamp = 1; btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags)); + space_info->subgroup_id = BTRFS_SUB_GROUP_PRIMARY; if (btrfs_is_zoned(info)) space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH; +} + +static int create_space_info_sub_group(struct btrfs_space_info *parent, u64 flags, + enum btrfs_space_info_sub_group id, int index) +{ + struct btrfs_fs_info *fs_info = parent->fs_info; + struct btrfs_space_info *sub_group; + int ret; + + ASSERT(parent->subgroup_id == BTRFS_SUB_GROUP_PRIMARY); + ASSERT(id != BTRFS_SUB_GROUP_PRIMARY); + + sub_group = kzalloc(sizeof(*sub_group), GFP_NOFS); + if (!sub_group) + return -ENOMEM; + + init_space_info(fs_info, sub_group, flags); + parent->sub_group[index] = sub_group; + sub_group->parent = parent; + sub_group->subgroup_id = id; + + ret = btrfs_sysfs_add_space_info_type(fs_info, sub_group); + if (ret) { + kfree(sub_group); + parent->sub_group[index] = NULL; + } + return ret; +} + +static int create_space_info(struct btrfs_fs_info *info, u64 flags) +{ + + struct btrfs_space_info *space_info; + int ret = 0; + + space_info = kzalloc(sizeof(*space_info), GFP_NOFS); + if (!space_info) + return -ENOMEM; + + init_space_info(info, space_info, flags); + + if (btrfs_is_zoned(info)) { + if (flags & BTRFS_BLOCK_GROUP_DATA) + ret = create_space_info_sub_group(space_info, flags, + BTRFS_SUB_GROUP_DATA_RELOC, + 0); + else if (flags & BTRFS_BLOCK_GROUP_METADATA) + ret = create_space_info_sub_group(space_info, flags, + BTRFS_SUB_GROUP_TREELOG, + 0); + + if (ret) + return ret; + } ret = btrfs_sysfs_add_space_info_type(info, space_info); if (ret) @@ -312,31 +359,29 @@ out: void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, struct btrfs_block_group *block_group) { - struct btrfs_space_info *found; + struct btrfs_space_info *space_info = block_group->space_info; int factor, index; factor = btrfs_bg_type_to_factor(block_group->flags); - found = btrfs_find_space_info(info, block_group->flags); - ASSERT(found); - spin_lock(&found->lock); - found->total_bytes += block_group->length; - found->disk_total += block_group->length * factor; - found->bytes_used += block_group->used; - found->disk_used += block_group->used * factor; - found->bytes_readonly += block_group->bytes_super; - btrfs_space_info_update_bytes_zone_unusable(found, 
block_group->zone_unusable); + spin_lock(&space_info->lock); + space_info->total_bytes += block_group->length; + space_info->disk_total += block_group->length * factor; + space_info->bytes_used += block_group->used; + space_info->disk_used += block_group->used * factor; + space_info->bytes_readonly += block_group->bytes_super; + btrfs_space_info_update_bytes_zone_unusable(space_info, block_group->zone_unusable); if (block_group->length > 0) - found->full = 0; - btrfs_try_granting_tickets(info, found); - spin_unlock(&found->lock); + space_info->full = 0; + btrfs_try_granting_tickets(info, space_info); + spin_unlock(&space_info->lock); - block_group->space_info = found; + block_group->space_info = space_info; index = btrfs_bg_flags_to_raid_index(block_group->flags); - down_write(&found->groups_sem); - list_add_tail(&block_group->list, &found->block_groups[index]); - up_write(&found->groups_sem); + down_write(&space_info->groups_sem); + list_add_tail(&block_group->list, &space_info->block_groups[index]); + up_write(&space_info->groups_sem); } struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, @@ -556,8 +601,9 @@ static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info, lockdep_assert_held(&info->lock); /* The free space could be negative in case of overcommit */ - btrfs_info(fs_info, "space_info %s has %lld free, is %sfull", - flag_str, + btrfs_info(fs_info, + "space_info %s (sub-group id %d) has %lld free, is %sfull", + flag_str, info->subgroup_id, (s64)(info->total_bytes - btrfs_space_info_used(info, true)), info->full ? "" : "not "); btrfs_info(fs_info, @@ -812,7 +858,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, ret = PTR_ERR(trans); break; } - ret = btrfs_chunk_alloc(trans, + ret = btrfs_chunk_alloc(trans, space_info, btrfs_get_alloc_profile(fs_info, space_info->flags), (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); @@ -1083,23 +1129,15 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, return (tickets_id != space_info->tickets_id); } -/* - * This is for normal flushers, we can wait all goddamned day if we want to. We - * will loop and continuously try to flush as long as we are making progress. We - * count progress as clearing off tickets each time we have to loop. - */ -static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info) { - struct btrfs_fs_info *fs_info; - struct btrfs_space_info *space_info; + struct btrfs_fs_info *fs_info = space_info->fs_info; u64 to_reclaim; enum btrfs_flush_state flush_state; int commit_cycles = 0; u64 last_tickets_id; enum btrfs_flush_state final_state; - fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); - space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); if (btrfs_is_zoned(fs_info)) final_state = RESET_ZONES; else @@ -1174,6 +1212,25 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } /* + * This is for normal flushers; it can wait as much time as needed. We will + * loop and continuously try to flush as long as we are making progress. We + * count progress as clearing off tickets each time we have to loop.
+ */ +static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + + fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + do_async_reclaim_metadata_space(space_info); + for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) { + if (space_info->sub_group[i]) + do_async_reclaim_metadata_space(space_info->sub_group[i]); + } +} + +/* * This handles pre-flushing of metadata space before we get to the point that * we need to start blocking threads on tickets. The logic here is different * from the other flush paths because it doesn't rely on tickets to tell us how @@ -1318,16 +1375,12 @@ static const enum btrfs_flush_state data_flush_states[] = { ALLOC_CHUNK_FORCE, }; -static void btrfs_async_reclaim_data_space(struct work_struct *work) +static void do_async_reclaim_data_space(struct btrfs_space_info *space_info) { - struct btrfs_fs_info *fs_info; - struct btrfs_space_info *space_info; + struct btrfs_fs_info *fs_info = space_info->fs_info; u64 last_tickets_id; enum btrfs_flush_state flush_state = 0; - fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work); - space_info = fs_info->data_sinfo; - spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; @@ -1395,6 +1448,19 @@ aborted_fs: spin_unlock(&space_info->lock); } +static void btrfs_async_reclaim_data_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + + fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work); + space_info = fs_info->data_sinfo; + do_async_reclaim_data_space(space_info); + for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) + if (space_info->sub_group[i]) + do_async_reclaim_data_space(space_info->sub_group[i]); +} + void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) { INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space); @@ -1836,10 +1902,10 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * This will reserve bytes from the data space info. If there is not enough * space then we will attempt to flush space as specified by flush. 
*/ -int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, +int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { - struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; + struct btrfs_fs_info *fs_info = space_info->fs_info; int ret; ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA || @@ -1847,12 +1913,12 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, flush == BTRFS_RESERVE_NO_FLUSH); ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); - ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush); + ret = __reserve_bytes(fs_info, space_info, bytes, flush); if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, "space_info:enospc", - data_sinfo->flags, bytes, 1); + space_info->flags, bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0); + btrfs_dump_space_info(fs_info, space_info, bytes, 0); } return ret; } diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index a96efdb5e681..92b7f5e2b850 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -98,8 +98,18 @@ enum btrfs_flush_state { RESET_ZONES = 12, }; +enum btrfs_space_info_sub_group { + BTRFS_SUB_GROUP_PRIMARY, + BTRFS_SUB_GROUP_DATA_RELOC, + BTRFS_SUB_GROUP_TREELOG, +}; + +#define BTRFS_SPACE_INFO_SUB_GROUP_MAX 1 struct btrfs_space_info { struct btrfs_fs_info *fs_info; + struct btrfs_space_info *parent; + struct btrfs_space_info *sub_group[BTRFS_SPACE_INFO_SUB_GROUP_MAX]; + int subgroup_id; spinlock_t lock; u64 total_bytes; /* total bytes in the space, @@ -288,7 +298,7 @@ static inline void btrfs_space_info_free_bytes_may_use( btrfs_try_granting_tickets(space_info->fs_info, space_info); spin_unlock(&space_info->lock); } -int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, +int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush); void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info); void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 11dbd7be6a3b..d4f019233493 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -69,7 +69,8 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct btrfs_subpage *subpage; /* For metadata we don't support large folio yet. */ - ASSERT(!folio_test_large(folio)); + if (type == BTRFS_SUBPAGE_METADATA) + ASSERT(!folio_test_large(folio)); /* * We have cases like a dummy extent buffer page, which is not mapped @@ -181,9 +182,6 @@ void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio * static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - /* For subpage support, the folio must be single page. 
*/ - ASSERT(folio_order(folio) == 0); - /* Basic checks */ ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && @@ -204,7 +202,7 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, btrfs_blocks_per_folio(fs_info, folio); \ \ btrfs_subpage_assert(fs_info, folio, start, len); \ - __start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ + __start_bit = offset_in_folio(folio, start) >> fs_info->sectorsize_bits; \ __start_bit += blocks_per_folio * btrfs_bitmap_nr_##name; \ __start_bit; \ }) @@ -666,7 +664,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, btrfs_blocks_per_folio(fs_info, folio); \ const struct btrfs_subpage *subpage = folio_get_private(folio); \ \ - ASSERT(blocks_per_folio < BITS_PER_LONG); \ + ASSERT(blocks_per_folio <= BITS_PER_LONG); \ *dst = bitmap_read(subpage->bitmaps, \ blocks_per_folio * btrfs_bitmap_nr_##name, \ blocks_per_folio); \ diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7121d8c7a318..a0c65adce1ab 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -125,7 +125,6 @@ enum { /* Rescue options */ Opt_rescue, Opt_usebackuproot, - Opt_nologreplay, /* Debugging options */ Opt_enospc_debug, @@ -246,8 +245,6 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = { /* Rescue options. */ fsparam_enum("rescue", Opt_rescue, btrfs_parameter_rescue), - /* Deprecated, with alias rescue=nologreplay */ - __fsparam(NULL, "nologreplay", Opt_nologreplay, fs_param_deprecated, NULL), /* Deprecated, with alias rescue=usebackuproot */ __fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL), /* For compatibility only, alias for "rescue=nologreplay". */ @@ -449,11 +446,6 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) else btrfs_clear_opt(ctx->mount_opt, NOTREELOG); break; - case Opt_nologreplay: - btrfs_warn(NULL, - "'nologreplay' is deprecated, use 'rescue=nologreplay' instead"); - btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); - break; case Opt_norecovery: btrfs_info(NULL, "'norecovery' is for compatibility only, recommended to use 'rescue=nologreplay'"); @@ -569,6 +561,10 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) break; case Opt_commit_interval: ctx->commit_interval = result.uint_32; + if (ctx->commit_interval > BTRFS_WARNING_COMMIT_INTERVAL) { + btrfs_warn(NULL, "excessive commit interval %u, use with care", + ctx->commit_interval); + } if (ctx->commit_interval == 0) ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; break; @@ -1148,11 +1144,11 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) /* * subvolumes are identified by ino 256 */ -static inline int is_subvolume_inode(struct inode *inode) +static inline bool is_subvolume_inode(struct inode *inode) { if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) - return 1; - return 0; + return true; + return false; } static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, @@ -2292,7 +2288,7 @@ static int check_dev_super(struct btrfs_device *dev) return 0; /* Only need to check the primary super block. 
*/ - sb = btrfs_read_dev_one_super(dev->bdev, 0, true); + sb = btrfs_read_disk_super(dev->bdev, 0, true); if (IS_ERR(sb)) return PTR_ERR(sb); @@ -2525,8 +2521,8 @@ static const struct init_sequence mod_init_seq[] = { .init_func = btrfs_free_space_init, .exit_func = btrfs_free_space_exit, }, { - .init_func = extent_state_init_cachep, - .exit_func = extent_state_free_cachep, + .init_func = btrfs_extent_state_init_cachep, + .exit_func = btrfs_extent_state_free_cachep, }, { .init_func = extent_buffer_init_cachep, .exit_func = extent_buffer_free_cachep, @@ -2534,8 +2530,8 @@ static const struct init_sequence mod_init_seq[] = { .init_func = btrfs_bioset_init, .exit_func = btrfs_bioset_exit, }, { - .init_func = extent_map_init, - .exit_func = extent_map_exit, + .init_func = btrfs_extent_map_init, + .exit_func = btrfs_extent_map_exit, #ifdef CONFIG_BTRFS_EXPERIMENTAL }, { .init_func = btrfs_read_policy_init, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index b9af74498b0c..5d93d9dd2c12 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1930,16 +1930,35 @@ void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info) kobject_put(&space_info->kobj); } -static const char *alloc_name(u64 flags) +static const char *alloc_name(struct btrfs_space_info *space_info) { + u64 flags = space_info->flags; + switch (flags) { case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: return "mixed"; case BTRFS_BLOCK_GROUP_METADATA: - return "metadata"; + switch (space_info->subgroup_id) { + case BTRFS_SUB_GROUP_PRIMARY: + return "metadata"; + case BTRFS_SUB_GROUP_TREELOG: + return "metadata-treelog"; + default: + WARN_ON_ONCE(1); + return "metadata (unknown sub-group)"; + } case BTRFS_BLOCK_GROUP_DATA: - return "data"; + switch (space_info->subgroup_id) { + case BTRFS_SUB_GROUP_PRIMARY: + return "data"; + case BTRFS_SUB_GROUP_DATA_RELOC: + return "data-reloc"; + default: + WARN_ON_ONCE(1); + return "data (unknown sub-group)"; + } case BTRFS_BLOCK_GROUP_SYSTEM: + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY); return "system"; default: WARN_ON(1); @@ -1958,7 +1977,7 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, fs_info->space_info_kobj, "%s", - alloc_name(space_info->flags)); + alloc_name(space_info)); if (ret) { kobject_put(&space_info->kobj); return ret; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 5eff8d7d2360..b576897d71cc 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -102,7 +102,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info) if (!dev) return ERR_PTR(-ENOMEM); - extent_io_tree_init(fs_info, &dev->alloc_state, 0); + btrfs_extent_io_tree_init(fs_info, &dev->alloc_state, 0); INIT_LIST_HEAD(&dev->dev_list); list_add(&dev->dev_list, &fs_info->fs_devices->devices); @@ -111,7 +111,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info) static void btrfs_free_dummy_device(struct btrfs_device *dev) { - extent_io_tree_release(&dev->alloc_state); + btrfs_extent_io_tree_release(&dev->alloc_state); kfree(dev); } @@ -157,9 +157,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) { - struct radix_tree_iter iter; - void **slot; struct btrfs_device *dev, *tmp; + struct extent_buffer *eb; + unsigned long index; if (!fs_info) return; @@ -169,25 +169,13 @@ void btrfs_free_dummy_fs_info(struct 
btrfs_fs_info *fs_info) test_mnt->mnt_sb->s_fs_info = NULL; - spin_lock(&fs_info->buffer_lock); - radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { - struct extent_buffer *eb; - - eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock); - if (!eb) - continue; - /* Shouldn't happen but that kind of thinking creates CVE's */ - if (radix_tree_exception(eb)) { - if (radix_tree_deref_retry(eb)) - slot = radix_tree_iter_retry(&iter); - continue; - } - slot = radix_tree_iter_resume(slot, &iter); - spin_unlock(&fs_info->buffer_lock); - free_extent_buffer_stale(eb); - spin_lock(&fs_info->buffer_lock); + xa_lock_irq(&fs_info->buffer_tree); + xa_for_each(&fs_info->buffer_tree, index, eb) { + xa_unlock_irq(&fs_info->buffer_tree); + free_extent_buffer(eb); + xa_lock_irq(&fs_info->buffer_tree); } - spin_unlock(&fs_info->buffer_lock); + xa_unlock_irq(&fs_info->buffer_tree); btrfs_mapping_tree_free(fs_info); list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices, diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 74aca7180a5a..00da54f0164c 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -14,9 +14,9 @@ #include "../disk-io.h" #include "../btrfs_inode.h" -#define PROCESS_UNLOCK (1 << 0) -#define PROCESS_RELEASE (1 << 1) -#define PROCESS_TEST_LOCKED (1 << 2) +#define PROCESS_UNLOCK (1U << 0) +#define PROCESS_RELEASE (1U << 1) +#define PROCESS_TEST_LOCKED (1U << 2) static noinline int process_page_range(struct inode *inode, u64 start, u64 end, unsigned long flags) @@ -74,7 +74,6 @@ static void extent_flag_to_str(const struct extent_state *state, char *dest) dest[0] = 0; PRINT_ONE_FLAG(state, dest, cur, DIRTY); - PRINT_ONE_FLAG(state, dest, cur, UPTODATE); PRINT_ONE_FLAG(state, dest, cur, LOCKED); PRINT_ONE_FLAG(state, dest, cur, NEW); PRINT_ONE_FLAG(state, dest, cur, DELALLOC); @@ -150,7 +149,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) * Passing NULL as we don't have fs_info but tracepoints are not used * at this point */ - extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST); + btrfs_extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST); /* * First go through and create and mark all of our pages dirty, we pin @@ -177,7 +176,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) * |--- delalloc ---| * |--- search ---| */ - set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL); + btrfs_set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL); start = 0; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, @@ -191,7 +190,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) sectorsize - 1, start, end); goto out_bits; } - unlock_extent(tmp, start, end, NULL); + btrfs_unlock_extent(tmp, start, end, NULL); unlock_page(locked_page); put_page(locked_page); @@ -208,7 +207,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) test_err("couldn't find the locked page"); goto out_bits; } - set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL); + btrfs_set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, @@ -227,7 +226,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) test_err("there were unlocked pages in the range"); goto out_bits; } - unlock_extent(tmp, start, end, NULL); + btrfs_unlock_extent(tmp, start, end, NULL); /* locked_page was 
unlocked above */ put_page(locked_page); @@ -263,7 +262,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) * * We are re-using our test_start from above since it works out well. */ - set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL); + btrfs_set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, @@ -282,7 +281,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) test_err("pages in range were not all locked"); goto out_bits; } - unlock_extent(tmp, start, end, NULL); + btrfs_unlock_extent(tmp, start, end, NULL); /* * Now to test where we run into a page that is no longer dirty in the @@ -327,7 +326,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) out_bits: if (ret) dump_extent_io_tree(tmp); - clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1); + btrfs_clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1); out: if (locked_page) put_page(locked_page); @@ -565,10 +564,10 @@ static int test_find_first_clear_extent_bit(void) test_msg("running find_first_clear_extent_bit test"); - extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST); + btrfs_extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST); /* Test correct handling of empty tree */ - find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED); + btrfs_find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED); if (start != 0 || end != -1) { test_err( "error getting a range from completely empty tree: start %llu end %llu", @@ -579,11 +578,11 @@ static int test_find_first_clear_extent_bit(void) * Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between * 4M-32M */ - set_extent_bit(&tree, SZ_1M, SZ_4M - 1, - CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); + btrfs_set_extent_bit(&tree, SZ_1M, SZ_4M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); - find_first_clear_extent_bit(&tree, SZ_512K, &start, &end, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_find_first_clear_extent_bit(&tree, SZ_512K, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); if (start != 0 || end != SZ_1M - 1) { test_err("error finding beginning range: start %llu end %llu", @@ -592,14 +591,14 @@ static int test_find_first_clear_extent_bit(void) } /* Now add 32M-64M so that we have a hole between 4M-32M */ - set_extent_bit(&tree, SZ_32M, SZ_64M - 1, - CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); + btrfs_set_extent_bit(&tree, SZ_32M, SZ_64M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); /* * Request first hole starting at 12M, we should get 4M-32M */ - find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); if (start != SZ_4M || end != SZ_32M - 1) { test_err("error finding trimmed range: start %llu end %llu", @@ -611,8 +610,8 @@ static int test_find_first_clear_extent_bit(void) * Search in the middle of allocated range, should get the next one * available, which happens to be unallocated -> 4M-32M */ - find_first_clear_extent_bit(&tree, SZ_2M, &start, &end, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_find_first_clear_extent_bit(&tree, SZ_2M, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); if (start != SZ_4M || end != SZ_32M - 1) { test_err("error finding next unalloc range: start %llu end %llu", @@ -624,9 +623,9 @@ static int test_find_first_clear_extent_bit(void) * Set 64M-72M with CHUNK_ALLOC flag, then search for 
CHUNK_TRIMMED flag * being unset in this range, we should get the entry in range 64M-72M */ - set_extent_bit(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED, NULL); - find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end, - CHUNK_TRIMMED); + btrfs_set_extent_bit(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED, NULL); + btrfs_find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end, + CHUNK_TRIMMED); if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) { test_err("error finding exact range: start %llu end %llu", @@ -634,8 +633,8 @@ static int test_find_first_clear_extent_bit(void) goto out; } - find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end, - CHUNK_TRIMMED); + btrfs_find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end, + CHUNK_TRIMMED); /* * Search in the middle of set range whose immediate neighbour doesn't @@ -651,7 +650,7 @@ static int test_find_first_clear_extent_bit(void) * Search beyond any known range, shall return after last known range * and end should be -1 */ - find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED); + btrfs_find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED); if (start != SZ_64M + SZ_8M || end != -1) { test_err( "error handling beyond end of range search: start %llu end %llu", @@ -663,7 +662,7 @@ static int test_find_first_clear_extent_bit(void) out: if (ret) dump_extent_io_tree(&tree); - clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED); return ret; } diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 609bb6c9c087..3a86534c116f 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -22,7 +22,7 @@ static int free_extent_map_tree(struct btrfs_inode *inode) while (!RB_EMPTY_ROOT(&em_tree->root)) { node = rb_first(&em_tree->root); em = rb_entry(node, struct extent_map, rb_node); - remove_extent_mapping(inode, em); + btrfs_remove_extent_mapping(inode, em); #ifdef CONFIG_BTRFS_DEBUG if (refcount_read(&em->refs) != 1) { @@ -36,7 +36,7 @@ static int free_extent_map_tree(struct btrfs_inode *inode) refcount_set(&em->refs, 1); } #endif - free_extent_map(em); + btrfs_free_extent_map(em); } write_unlock(&em_tree->lock); @@ -68,7 +68,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) int ret; int ret2; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -87,10 +87,10 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("cannot add extent range [0, 16K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); /* Add [16K, 20K) following [0, 16K) */ - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -109,9 +109,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("cannot add extent range [16K, 20K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -137,7 +137,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) ret = -ENOENT; goto out; } - if (em->start != 0 || extent_map_end(em) != SZ_16K || + if (em->start != 0 || btrfs_extent_map_end(em) != SZ_16K || em->disk_bytenr != 0 || 
em->disk_num_bytes != SZ_16K) { test_err( "case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu", @@ -145,7 +145,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) em->disk_bytenr, em->disk_num_bytes); ret = -EINVAL; } - free_extent_map(em); + btrfs_free_extent_map(em); out: ret2 = free_extent_map_tree(inode); if (ret == 0) @@ -167,7 +167,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) int ret; int ret2; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -186,10 +186,10 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("cannot add extent range [0, 1K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); /* Add [4K, 8K) following [0, 1K) */ - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -208,9 +208,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("cannot add extent range [4K, 8K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -235,14 +235,14 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) ret = -ENOENT; goto out; } - if (em->start != 0 || extent_map_end(em) != SZ_1K || + if (em->start != 0 || btrfs_extent_map_end(em) != SZ_1K || em->disk_bytenr != EXTENT_MAP_INLINE) { test_err( "case2 [0 1K]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu", ret, em->start, em->len, em->disk_bytenr); ret = -EINVAL; } - free_extent_map(em); + btrfs_free_extent_map(em); out: ret2 = free_extent_map_tree(inode); if (ret == 0) @@ -260,7 +260,7 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, int ret; int ret2; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -279,9 +279,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, test_err("cannot add extent range [4K, 8K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -312,15 +312,15 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, * Since bytes within em are contiguous, em->block_start is identical to * em->start. 
*/ - if (start < em->start || start + len > extent_map_end(em) || - em->start != extent_map_block_start(em)) { + if (start < em->start || start + len > btrfs_extent_map_end(em) || + em->start != btrfs_extent_map_block_start(em)) { test_err( "case3 [%llu %llu): ret %d em (start %llu len %llu disk_bytenr %llu block_len %llu)", start, start + len, ret, em->start, em->len, em->disk_bytenr, em->disk_num_bytes); ret = -EINVAL; } - free_extent_map(em); + btrfs_free_extent_map(em); out: ret2 = free_extent_map_tree(inode); if (ret == 0) @@ -369,7 +369,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, int ret; int ret2; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -388,9 +388,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, test_err("cannot add extent range [0, 8K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -410,9 +410,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, test_err("cannot add extent range [8K, 32K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -438,14 +438,14 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, ret = -ENOENT; goto out; } - if (start < em->start || start + len > extent_map_end(em)) { + if (start < em->start || start + len > btrfs_extent_map_end(em)) { test_err( "case4 [%llu %llu): ret %d, added wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu)", start, start + len, ret, em->start, em->len, em->disk_bytenr, em->disk_num_bytes); ret = -EINVAL; } - free_extent_map(em); + btrfs_free_extent_map(em); out: ret2 = free_extent_map_tree(inode); if (ret == 0) @@ -498,7 +498,7 @@ static int add_compressed_extent(struct btrfs_inode *inode, struct extent_map *em; int ret; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -513,7 +513,7 @@ static int add_compressed_extent(struct btrfs_inode *inode, write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); - free_extent_map(em); + btrfs_free_extent_map(em); if (ret < 0) { test_err("cannot add extent map [%llu, %llu)", start, start + len); return ret; @@ -719,7 +719,7 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) if (ret) goto out; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -751,7 +751,7 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) } ret = 0; out: - free_extent_map(em); + btrfs_free_extent_map(em); ret2 = free_extent_map_tree(inode); if (ret == 0) ret = ret2; @@ -773,7 +773,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_msg("Running btrfs_drop_extent_cache with pinned"); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -793,9 +793,9 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("couldn't add extent map"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { 
test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -815,7 +815,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("couldn't add extent map"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); /* * Drop [0, 36K) This should skip the [0, 4K) extent and then split the @@ -826,7 +826,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) /* Make sure our extent maps look sane. */ ret = -EINVAL; - em = lookup_extent_mapping(em_tree, 0, SZ_16K); + em = btrfs_lookup_extent_mapping(em_tree, 0, SZ_16K); if (!em) { test_err("didn't find an em at 0 as expected"); goto out; @@ -842,10 +842,10 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, SZ_16K, SZ_16K); + em = btrfs_lookup_extent_mapping(em_tree, SZ_16K, SZ_16K); read_unlock(&em_tree->lock); if (em) { test_err("found an em when we weren't expecting one"); @@ -853,7 +853,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) } read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, SZ_32K, SZ_16K); + em = btrfs_lookup_extent_mapping(em_tree, SZ_32K, SZ_16K); read_unlock(&em_tree->lock); if (!em) { test_err("didn't find an em at 32K as expected"); @@ -870,16 +870,16 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) goto out; } - if (extent_map_block_start(em) != SZ_32K + SZ_4K) { + if (btrfs_extent_map_block_start(em) != SZ_32K + SZ_4K) { test_err("em->block_start is %llu, expected 36K", - extent_map_block_start(em)); + btrfs_extent_map_block_start(em)); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, 48 * SZ_1K, (u64)-1); + em = btrfs_lookup_extent_mapping(em_tree, 48 * SZ_1K, (u64)-1); read_unlock(&em_tree->lock); if (em) { test_err("found an unexpected em above 48K"); @@ -888,9 +888,9 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) ret = 0; out: - free_extent_map(em); + btrfs_free_extent_map(em); /* Unpin our extent to prevent warning when removing it below. 
*/ - ret2 = unpin_extent_cache(inode, 0, SZ_16K, 0); + ret2 = btrfs_unpin_extent_cache(inode, 0, SZ_16K, 0); if (ret == 0) ret = ret2; ret2 = free_extent_map_tree(inode); @@ -913,7 +913,7 @@ static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) int ret; int ret2; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -928,13 +928,13 @@ static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); - free_extent_map(em); + btrfs_free_extent_map(em); if (ret < 0) { test_err("couldn't add extent map for range [120K, 128K)"); goto out; } - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -967,7 +967,7 @@ static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(inode, &em, SZ_1K * 140, SZ_4K); write_unlock(&em_tree->lock); - free_extent_map(em); + btrfs_free_extent_map(em); if (ret < 0) { test_err("couldn't add extent map for range [108K, 144K)"); goto out; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 3ea3bc2225fe..a29d2c02c2c8 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -268,7 +268,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("expected a hole, got %llu", em->disk_bytenr); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); /* @@ -314,7 +314,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) * this? 
*/ offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -336,7 +336,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* Regular extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -363,7 +363,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* The next 3 are split extents */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -389,10 +389,10 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - disk_bytenr = extent_map_block_start(em); + disk_bytenr = btrfs_extent_map_block_start(em); orig_start = em->start; offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -414,7 +414,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -441,13 +441,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } disk_bytenr += (em->start - orig_start); - if (extent_map_block_start(em) != disk_bytenr) { + if (btrfs_extent_map_block_start(em) != disk_bytenr) { test_err("wrong block start, want %llu, have %llu", - disk_bytenr, extent_map_block_start(em)); + disk_bytenr, btrfs_extent_map_block_start(em)); goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* Prealloc extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -475,7 +475,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* The next 3 are a half written prealloc extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -502,10 +502,10 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - disk_bytenr = extent_map_block_start(em); + disk_bytenr = btrfs_extent_map_block_start(em); orig_start = em->start; offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -531,13 +531,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em->start - orig_start, em->offset); goto out; } - if (extent_map_block_start(em) != disk_bytenr + em->offset) { + if (btrfs_extent_map_block_start(em) != disk_bytenr + em->offset) { test_err("unexpected block start, wanted %llu, have %llu", - disk_bytenr + em->offset, extent_map_block_start(em)); + disk_bytenr + em->offset, btrfs_extent_map_block_start(em)); goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -564,13 +564,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em->start, em->offset, orig_start); goto out; } - if 
(extent_map_block_start(em) != disk_bytenr + em->offset) { + if (btrfs_extent_map_block_start(em) != disk_bytenr + em->offset) { test_err("unexpected block start, wanted %llu, have %llu", - disk_bytenr + em->offset, extent_map_block_start(em)); + disk_bytenr + em->offset, btrfs_extent_map_block_start(em)); goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* Now for the compressed extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -597,13 +597,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { + if (btrfs_extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); + BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em)); goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* Split compressed extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -630,15 +630,15 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { + if (btrfs_extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); + BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em)); goto out; } - disk_bytenr = extent_map_block_start(em); + disk_bytenr = btrfs_extent_map_block_start(em); orig_start = em->start; offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -664,16 +664,16 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (extent_map_block_start(em) != disk_bytenr) { + if (btrfs_extent_map_block_start(em) != disk_bytenr) { test_err("block start does not match, want %llu got %llu", - disk_bytenr, extent_map_block_start(em)); + disk_bytenr, btrfs_extent_map_block_start(em)); goto out; } if (em->start != offset || em->len != 2 * sectorsize) { @@ -692,13 +692,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em->start, em->offset, orig_start); goto out; } - if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { + if (btrfs_extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); + BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em)); goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* A hole between regular extents but no hole extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset + 6, sectorsize); @@ -725,7 +725,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, SZ_4M); if (IS_ERR(em)) { @@ -757,7 +757,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 
nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -785,7 +785,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) ret = 0; out: if (!IS_ERR(em)) - free_extent_map(em); + btrfs_free_extent_map(em); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); @@ -858,15 +858,16 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) em->flags); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, sectorsize, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (extent_map_block_start(em) != sectorsize) { - test_err("expected a real extent, got %llu", extent_map_block_start(em)); + if (btrfs_extent_map_block_start(em) != sectorsize) { + test_err("expected a real extent, got %llu", + btrfs_extent_map_block_start(em)); goto out; } if (em->start != sectorsize || em->len != sectorsize) { @@ -883,7 +884,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) ret = 0; out: if (!IS_ERR(em)) - free_extent_map(em); + btrfs_free_extent_map(em); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); @@ -949,11 +950,10 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE/2][sectorsize HOLE][the rest] */ - ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, - BTRFS_MAX_EXTENT_SIZE >> 1, - (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, NULL); + ret = btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE >> 1, + (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1017,11 +1017,10 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */ - ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, - BTRFS_MAX_EXTENT_SIZE + sectorsize, - BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, NULL); + ret = btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1052,9 +1051,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* Empty */ - ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, NULL); + ret = btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1068,9 +1066,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = 0; out: if (ret) - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, NULL); + btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f26a394a9ec5..b96195d6480f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -197,7 +197,7 @@ static noinline void 
switch_commit_roots(struct btrfs_trans_handle *trans) list_del_init(&root->dirty_list); free_extent_buffer(root->commit_root); root->commit_root = btrfs_root_node(root); - extent_io_tree_release(&root->dirty_log_pages); + btrfs_extent_io_tree_release(&root->dirty_log_pages); btrfs_qgroup_clean_swapped_blocks(root); } @@ -383,10 +383,10 @@ loop: INIT_LIST_HEAD(&cur_trans->deleted_bgs); spin_lock_init(&cur_trans->dropped_roots_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); - extent_io_tree_init(fs_info, &cur_trans->dirty_pages, - IO_TREE_TRANS_DIRTY_PAGES); - extent_io_tree_init(fs_info, &cur_trans->pinned_extents, - IO_TREE_FS_PINNED_EXTENTS); + btrfs_extent_io_tree_init(fs_info, &cur_trans->dirty_pages, + IO_TREE_TRANS_DIRTY_PAGES); + btrfs_extent_io_tree_init(fs_info, &cur_trans->pinned_extents, + IO_TREE_FS_PINNED_EXTENTS); btrfs_set_fs_generation(fs_info, fs_info->generation + 1); cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; @@ -538,15 +538,15 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info) } } -static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type) +static bool may_wait_transaction(struct btrfs_fs_info *fs_info, int type) { if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) - return 0; + return false; if (type == TRANS_START) - return 1; + return true; - return 0; + return false; } static inline bool need_reserve_reloc_root(struct btrfs_root *root) @@ -761,9 +761,10 @@ got_it: * value here. */ if (do_chunk_alloc && num_bytes) { - u64 flags = h->block_rsv->space_info->flags; + struct btrfs_space_info *space_info = h->block_rsv->space_info; + u64 flags = space_info->flags; - btrfs_chunk_alloc(h, btrfs_get_alloc_profile(fs_info, flags), + btrfs_chunk_alloc(h, space_info, btrfs_get_alloc_profile(fs_info, flags), CHUNK_ALLOC_NO_FORCE); } @@ -1128,13 +1129,13 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, u64 start = 0; u64 end; - while (find_first_extent_bit(dirty_pages, start, &start, &end, - mark, &cached_state)) { + while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end, + mark, &cached_state)) { bool wait_writeback = false; - ret = convert_extent_bit(dirty_pages, start, end, - EXTENT_NEED_WAIT, - mark, &cached_state); + ret = btrfs_convert_extent_bit(dirty_pages, start, end, + EXTENT_NEED_WAIT, + mark, &cached_state); /* * convert_extent_bit can return -ENOMEM, which is most of the * time a temporary error. So when it happens, ignore the error @@ -1155,8 +1156,8 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, if (!ret) ret = filemap_fdatawrite_range(mapping, start, end); if (!ret && wait_writeback) - ret = filemap_fdatawait_range(mapping, start, end); - free_extent_state(cached_state); + btrfs_btree_wait_writeback_range(fs_info, start, end); + btrfs_free_extent_state(cached_state); if (ret) break; cached_state = NULL; @@ -1175,14 +1176,13 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages) { - struct address_space *mapping = fs_info->btree_inode->i_mapping; struct extent_state *cached_state = NULL; u64 start = 0; u64 end; int ret = 0; - while (find_first_extent_bit(dirty_pages, start, &start, &end, - EXTENT_NEED_WAIT, &cached_state)) { + while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end, + EXTENT_NEED_WAIT, &cached_state)) { /* * Ignore -ENOMEM errors returned by clear_extent_bit(). 
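Both marked-extent walkers in transaction.c keep a single iteration idiom after the rename: find the next range with the bit set, act on it, free the cached state, and continue past the end of the range. Condensed from the two loops above (names as in the hunks):

    u64 start = 0;
    u64 end;
    struct extent_state *cached_state = NULL;

    while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end,
                                       EXTENT_NEED_WAIT, &cached_state)) {
            /* ... clear bits / wait for writeback on [start, end] ... */
            btrfs_free_extent_state(cached_state);
            cached_state = NULL;
            start = end + 1;
    }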
* When committing the transaction, we'll remove any entries @@ -1191,13 +1191,13 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, * concurrently - we do it only at transaction commit time when * it's safe to do it (through extent_io_tree_release()). */ - ret = clear_extent_bit(dirty_pages, start, end, - EXTENT_NEED_WAIT, &cached_state); + ret = btrfs_clear_extent_bit(dirty_pages, start, end, + EXTENT_NEED_WAIT, &cached_state); if (ret == -ENOMEM) ret = 0; if (!ret) - ret = filemap_fdatawait_range(mapping, start, end); - free_extent_state(cached_state); + btrfs_btree_wait_writeback_range(fs_info, start, end); + btrfs_free_extent_state(cached_state); if (ret) break; cached_state = NULL; @@ -1265,7 +1265,7 @@ static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans) blk_finish_plug(&plug); ret2 = btrfs_wait_extents(fs_info, dirty_pages); - extent_io_tree_release(&trans->transaction->dirty_pages); + btrfs_extent_io_tree_release(&trans->transaction->dirty_pages); if (ret) return ret; @@ -1327,7 +1327,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; struct list_head *dirty_bgs = &trans->transaction->dirty_bgs; struct list_head *io_bgs = &trans->transaction->io_bgs; - struct list_head *next; struct extent_buffer *eb; int ret; @@ -1363,13 +1362,13 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) again: while (!list_empty(&fs_info->dirty_cowonly_roots)) { struct btrfs_root *root; - next = fs_info->dirty_cowonly_roots.next; - list_del_init(next); - root = list_entry(next, struct btrfs_root, dirty_list); + + root = list_first_entry(&fs_info->dirty_cowonly_roots, + struct btrfs_root, dirty_list); clear_bit(BTRFS_ROOT_DIRTY, &root->state); + list_move_tail(&root->dirty_list, + &trans->transaction->switch_commits); - list_add_tail(&root->dirty_list, - &trans->transaction->switch_commits); ret = update_cowonly_root(trans, root); if (ret) return ret; @@ -2271,14 +2270,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wake_up(&fs_info->transaction_blocked_wait); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); - if (cur_trans->list.prev != &fs_info->trans_list) { + if (!list_is_first(&cur_trans->list, &fs_info->trans_list)) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; if (trans->in_fsync) want_state = TRANS_STATE_SUPER_COMMITTED; - prev_trans = list_entry(cur_trans->list.prev, - struct btrfs_transaction, list); + prev_trans = list_prev_entry(cur_trans, list); if (prev_trans->state < want_state) { refcount_inc(&prev_trans->use_count); spin_unlock(&fs_info->trans_lock); @@ -2555,7 +2553,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wake_up(&cur_trans->commit_wait); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); - btrfs_finish_extent_commit(trans); + ret = btrfs_finish_extent_commit(trans); + if (ret) + goto scrub_continue; if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) btrfs_clear_space_info_full(fs_info); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 43979891f7c8..8f4703b488b7 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1571,7 +1571,7 @@ static int check_extent_item(struct extent_buffer *leaf, inline_type); return -EUCLEAN; } - if (inline_type < last_type) { + if (unlikely(inline_type < last_type)) { extent_err(leaf, slot, "inline ref out-of-order: has type %u, prev type %u", 
inline_type, last_type); @@ -1580,7 +1580,7 @@ static int check_extent_item(struct extent_buffer *leaf, /* Type changed, allow the sequence starts from U64_MAX again. */ if (inline_type > last_type) last_seq = U64_MAX; - if (seq > last_seq) { + if (unlikely(seq > last_seq)) { extent_err(leaf, slot, "inline ref out-of-order: has type %u offset %llu seq 0x%llx, prev type %u seq 0x%llx", inline_type, inline_offset, seq, @@ -1929,7 +1929,7 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, break; } - if (ret) + if (unlikely(ret)) return BTRFS_TREE_BLOCK_INVALID_ITEM; return BTRFS_TREE_BLOCK_CLEAN; } @@ -2229,13 +2229,12 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int ret; found_level = btrfs_header_level(eb); - if (found_level != check->level) { - WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), - KERN_ERR "BTRFS: tree level check failed\n"); + if (unlikely(found_level != check->level)) { + DEBUG_WARN(); btrfs_err(fs_info, "tree level mismatch detected, bytenr=%llu level expected=%u has=%u", eb->start, check->level, found_level); - return -EIO; + return -EUCLEAN; } if (!check->has_first_key) @@ -2251,11 +2250,11 @@ int btrfs_verify_level_key(struct extent_buffer *eb, return 0; /* We have @first_key, so this @eb must have at least one item */ - if (btrfs_header_nritems(eb) == 0) { + if (unlikely(btrfs_header_nritems(eb) == 0)) { btrfs_err(fs_info, "invalid tree nritems, bytenr=%llu nritems=0 expect >0", eb->start); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); return -EUCLEAN; } @@ -2263,11 +2262,10 @@ int btrfs_verify_level_key(struct extent_buffer *eb, btrfs_node_key_to_cpu(eb, &found_key, 0); else btrfs_item_key_to_cpu(eb, &found_key, 0); - ret = btrfs_comp_cpu_keys(&check->first_key, &found_key); - if (ret) { - WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), - KERN_ERR "BTRFS: tree first key check failed\n"); + ret = btrfs_comp_cpu_keys(&check->first_key, &found_key); + if (unlikely(ret)) { + DEBUG_WARN(); btrfs_err(fs_info, "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", eb->start, check->transid, check->first_key.objectid, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 90dc094cfa5e..97e933113b82 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -860,9 +860,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sums; struct btrfs_root *csum_root; - sums = list_entry(ordered_sums.next, - struct btrfs_ordered_sum, - list); + sums = list_first_entry(&ordered_sums, + struct btrfs_ordered_sum, + list); csum_root = btrfs_csum_root(fs_info, sums->logical); if (!ret) @@ -3251,8 +3251,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans, } } - extent_io_tree_release(&log->dirty_log_pages); - extent_io_tree_release(&log->log_csum_range); + btrfs_extent_io_tree_release(&log->dirty_log_pages); + btrfs_extent_io_tree_release(&log->log_csum_range); btrfs_put_root(log); } @@ -4300,8 +4300,8 @@ static int log_csums(struct btrfs_trans_handle *trans, * file which happens to refer to the same extent as well. Such races * can leave checksum items in the log with overlapping ranges. 
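The conversions to DEBUG_WARN() above, here and in tree-checker, replace open-coded WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), ...) and ASSERT(0) patterns. The macro definition itself is not part of this diff; a minimal sketch of the gating it implies, assuming CONFIG_BTRFS_DEBUG is the switch and that a printf-style message is optional:

    #ifdef CONFIG_BTRFS_DEBUG
    /* Always warn on debug builds; the message argument is optional. */
    #define DEBUG_WARN(args...)     WARN(1, "btrfs debug: " args)
    #else
    #define DEBUG_WARN(args...)     do {} while (0)
    #endif

String concatenation makes both DEBUG_WARN() and DEBUG_WARN("fmt", ...) expand to a valid WARN() call; the in-tree definition may differ in detail.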
*/ - ret = lock_extent(&log_root->log_csum_range, sums->logical, lock_end, - &cached_state); + ret = btrfs_lock_extent(&log_root->log_csum_range, sums->logical, lock_end, + &cached_state); if (ret) return ret; /* @@ -4317,8 +4317,8 @@ static int log_csums(struct btrfs_trans_handle *trans, if (!ret) ret = btrfs_csum_file_blocks(trans, log_root, sums); - unlock_extent(&log_root->log_csum_range, sums->logical, lock_end, - &cached_state); + btrfs_unlock_extent(&log_root->log_csum_range, sums->logical, lock_end, + &cached_state); return ret; } @@ -4648,7 +4648,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, return 0; /* If we're compressed we have to save the entire range of csums. */ - if (extent_map_is_compressed(em)) { + if (btrfs_extent_map_is_compressed(em)) { csum_offset = 0; csum_len = em->disk_num_bytes; } else { @@ -4657,7 +4657,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, } /* block start is already adjusted for the file extent offset. */ - block_start = extent_map_block_start(em); + block_start = btrfs_extent_map_block_start(em); csum_root = btrfs_csum_root(trans->fs_info, block_start); ret = btrfs_lookup_csums_list(csum_root, block_start + csum_offset, block_start + csum_offset + csum_len - 1, @@ -4667,9 +4667,9 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, ret = 0; while (!list_empty(&ordered_sums)) { - struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, - struct btrfs_ordered_sum, - list); + struct btrfs_ordered_sum *sums = list_first_entry(&ordered_sums, + struct btrfs_ordered_sum, + list); if (!ret) ret = log_csums(trans, inode, log_root, sums); list_del(&sums->list); @@ -4692,7 +4692,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_key key; enum btrfs_compression_type compress_type; u64 extent_offset = em->offset; - u64 block_start = extent_map_block_start(em); + u64 block_start = btrfs_extent_map_block_start(em); u64 block_len; int ret; @@ -4703,7 +4703,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); block_len = em->disk_num_bytes; - compress_type = extent_map_compression(em); + compress_type = btrfs_extent_map_compression(em); if (compress_type != BTRFS_COMPRESS_NONE) { btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start); btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); @@ -4947,7 +4947,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, list_sort(NULL, &extents, extent_cmp); process: while (!list_empty(&extents)) { - em = list_entry(extents.next, struct extent_map, list); + em = list_first_entry(&extents, struct extent_map, list); list_del_init(&em->list); @@ -4956,8 +4956,8 @@ process: * private list. 
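list_entry(head->next, ...) keeps getting replaced by list_first_entry(head, ...) throughout this diff (tree-log here, transaction.c earlier). The two spellings are equivalent; the named helper just states the intent. A self-contained userspace demonstration of the equivalence, using a minimal re-implementation instead of the kernel headers:

    #include <stddef.h>
    #include <stdio.h>

    struct list_head { struct list_head *next, *prev; };

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))
    #define list_entry(ptr, type, member) container_of(ptr, type, member)
    #define list_first_entry(head, type, member) \
            list_entry((head)->next, type, member)

    struct item { int val; struct list_head list; };

    int main(void)
    {
            struct list_head head = { &head, &head };
            struct item a = { .val = 42 };

            /* Link a as the only element of the list. */
            a.list.next = a.list.prev = &head;
            head.next = head.prev = &a.list;

            /* Both expressions name the same element. */
            printf("%d == %d\n",
                   list_entry(head.next, struct item, list)->val,
                   list_first_entry(&head, struct item, list)->val);
            return 0;
    }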
*/ if (ret) { - clear_em_logging(inode, em); - free_extent_map(em); + btrfs_clear_em_logging(inode, em); + btrfs_free_extent_map(em); continue; } @@ -4965,8 +4965,8 @@ process: ret = log_one_extent(trans, inode, em, path, ctx); write_lock(&tree->lock); - clear_em_logging(inode, em); - free_extent_map(em); + btrfs_clear_em_logging(inode, em); + btrfs_free_extent_map(em); } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); @@ -6583,6 +6583,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, btrfs_log_get_delayed_items(inode, &delayed_ins_list, &delayed_del_list); + /* + * If we are fsyncing a file with 0 hard links, then commit the delayed + * inode because the last inode ref (or extref) item may still be in the + * subvolume tree and if we log it the file will still exist after a log + * replay. So commit the delayed inode to delete that last ref and we + * skip logging it. + */ + if (inode->vfs_inode.i_nlink == 0) { + ret = btrfs_commit_inode_delayed_inode(inode); + if (ret) + goto out_unlock; + } + ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key, path, dst_path, logged_isize, inode_only, ctx, @@ -7051,14 +7064,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (btrfs_root_generation(&root->root_item) == trans->transid) return BTRFS_LOG_FORCE_COMMIT; - /* - * Skip already logged inodes or inodes corresponding to tmpfiles - * (since logging them is pointless, a link count of 0 means they - * will never be accessible). - */ - if ((btrfs_inode_in_log(inode, trans->transid) && - list_empty(&ctx->ordered_extents)) || - inode->vfs_inode.i_nlink == 0) + /* Skip already logged inodes and without new extents. */ + if (btrfs_inode_in_log(inode, trans->transid) && + list_empty(&ctx->ordered_extents)) return BTRFS_NO_LOG_SYNC; ret = start_log_trans(trans, root, ctx); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c8c21c55be53..89835071cfea 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -404,7 +404,7 @@ static void btrfs_free_device(struct btrfs_device *device) { WARN_ON(!list_empty(&device->post_commit_list)); rcu_string_free(device->name); - extent_io_tree_release(&device->alloc_state); + btrfs_extent_io_tree_release(&device->alloc_state); btrfs_destroy_dev_zone_info(device); kfree(device); } @@ -415,8 +415,8 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) WARN_ON(fs_devices->opened); while (!list_empty(&fs_devices->devices)) { - device = list_entry(fs_devices->devices.next, - struct btrfs_device, dev_list); + device = list_first_entry(&fs_devices->devices, + struct btrfs_device, dev_list); list_del(&device->dev_list); btrfs_free_device(device); } @@ -428,8 +428,8 @@ void __exit btrfs_cleanup_fs_uuids(void) struct btrfs_fs_devices *fs_devices; while (!list_empty(&fs_uuids)) { - fs_devices = list_entry(fs_uuids.next, - struct btrfs_fs_devices, fs_list); + fs_devices = list_first_entry(&fs_uuids, struct btrfs_fs_devices, + fs_list); list_del(&fs_devices->fs_list); free_fs_devices(fs_devices); } @@ -493,7 +493,7 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, } } invalidate_bdev(bdev); - *disk_super = btrfs_read_dev_super(bdev); + *disk_super = btrfs_read_disk_super(bdev, 0, false); if (IS_ERR(*disk_super)) { ret = PTR_ERR(*disk_super); fput(*bdev_file); @@ -733,82 +733,6 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb) return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; } -/* - * We can have very weird soft links passed in. 
- * One example is "/proc/self/fd/<fd>", which can be a soft link to - * a block device. - * - * But it's never a good idea to use those weird names. - * Here we check if the path (not following symlinks) is a good one inside - * "/dev/". - */ -static bool is_good_dev_path(const char *dev_path) -{ - struct path path = { .mnt = NULL, .dentry = NULL }; - char *path_buf = NULL; - char *resolved_path; - bool is_good = false; - int ret; - - if (!dev_path) - goto out; - - path_buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!path_buf) - goto out; - - /* - * Do not follow soft link, just check if the original path is inside - * "/dev/". - */ - ret = kern_path(dev_path, 0, &path); - if (ret) - goto out; - resolved_path = d_path(&path, path_buf, PATH_MAX); - if (IS_ERR(resolved_path)) - goto out; - if (strncmp(resolved_path, "/dev/", strlen("/dev/"))) - goto out; - is_good = true; -out: - kfree(path_buf); - path_put(&path); - return is_good; -} - -static int get_canonical_dev_path(const char *dev_path, char *canonical) -{ - struct path path = { .mnt = NULL, .dentry = NULL }; - char *path_buf = NULL; - char *resolved_path; - int ret; - - if (!dev_path) { - ret = -EINVAL; - goto out; - } - - path_buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!path_buf) { - ret = -ENOMEM; - goto out; - } - - ret = kern_path(dev_path, LOOKUP_FOLLOW, &path); - if (ret) - goto out; - resolved_path = d_path(&path, path_buf, PATH_MAX); - if (IS_ERR(resolved_path)) { - ret = PTR_ERR(resolved_path); - goto out; - } - ret = strscpy(canonical, resolved_path, PATH_MAX); -out: - kfree(path_buf); - path_put(&path); - return ret; -} - static bool is_same_device(struct btrfs_device *device, const char *new_path) { struct path old = { .mnt = NULL, .dentry = NULL }; @@ -1225,7 +1149,7 @@ static void btrfs_close_one_device(struct btrfs_device *device) device->fs_info = NULL; atomic_set(&device->dev_stats_ccnt, 0); - extent_io_tree_release(&device->alloc_state); + btrfs_extent_io_tree_release(&device->alloc_state); /* * Reset the flush error record. We might have a transient flush error @@ -1401,48 +1325,58 @@ void btrfs_release_disk_super(struct btrfs_super_block *super) put_page(page); } -static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, - u64 bytenr, u64 bytenr_orig) +struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, + int copy_num, bool drop_cache) { - struct btrfs_super_block *disk_super; + struct btrfs_super_block *super; struct page *page; - void *p; - pgoff_t index; + u64 bytenr, bytenr_orig; + struct address_space *mapping = bdev->bd_mapping; + int ret; - /* make sure our super fits in the device */ - if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev)) - return ERR_PTR(-EINVAL); + bytenr_orig = btrfs_sb_offset(copy_num); + ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr); + if (ret < 0) { + if (ret == -ENOENT) + ret = -EINVAL; + return ERR_PTR(ret); + } - /* make sure our super fits in the page */ - if (sizeof(*disk_super) > PAGE_SIZE) + if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev)) return ERR_PTR(-EINVAL); - /* make sure our super doesn't straddle pages on disk */ - index = bytenr >> PAGE_SHIFT; - if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index) - return ERR_PTR(-EINVAL); + if (drop_cache) { + /* This should only be called with the primary sb. 
*/ + ASSERT(copy_num == 0); - - /* pull in the page with our super */ - page = read_cache_page_gfp(bdev->bd_mapping, index, GFP_KERNEL); + /* + * Drop the page of the primary superblock, so later read will + * always read from the device. + */ + invalidate_inode_pages2_range(mapping, bytenr >> PAGE_SHIFT, + (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT); + } + page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); if (IS_ERR(page)) return ERR_CAST(page); - p = page_address(page); - - /* align our pointer to the offset of the super block */ - disk_super = p + offset_in_page(bytenr); - - if (btrfs_super_bytenr(disk_super) != bytenr_orig || - btrfs_super_magic(disk_super) != BTRFS_MAGIC) { - btrfs_release_disk_super(p); + super = page_address(page); + if (btrfs_super_magic(super) != BTRFS_MAGIC || + btrfs_super_bytenr(super) != bytenr_orig) { + btrfs_release_disk_super(super); return ERR_PTR(-EINVAL); } - if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1]) - disk_super->label[BTRFS_LABEL_SIZE - 1] = 0; + /* + * Make sure the last byte of label is properly NUL terminated. We use + * '%s' to print the label, if not properly NUL terminated we can access + * beyond the label. + */ + if (super->label[0] && super->label[BTRFS_LABEL_SIZE - 1]) + super->label[BTRFS_LABEL_SIZE - 1] = 0; - return disk_super; + return super; } int btrfs_forget_devices(dev_t devt) @@ -1513,23 +1447,10 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, bool new_device_added = false; struct btrfs_device *device = NULL; struct file *bdev_file; - char *canonical_path = NULL; - u64 bytenr; dev_t devt; - int ret; lockdep_assert_held(&uuid_mutex); - if (!is_good_dev_path(path)) { - canonical_path = kmalloc(PATH_MAX, GFP_KERNEL); - if (canonical_path) { - ret = get_canonical_dev_path(path, canonical_path); - if (ret < 0) { - kfree(canonical_path); - canonical_path = NULL; - } - } - } /* * Avoid an exclusive open here, as the systemd-udev may initiate the * device scan which may race with the user's mount or mkfs command, @@ -1544,20 +1465,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, if (IS_ERR(bdev_file)) return ERR_CAST(bdev_file); - /* - * We would like to check all the super blocks, but doing so would - * allow a mount to succeed after a mkfs from a different filesystem. - * Currently, recovery from a bad primary btrfs superblock is done - * using the userspace command 'btrfs check --super'. - */ - ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr); - if (ret) { - device = ERR_PTR(ret); - goto error_bdev_put; - } - - disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr, - btrfs_sb_offset(0)); + disk_super = btrfs_read_disk_super(file_bdev(bdev_file), 0, false); if (IS_ERR(disk_super)) { device = ERR_CAST(disk_super); goto error_bdev_put; @@ -1574,8 +1482,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, goto free_disk_super; } - device = device_list_add(canonical_path ?
: path, disk_super, - &new_device_added); + device = device_list_add(path, disk_super, &new_device_added); if (!IS_ERR(device) && new_device_added) btrfs_free_stale_devices(device->devt, device); @@ -1584,7 +1491,6 @@ free_disk_super: error_bdev_put: fput(bdev_file); - kfree(canonical_path); return device; } @@ -1600,9 +1506,9 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, lockdep_assert_held(&device->fs_info->chunk_mutex); - if (find_first_extent_bit(&device->alloc_state, *start, - &physical_start, &physical_end, - CHUNK_ALLOCATED, NULL)) { + if (btrfs_find_first_extent_bit(&device->alloc_state, *start, + &physical_start, &physical_end, + CHUNK_ALLOCATED, NULL)) { if (in_range(physical_start, *start, len) || in_range(*start, physical_start, @@ -1617,6 +1523,9 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, static u64 dev_extent_search_start(struct btrfs_device *device) { switch (device->fs_devices->chunk_alloc_policy) { + default: + btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy); + fallthrough; case BTRFS_CHUNK_ALLOC_REGULAR: return BTRFS_DEVICE_RANGE_RESERVED; case BTRFS_CHUNK_ALLOC_ZONED: @@ -1626,8 +1535,6 @@ static u64 dev_extent_search_start(struct btrfs_device *device) * for superblock logging. */ return 0; - default: - BUG(); } } @@ -1640,7 +1547,8 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device, int ret; bool changed = false; - ASSERT(IS_ALIGNED(*hole_start, zone_size)); + ASSERT(IS_ALIGNED(*hole_start, zone_size), + "hole_start=%llu zone_size=%llu", *hole_start, zone_size); while (*hole_size > 0) { pos = btrfs_find_allocatable_zones(device, *hole_start, @@ -1706,6 +1614,9 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, } switch (device->fs_devices->chunk_alloc_policy) { + default: + btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy); + fallthrough; case BTRFS_CHUNK_ALLOC_REGULAR: /* No extra check */ break; @@ -1720,8 +1631,6 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, continue; } break; - default: - BUG(); } break; @@ -1891,7 +1800,9 @@ next: else ret = 0; - ASSERT(max_hole_start + max_hole_size <= search_end); + ASSERT(max_hole_start + max_hole_size <= search_end, + "max_hole_start=%llu max_hole_size=%llu search_end=%llu", + max_hole_start, max_hole_size, search_end); out: btrfs_free_path(path); *start = max_hole_start; @@ -2204,7 +2115,7 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) down_read(&fs_info->dev_replace.rwsem); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { - ASSERT(num_devices > 1); + ASSERT(num_devices > 1, "num_devices=%llu", num_devices); num_devices--; } up_read(&fs_info->dev_replace.rwsem); @@ -2220,7 +2131,7 @@ static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info, const u64 bytenr = btrfs_sb_offset(copy_num); int ret; - disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr); + disk_super = btrfs_read_disk_super(bdev, copy_num, false); if (IS_ERR(disk_super)) return; @@ -2408,7 +2319,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, */ if (cur_devices->num_devices == 0) { list_del_init(&cur_devices->seed_list); - ASSERT(cur_devices->opened == 1); + ASSERT(cur_devices->opened == 1, "opened=%d", cur_devices->opened); cur_devices->opened--; free_fs_devices(cur_devices); } @@ -3338,7 +3249,8 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) * user having built with ASSERT 
enabled, so if ASSERT doesn't * do anything we still error out. */ - ASSERT(0); + DEBUG_WARN("error %ld reading chunk map at offset %llu", + PTR_ERR(map), chunk_offset); return PTR_ERR(map); } @@ -3419,8 +3331,16 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) if (ret == -ENOSPC) { const u64 sys_flags = btrfs_system_alloc_profile(fs_info); struct btrfs_block_group *sys_bg; + struct btrfs_space_info *space_info; - sys_bg = btrfs_create_chunk(trans, sys_flags); + space_info = btrfs_find_space_info(fs_info, sys_flags); + if (!space_info) { + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } + + sys_bg = btrfs_create_chunk(trans, space_info, sys_flags); if (IS_ERR(sys_bg)) { ret = PTR_ERR(sys_bg); btrfs_abort_transaction(trans, ret); @@ -3880,26 +3800,25 @@ static void reset_balance_state(struct btrfs_fs_info *fs_info) * Balance filters. Return 1 if chunk should be filtered out * (should not be balanced). */ -static int chunk_profiles_filter(u64 chunk_type, - struct btrfs_balance_args *bargs) +static bool chunk_profiles_filter(u64 chunk_type, struct btrfs_balance_args *bargs) { chunk_type = chunk_to_extended(chunk_type) & BTRFS_EXTENDED_PROFILE_MASK; if (bargs->profiles & chunk_type) - return 0; + return false; - return 1; + return true; } -static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, - struct btrfs_balance_args *bargs) +static bool chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, + struct btrfs_balance_args *bargs) { struct btrfs_block_group *cache; u64 chunk_used; u64 user_thresh_min; u64 user_thresh_max; - int ret = 1; + bool ret = true; cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = cache->used; @@ -3917,18 +3836,18 @@ static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_off user_thresh_max = mult_perc(cache->length, bargs->usage_max); if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) - ret = 0; + ret = false; btrfs_put_block_group(cache); return ret; } -static int chunk_usage_filter(struct btrfs_fs_info *fs_info, - u64 chunk_offset, struct btrfs_balance_args *bargs) +static bool chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, + struct btrfs_balance_args *bargs) { struct btrfs_block_group *cache; u64 chunk_used, user_thresh; - int ret = 1; + bool ret = true; cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = cache->used; @@ -3941,15 +3860,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, user_thresh = mult_perc(cache->length, bargs->usage); if (chunk_used < user_thresh) - ret = 0; + ret = false; btrfs_put_block_group(cache); return ret; } -static int chunk_devid_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - struct btrfs_balance_args *bargs) +static bool chunk_devid_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) { struct btrfs_stripe *stripe; int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); @@ -3958,10 +3876,10 @@ static int chunk_devid_filter(struct extent_buffer *leaf, for (i = 0; i < num_stripes; i++) { stripe = btrfs_stripe_nr(chunk, i); if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) - return 0; + return false; } - return 1; + return true; } static u64 calc_data_stripes(u64 type, int num_stripes) @@ -3974,9 +3892,8 @@ static u64 calc_data_stripes(u64 type, int num_stripes) } /* [pstart, pend) */ -static int chunk_drange_filter(struct extent_buffer *leaf, - struct
btrfs_chunk *chunk, - struct btrfs_balance_args *bargs) +static bool chunk_drange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) { struct btrfs_stripe *stripe; int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); @@ -3987,7 +3904,7 @@ static int chunk_drange_filter(struct extent_buffer *leaf, int i; if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) - return 0; + return false; type = btrfs_chunk_type(leaf, chunk); factor = calc_data_stripes(type, num_stripes); @@ -4003,56 +3920,53 @@ static int chunk_drange_filter(struct extent_buffer *leaf, if (stripe_offset < bargs->pend && stripe_offset + stripe_length > bargs->pstart) - return 0; + return false; } - return 1; + return true; } /* [vstart, vend) */ -static int chunk_vrange_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - u64 chunk_offset, - struct btrfs_balance_args *bargs) +static bool chunk_vrange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, + u64 chunk_offset, struct btrfs_balance_args *bargs) { if (chunk_offset < bargs->vend && chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) /* at least part of the chunk is inside this vrange */ - return 0; + return false; - return 1; + return true; } -static int chunk_stripes_range_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - struct btrfs_balance_args *bargs) +static bool chunk_stripes_range_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) { int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); if (bargs->stripes_min <= num_stripes && num_stripes <= bargs->stripes_max) - return 0; + return false; - return 1; + return true; } -static int chunk_soft_convert_filter(u64 chunk_type, - struct btrfs_balance_args *bargs) +static bool chunk_soft_convert_filter(u64 chunk_type, struct btrfs_balance_args *bargs) { if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) - return 0; + return false; chunk_type = chunk_to_extended(chunk_type) & BTRFS_EXTENDED_PROFILE_MASK; if (bargs->target == chunk_type) - return 1; + return true; - return 0; + return false; } -static int should_balance_chunk(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, u64 chunk_offset) +static bool should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk *chunk, + u64 chunk_offset) { struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_balance_control *bctl = fs_info->balance_ctl; @@ -4062,7 +3976,7 @@ static int should_balance_chunk(struct extent_buffer *leaf, /* type filter */ if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { - return 0; + return false; } if (chunk_type & BTRFS_BLOCK_GROUP_DATA) @@ -4075,46 +3989,46 @@ static int should_balance_chunk(struct extent_buffer *leaf, /* profiles filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && chunk_profiles_filter(chunk_type, bargs)) { - return 0; + return false; } /* usage filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && chunk_usage_filter(fs_info, chunk_offset, bargs)) { - return 0; + return false; } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { - return 0; + return false; } /* devid filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && chunk_devid_filter(leaf, chunk, bargs)) { - return 0; + return false; } /* drange filter, makes sense only with devid filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && chunk_drange_filter(leaf, chunk, bargs)) { - 
return 0; + return false; } /* vrange filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { - return 0; + return false; } /* stripes filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && chunk_stripes_range_filter(leaf, chunk, bargs)) { - return 0; + return false; } /* soft profile changing mode */ if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && chunk_soft_convert_filter(chunk_type, bargs)) { - return 0; + return false; } /* @@ -4122,7 +4036,7 @@ static int should_balance_chunk(struct extent_buffer *leaf, */ if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { if (bargs->limit == 0) - return 0; + return false; else bargs->limit--; } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { @@ -4132,12 +4046,12 @@ static int should_balance_chunk(struct extent_buffer *leaf, * about the count of all chunks that satisfy the filters. */ if (bargs->limit_max == 0) - return 0; + return false; else bargs->limit_max--; } - return 1; + return true; } static int __btrfs_balance(struct btrfs_fs_info *fs_info) @@ -4752,7 +4666,8 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) } spin_lock(&fs_info->super_lock); - ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED, + "exclusive_operation=%d", fs_info->exclusive_operation); fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; spin_unlock(&fs_info->super_lock); /* @@ -5088,8 +5003,8 @@ again: mutex_lock(&fs_info->chunk_mutex); /* Clear all state bits beyond the shrunk device size */ - clear_extent_bits(&device->alloc_state, new_size, (u64)-1, - CHUNK_STATE_MASK); + btrfs_clear_extent_bits(&device->alloc_state, new_size, (u64)-1, + CHUNK_STATE_MASK); btrfs_device_set_disk_total_bytes(device, new_size); if (list_empty(&device->post_commit_list)) @@ -5216,6 +5131,8 @@ struct alloc_chunk_ctl { u64 stripe_size; u64 chunk_size; int ndevs; + /* Space_info the block group is going to belong to.
*/ + struct btrfs_space_info *space_info; }; static void init_alloc_chunk_ctl_policy_regular( @@ -5289,14 +5206,15 @@ static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, ctl->ndevs = 0; switch (fs_devices->chunk_alloc_policy) { + default: + btrfs_warn_unknown_chunk_allocation(fs_devices->chunk_alloc_policy); + fallthrough; case BTRFS_CHUNK_ALLOC_REGULAR: init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); break; case BTRFS_CHUNK_ALLOC_ZONED: init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); break; - default: - BUG(); } } @@ -5435,7 +5353,9 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, * It should hold because: * dev_extent_min == dev_extent_want == zone_size * dev_stripes */ - ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min); + ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min, + "ndevs=%d max_avail=%llu dev_extent_min=%llu", ctl->ndevs, + devices_info[ctl->ndevs - 1].max_avail, ctl->dev_extent_min); ctl->stripe_size = zone_size; ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; @@ -5448,7 +5368,9 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, ctl->dev_stripes); ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; - ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size); + ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size, + "stripe_size=%llu data_stripes=%d max_chunk_size=%llu", + ctl->stripe_size, data_stripes, ctl->max_chunk_size); } ctl->chunk_size = ctl->stripe_size * data_stripes; @@ -5481,12 +5403,13 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, ctl->ndevs = min(ctl->ndevs, ctl->devs_max); switch (fs_devices->chunk_alloc_policy) { + default: + btrfs_warn_unknown_chunk_allocation(fs_devices->chunk_alloc_policy); + fallthrough; case BTRFS_CHUNK_ALLOC_REGULAR: return decide_stripe_size_regular(ctl, devices_info); case BTRFS_CHUNK_ALLOC_ZONED: return decide_stripe_size_zoned(ctl, devices_info); - default: - BUG(); } } @@ -5496,9 +5419,9 @@ static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int struct btrfs_io_stripe *stripe = &map->stripes[i]; struct btrfs_device *device = stripe->dev; - set_extent_bit(&device->alloc_state, stripe->physical, - stripe->physical + map->stripe_size - 1, - bits | EXTENT_NOWAIT, NULL); + btrfs_set_extent_bit(&device->alloc_state, stripe->physical, + stripe->physical + map->stripe_size - 1, + bits | EXTENT_NOWAIT, NULL); } } @@ -5508,10 +5431,9 @@ static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned in struct btrfs_io_stripe *stripe = &map->stripes[i]; struct btrfs_device *device = stripe->dev; - __clear_extent_bit(&device->alloc_state, stripe->physical, - stripe->physical + map->stripe_size - 1, - bits | EXTENT_NOWAIT, - NULL, NULL); + btrfs_clear_extent_bits(&device->alloc_state, stripe->physical, + stripe->physical + map->stripe_size - 1, + bits | EXTENT_NOWAIT); } } @@ -5618,7 +5540,8 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, return ERR_PTR(ret); } - block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size); + block_group = btrfs_make_block_group(trans, ctl->space_info, type, start, + ctl->chunk_size); if (IS_ERR(block_group)) { btrfs_remove_chunk_map(info, map); return block_group; @@ -5644,7 +5567,8 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, } struct btrfs_block_group *btrfs_create_chunk(struct 
btrfs_trans_handle *trans, - u64 type) + struct btrfs_space_info *space_info, + u64 type) { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_fs_devices *fs_devices = info->fs_devices; @@ -5656,7 +5580,7 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, lockdep_assert_held(&info->chunk_mutex); if (!alloc_profile_is_valid(type, 0)) { - ASSERT(0); + DEBUG_WARN("invalid alloc profile for type %llu", type); return ERR_PTR(-EINVAL); } @@ -5668,12 +5592,13 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { btrfs_err(info, "invalid chunk type 0x%llx requested", type); - ASSERT(0); + DEBUG_WARN(); return ERR_PTR(-EINVAL); } ctl.start = find_next_chunk(info); ctl.type = type; + ctl.space_info = space_info; init_alloc_chunk_ctl(fs_devices, &ctl); devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), @@ -5817,7 +5742,9 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; u64 alloc_profile; struct btrfs_block_group *meta_bg; + struct btrfs_space_info *meta_space_info; struct btrfs_block_group *sys_bg; + struct btrfs_space_info *sys_space_info; /* * When adding a new device for sprouting, the seed device is read-only @@ -5841,12 +5768,22 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) */ alloc_profile = btrfs_metadata_alloc_profile(fs_info); - meta_bg = btrfs_create_chunk(trans, alloc_profile); + meta_space_info = btrfs_find_space_info(fs_info, alloc_profile); + if (!meta_space_info) { + DEBUG_WARN(); + return -EINVAL; + } + meta_bg = btrfs_create_chunk(trans, meta_space_info, alloc_profile); if (IS_ERR(meta_bg)) return PTR_ERR(meta_bg); alloc_profile = btrfs_system_alloc_profile(fs_info); - sys_bg = btrfs_create_chunk(trans, alloc_profile); + sys_space_info = btrfs_find_space_info(fs_info, alloc_profile); + if (!sys_space_info) { + DEBUG_WARN(); + return -EINVAL; + } + sys_bg = btrfs_create_chunk(trans, sys_space_info, alloc_profile); if (IS_ERR(sys_bg)) return PTR_ERR(sys_bg); @@ -6046,7 +5983,7 @@ static int btrfs_read_rr(const struct btrfs_chunk_map *map, int first, int num_s static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, int first, - int dev_replace_is_ongoing) + bool dev_replace_is_ongoing) { const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy); int i; @@ -6055,8 +5992,8 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, int tolerance; struct btrfs_device *srcdev; - ASSERT((map->type & - (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); + ASSERT((map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)), + "type=%llu", map->type); if (map->type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = map->sub_stripes; @@ -6357,7 +6294,7 @@ static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc, } /* We can only have at most 2 extra nr_stripes (for DUP). */ - ASSERT(nr_extra_stripes <= 2); + ASSERT(nr_extra_stripes <= 2, "nr_extra_stripes=%d", nr_extra_stripes); /* * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for * replace. @@ -6368,7 +6305,8 @@ static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc, struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1]; /* Only DUP can have two extra stripes. 
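With btrfs_create_chunk() now taking the btrfs_space_info explicitly, callers resolve it up front and treat a missing space_info as a hard error, as both hunks above do. The call sequence, condensed (error labels trimmed; mirrors the init_first_rw_device() hunk):

    u64 profile = btrfs_metadata_alloc_profile(fs_info);
    struct btrfs_space_info *space_info;
    struct btrfs_block_group *bg;

    space_info = btrfs_find_space_info(fs_info, profile);
    if (!space_info) {
            DEBUG_WARN();
            return -EINVAL;
    }
    bg = btrfs_create_chunk(trans, space_info, profile);
    if (IS_ERR(bg))
            return PTR_ERR(bg);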
*/ - ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP); + ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP, + "map_type=%llu", bioc->map_type); /* * Swap the last stripe stripes and reduce @nr_extra_stripes. @@ -6395,7 +6333,8 @@ static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset, */ io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK; io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; - ASSERT(io_geom->stripe_offset < U32_MAX); + ASSERT(io_geom->stripe_offset < U32_MAX, + "stripe_offset=%llu", io_geom->stripe_offset); if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { unsigned long full_stripe_len = @@ -6413,8 +6352,12 @@ static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset, io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset( rounddown(io_geom->stripe_nr, nr_data_stripes(map))); - ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset); - ASSERT(io_geom->raid56_full_stripe_start <= offset); + ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset, + "raid56_full_stripe_start=%llu full_stripe_len=%lu offset=%llu", + io_geom->raid56_full_stripe_start, full_stripe_len, offset); + ASSERT(io_geom->raid56_full_stripe_start <= offset, + "raid56_full_stripe_start=%llu offset=%llu", + io_geom->raid56_full_stripe_start, offset); /* * For writes to RAID56, allow to write a full stripe set, but * no straddling of stripe sets. @@ -6580,7 +6523,7 @@ static void map_blocks_raid56_read(struct btrfs_chunk_map *map, { int data_stripes = nr_data_stripes(map); - ASSERT(io_geom->mirror_num <= 1); + ASSERT(io_geom->mirror_num <= 1, "mirror_num=%d", io_geom->mirror_num); /* Just grab the data stripe directly. */ io_geom->stripe_index = io_geom->stripe_nr % data_stripes; io_geom->stripe_nr /= data_stripes; @@ -6648,7 +6591,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int num_copies; struct btrfs_io_context *bioc = NULL; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - int dev_replace_is_ongoing = 0; + bool dev_replace_is_ongoing = false; u16 num_alloc_stripes; u64 max_len; @@ -6953,7 +6896,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, atomic_set(&dev->dev_stats_ccnt, 0); btrfs_device_data_ordered_init(dev); - extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE); + btrfs_extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE); if (devid) tmp = *devid; @@ -7925,7 +7868,7 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans) { struct btrfs_device *curr, *next; - ASSERT(trans->state == TRANS_STATE_COMMIT_DOING); + ASSERT(trans->state == TRANS_STATE_COMMIT_DOING, "state=%d" , trans->state); if (list_empty(&trans->dev_update_list)) return; @@ -8294,7 +8237,7 @@ static void map_raid56_repair_block(struct btrfs_io_context *bioc, logical < stripe_start + BTRFS_STRIPE_LEN) break; } - ASSERT(i < data_stripes); + ASSERT(i < data_stripes, "i=%d data_stripes=%d", i, data_stripes); smap->dev = bioc->stripes[i].dev; smap->physical = bioc->stripes[i].physical + ((logical - bioc->full_stripe_logical) & @@ -8323,7 +8266,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, int mirror_ret = mirror_num; int ret; - ASSERT(mirror_num > 0); + ASSERT(mirror_num > 0, "mirror_num=%d", mirror_num); ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, &bioc, smap, &mirror_ret); @@ -8331,7 +8274,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, return ret; /* The map range should not cross stripe 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index e247d551da67..137cc232f58e 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -473,7 +473,6 @@ struct btrfs_io_stripe {
        struct btrfs_device *dev;
        /* Block mapping. */
        u64 physical;
-       u64 length;
        bool rst_search_commit_root;
        /* For the endio handler. */
        struct btrfs_io_context *bioc;
@@ -715,7 +714,8 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
 struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
-                                            u64 type);
+                                            struct btrfs_space_info *space_info,
+                                            u64 type);
 void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info);
 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, blk_mode_t flags, void *holder);
@@ -786,6 +786,8 @@ struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info,
 struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
                                            u64 logical, u64 length);
 void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map);
+struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
+                                               int copy_num, bool drop_cache);
 void btrfs_release_disk_super(struct btrfs_super_block *super);

 static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
@@ -847,6 +849,11 @@ static inline const char *btrfs_dev_name(const struct btrfs_device *device)
        return rcu_str_deref(device->name);
 }

+static inline void btrfs_warn_unknown_chunk_allocation(enum btrfs_chunk_allocation_policy pol)
+{
+       WARN_ONCE(1, "unknown allocation policy %d, fallback to regular", pol);
+}
+
 void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
 struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);
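The new btrfs_warn_unknown_chunk_allocation() helper gives the switch statements over the chunk allocation policy a single warn-once path for an unrecognized value. A sketch of the call-site shape it is meant for; the policy handler names here are illustrative, not quoted from this patch:

        switch (fs_devices->chunk_alloc_policy) {
        case BTRFS_CHUNK_ALLOC_REGULAR:
                init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
                break;
        case BTRFS_CHUNK_ALLOC_ZONED:
                init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
                break;
        default:
                /* Warn once, then fall back to the regular policy. */
                btrfs_warn_unknown_chunk_allocation(fs_devices->chunk_alloc_policy);
                init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
                break;
        }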
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 545f413d81fc..5292cd341f70 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -120,8 +120,6 @@ static int copy_data_into_buffer(struct address_space *mapping,
        ret = btrfs_compress_filemap_get_folio(mapping, cur, &folio);
        if (ret < 0)
                return ret;
-       /* No large folio support yet. */
-       ASSERT(!folio_test_large(folio));

        offset = offset_in_folio(folio, cur);
        copy_length = min(folio_size(folio) - offset,
@@ -205,7 +203,6 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
                workspace->strm.next_in = workspace->buf;
                workspace->strm.avail_in = copy_length;
        } else {
-               unsigned int pg_off;
                unsigned int cur_len;

                if (data_in) {
@@ -217,9 +214,9 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
                                        start, &in_folio);
                        if (ret < 0)
                                goto out;
-                       pg_off = offset_in_page(start);
-                       cur_len = btrfs_calc_input_length(orig_end, start);
-                       data_in = kmap_local_folio(in_folio, pg_off);
+                       cur_len = btrfs_calc_input_length(in_folio, orig_end, start);
+                       data_in = kmap_local_folio(in_folio,
+                                                  offset_in_folio(in_folio, start));
                        start += cur_len;
                        workspace->strm.next_in = data_in;
                        workspace->strm.avail_in = cur_len;
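Both zlib above and zstd below stop assuming single-page folios: offset_in_page(start) becomes offset_in_folio(in_folio, start), and btrfs_calc_input_length() now takes the folio so it can clamp each input chunk to the folio boundary instead of a page boundary. A plausible shape for the folio-aware helper, as a sketch rather than the exact compression.h code:

        static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u64 cur)
        {
                const u64 folio_end = folio_pos(folio) + folio_size(folio);

                /* @cur must be inside @folio. */
                ASSERT(folio_pos(folio) <= cur && cur < folio_end);

                /* Consume up to the folio end or the range end, whichever is closer. */
                return min(range_end, folio_end) - cur;
        }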
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index fb8b8b29c169..b5b0156d5b95 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -989,7 +989,7 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
        }

        /* All the zones are FULL. Should not reach here. */
-       ASSERT(0);
+       DEBUG_WARN("unexpected state, all zones full");
        return -EIO;
 }
@@ -1277,7 +1277,7 @@ struct zone_info {

 static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
                                struct zone_info *info, unsigned long *active,
-                               struct btrfs_chunk_map *map)
+                               struct btrfs_chunk_map *map, bool new)
 {
        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
        struct btrfs_device *device;
@@ -1307,6 +1307,8 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
                return 0;
        }

+       ASSERT(!new || btrfs_dev_is_empty_zone(device, info->physical));
+
        /* This zone will be used for allocation, so mark this zone non-empty. */
        btrfs_dev_clear_zone_empty(device, info->physical);

@@ -1319,6 +1321,18 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
         * to determine the allocation offset within the zone.
         */
        WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size));
+
+       if (new) {
+               sector_t capacity;
+
+               capacity = bdev_zone_capacity(device->bdev, info->physical >> SECTOR_SHIFT);
+               up_read(&dev_replace->rwsem);
+               info->alloc_offset = 0;
+               info->capacity = capacity << SECTOR_SHIFT;
+
+               return 0;
+       }
+
        nofs_flag = memalloc_nofs_save();
        ret = btrfs_get_dev_zone(device, info->physical, &zone);
        memalloc_nofs_restore(nofs_flag);
@@ -1588,7 +1602,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
        }

        for (i = 0; i < map->num_stripes; i++) {
-               ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map);
+               ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map, new);
                if (ret)
                        goto out;
@@ -1659,7 +1673,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
                 * stripe.
                 */
                cache->alloc_offset = cache->zone_capacity;
-               ret = 0;
        }

 out:
@@ -1784,12 +1797,12 @@ static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
        ordered->disk_bytenr = logical;

        write_lock(&em_tree->lock);
-       em = search_extent_mapping(em_tree, ordered->file_offset,
-                                  ordered->num_bytes);
+       em = btrfs_search_extent_mapping(em_tree, ordered->file_offset,
+                                        ordered->num_bytes);
        /* The em should be a new COW extent, thus it should not have an offset. */
        ASSERT(em->offset == 0);
        em->disk_bytenr = logical;
-       free_extent_map(em);
+       btrfs_free_extent_map(em);
        write_unlock(&em_tree->lock);
 }
@@ -1799,8 +1812,8 @@ static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
        struct btrfs_ordered_extent *new;

        if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
-           split_extent_map(ordered->inode, ordered->file_offset,
-                            ordered->num_bytes, len, logical))
+           btrfs_split_extent_map(ordered->inode, ordered->file_offset,
+                                  ordered->num_bytes, len, logical))
                return false;

        new = btrfs_split_ordered_extent(ordered, len);
@@ -2158,27 +2171,15 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
 {
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        const u64 end = block_group->start + block_group->length;
-       struct radix_tree_iter iter;
        struct extent_buffer *eb;
-       void __rcu **slot;
+       unsigned long index, start = (block_group->start >> fs_info->sectorsize_bits);

        rcu_read_lock();
-       radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
-                                block_group->start >> fs_info->sectorsize_bits) {
-               eb = radix_tree_deref_slot(slot);
-               if (!eb)
-                       continue;
-               if (radix_tree_deref_retry(eb)) {
-                       slot = radix_tree_iter_retry(&iter);
-                       continue;
-               }
-
+       xa_for_each_start(&fs_info->buffer_tree, index, eb, start) {
                if (eb->start < block_group->start)
                        continue;
                if (eb->start >= end)
                        break;
-
-               slot = radix_tree_iter_resume(slot, &iter);
                rcu_read_unlock();
                wait_on_extent_buffer_writeback(eb);
                rcu_read_lock();
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 3541efa765c7..4a796a049b5a 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -24,7 +24,7 @@
 #include "super.h"

 #define ZSTD_BTRFS_MAX_WINDOWLOG 17
-#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
+#define ZSTD_BTRFS_MAX_INPUT (1U << ZSTD_BTRFS_MAX_WINDOWLOG)
 #define ZSTD_BTRFS_DEFAULT_LEVEL 3
 #define ZSTD_BTRFS_MIN_LEVEL -15
 #define ZSTD_BTRFS_MAX_LEVEL 15
@@ -426,8 +426,8 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
                ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
                if (ret < 0)
                        goto out;
-               cur_len = btrfs_calc_input_length(orig_end, start);
-               workspace->in_buf.src = kmap_local_folio(in_folio, offset_in_page(start));
+               cur_len = btrfs_calc_input_length(in_folio, orig_end, start);
+               workspace->in_buf.src = kmap_local_folio(in_folio, offset_in_folio(in_folio, start));
                workspace->in_buf.pos = 0;
                workspace->in_buf.size = cur_len;
@@ -511,9 +511,9 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
                        ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
                        if (ret < 0)
                                goto out;
-                       cur_len = btrfs_calc_input_length(orig_end, start);
+                       cur_len = btrfs_calc_input_length(in_folio, orig_end, start);
                        workspace->in_buf.src = kmap_local_folio(in_folio,
-                                                                offset_in_page(start));
+                                                                offset_in_folio(in_folio, start));
                        workspace->in_buf.pos = 0;
                        workspace->in_buf.size = cur_len;
                }
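The wait_eb_writebacks() conversion above replaces the open-coded radix tree walk, with its deref and retry handling, by the xarray iterator. Per the xarray documentation, xa_for_each_start() is conceptually equivalent to a find/find-after loop like this sketch (buffer_tree and the extent buffer fields are as in the diff):

        struct extent_buffer *eb;
        unsigned long index = start;

        eb = xa_find(&fs_info->buffer_tree, &index, ULONG_MAX, XA_PRESENT);
        while (eb) {
                /* Loop body: check eb->start bounds, wait for writeback. */
                eb = xa_find_after(&fs_info->buffer_tree, &index, ULONG_MAX, XA_PRESENT);
        }

Each lookup revalidates its position via the index cursor, which is why the converted loop can drop the RCU read lock to sleep in wait_on_extent_buffer_writeback() and re-take it before the next iteration.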