Diffstat (limited to 'fs/btrfs/block-group.c')
-rw-r--r--	fs/btrfs/block-group.c	382
1 file changed, 220 insertions(+), 162 deletions(-)
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index a8129f1ce78c..08b14449fabe 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -34,6 +34,19 @@ int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group
 }
 #endif
 
+static inline bool has_unwritten_metadata(struct btrfs_block_group *block_group)
+{
+	/* The meta_write_pointer is available only on the zoned setup. */
+	if (!btrfs_is_zoned(block_group->fs_info))
+		return false;
+
+	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
+		return false;
+
+	return block_group->start + block_group->alloc_offset >
+		block_group->meta_write_pointer;
+}
+
 /*
  * Return target flags in extended format or 0 if restripe for this chunk_type
  * is not in progress
@@ -525,10 +538,9 @@ int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start,
 		*total_added_ret = 0;
 
 	while (start < end) {
-		if (!find_first_extent_bit(&info->excluded_extents, start,
-					   &extent_start, &extent_end,
-					   EXTENT_DIRTY | EXTENT_UPTODATE,
-					   NULL))
+		if (!btrfs_find_first_extent_bit(&info->excluded_extents, start,
+						 &extent_start, &extent_end,
+						 EXTENT_DIRTY, NULL))
 			break;
 
 		if (extent_start <= start) {
@@ -601,8 +613,8 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_
 	extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start,
 						       BTRFS_SUPER_INFO_OFFSET));
 
-	path->skip_locking = 1;
-	path->search_commit_root = 1;
+	path->skip_locking = true;
+	path->search_commit_root = true;
 	path->reada = READA_FORWARD;
 
 	search_offset = index * div_u64(block_group->length, max_index);
@@ -701,7 +713,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
 	struct btrfs_block_group *block_group = caching_ctl->block_group;
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
 	struct btrfs_root *extent_root;
-	struct btrfs_path *path;
+	BTRFS_PATH_AUTO_FREE(path);
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
 	u64 total_found = 0;
@@ -732,8 +744,8 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
 	 * root to add free space. So we skip locking and search the commit
 	 * root, since its read-only
 	 */
-	path->skip_locking = 1;
-	path->search_commit_root = 1;
+	path->skip_locking = true;
+	path->search_commit_root = true;
 	path->reada = READA_FORWARD;
 
 	key.objectid = last;
@@ -828,14 +840,13 @@ next:
 					block_group->start + block_group->length,
 					NULL);
 out:
-	btrfs_free_path(path);
 	return ret;
 }
 
 static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg)
 {
-	clear_extent_bits(&bg->fs_info->excluded_extents, bg->start,
-			  bg->start + bg->length - 1, EXTENT_UPTODATE);
+	btrfs_clear_extent_bit(&bg->fs_info->excluded_extents, bg->start,
+			       bg->start + bg->length - 1, EXTENT_DIRTY, NULL);
 }
 
 static noinline void caching_thread(struct btrfs_work *work)
@@ -879,7 +890,7 @@
 	 */
 	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
 	    !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
-		ret = load_free_space_tree(caching_ctl);
+		ret = btrfs_load_free_space_tree(caching_ctl);
 	else
 		ret = load_extent_tree_free(caching_ctl);
 done:
@@ -1054,7 +1065,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_chunk_map *map)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_path *path;
+	BTRFS_PATH_AUTO_FREE(path);
 	struct btrfs_block_group *block_group;
 	struct btrfs_free_cluster *cluster;
 	struct inode *inode;
@@ -1237,7 +1248,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	 * another task to attempt to create another block group with the same
 	 * item key (and failing with -EEXIST and a transaction abort).
 	 */
-	ret = remove_block_group_free_space(trans, block_group);
+	ret = btrfs_remove_block_group_free_space(trans, block_group);
 	if (ret)
 		goto out;
 
@@ -1246,6 +1257,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 		goto out;
 
 	spin_lock(&block_group->lock);
+	/*
+	 * Hitting this WARN means we removed a block group with an unwritten
+	 * region. It will cause "unable to find chunk map for logical" errors.
+	 */
+	if (WARN_ON(has_unwritten_metadata(block_group)))
+		btrfs_warn(fs_info,
+			   "block group %llu is removed before metadata write out",
+			   block_group->start);
+
 	set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);
 
 	/*
@@ -1285,7 +1305,6 @@ out:
 	btrfs_put_block_group(block_group);
 	if (remove_rsv)
 		btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
-	btrfs_free_path(path);
 	return ret;
 }
 
@@ -1338,7 +1357,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
 * data in this block group. That check should be done by relocation routine,
 * not this function.
 */
-static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
+static int inc_block_group_ro(struct btrfs_block_group *cache, bool force)
 {
 	struct btrfs_space_info *sinfo = cache->space_info;
 	u64 num_bytes;
@@ -1383,8 +1402,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
 		 * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
 		 * leeway to allow us to mark this block group as read only.
 		 */
-		if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
-					 BTRFS_RESERVE_NO_FLUSH))
+		if (btrfs_can_overcommit(sinfo, num_bytes, BTRFS_RESERVE_NO_FLUSH))
 			ret = 0;
 	}
 
@@ -1405,7 +1423,7 @@ out:
 	if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
 		btrfs_info(cache->fs_info,
 			"unable to make block group %llu ro", cache->start);
-		btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
+		btrfs_dump_space_info(cache->space_info, 0, false);
 	}
 	return ret;
 }
@@ -1420,9 +1438,8 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
 	int ret;
 
 	spin_lock(&fs_info->trans_lock);
-	if (trans->transaction->list.prev != &fs_info->trans_list) {
-		prev_trans = list_last_entry(&trans->transaction->list,
-					     struct btrfs_transaction, list);
+	if (!list_is_first(&trans->transaction->list, &fs_info->trans_list)) {
+		prev_trans = list_prev_entry(trans->transaction, list);
 		refcount_inc(&prev_trans->use_count);
 	}
 	spin_unlock(&fs_info->trans_lock);
@@ -1439,14 +1456,14 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
 	 */
 	mutex_lock(&fs_info->unused_bg_unpin_mutex);
 	if (prev_trans) {
-		ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
-					EXTENT_DIRTY);
+		ret = btrfs_clear_extent_bit(&prev_trans->pinned_extents, start, end,
+					     EXTENT_DIRTY, NULL);
 		if (ret)
 			goto out;
 	}
 
-	ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
-				EXTENT_DIRTY);
+	ret = btrfs_clear_extent_bit(&trans->transaction->pinned_extents, start, end,
+				     EXTENT_DIRTY, NULL);
 out:
 	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
 	if (prev_trans)
@@ -1589,8 +1606,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 		 * needing to allocate extents from the block group.
 		 */
 		used = btrfs_space_info_used(space_info, true);
-		if (space_info->total_bytes - block_group->length < used &&
-		    block_group->zone_unusable < block_group->length) {
+		if ((space_info->total_bytes - block_group->length < used &&
+		     block_group->zone_unusable < block_group->length) ||
+		    has_unwritten_metadata(block_group)) {
 			/*
 			 * Add a reference for the list, compensate for the ref
 			 * drop under the "next" label for the
@@ -1619,8 +1637,10 @@
 			ret = btrfs_zone_finish(block_group);
 			if (ret < 0) {
 				btrfs_dec_block_group_ro(block_group);
-				if (ret == -EAGAIN)
+				if (ret == -EAGAIN) {
+					btrfs_link_bg_list(block_group, &retry_list);
 					ret = 0;
+				}
 				goto next;
 			}
 
@@ -1773,7 +1793,14 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
 	bg1 = list_entry(a, struct btrfs_block_group, bg_list);
 	bg2 = list_entry(b, struct btrfs_block_group, bg_list);
 
-	return bg1->used > bg2->used;
+	/*
+	 * Some other task may be updating the ->used field concurrently, but it
+	 * is not serious if we get a stale value or load/store tearing issues,
+	 * as sorting the list of block groups to reclaim is not critical and an
+	 * occasional imperfect order is ok. So silence KCSAN and avoid the
+	 * overhead of locking or any other synchronization.
+	 */
+	return data_race(bg1->used > bg2->used);
 }
 
 static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info)
@@ -1821,12 +1848,10 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
 	if (!btrfs_should_reclaim(fs_info))
 		return;
 
-	sb_start_write(fs_info->sb);
+	guard(super_write)(fs_info->sb);
 
-	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
-		sb_end_write(fs_info->sb);
+	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
 		return;
-	}
 
 	/*
 	 * Long running balances can keep us blocked here for eternity, so
@@ -1834,7 +1859,6 @@
 	 */
 	if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
 		btrfs_exclop_finish(fs_info);
-		sb_end_write(fs_info->sb);
 		return;
 	}
 
@@ -1846,7 +1870,6 @@
 	 */
 	list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
 	while (!list_empty(&fs_info->reclaim_bgs)) {
-		u64 zone_unusable;
 		u64 used;
 		u64 reserved;
 		int ret = 0;
@@ -1913,23 +1936,13 @@
 			goto next;
 		}
 
-		/*
-		 * Cache the zone_unusable value before turning the block group
-		 * to read only. As soon as the block group is read only it's
-		 * zone_unusable value gets moved to the block group's read-only
-		 * bytes and isn't available for calculations anymore. We also
-		 * cache it before unlocking the block group, to prevent races
-		 * (reports from KCSAN and such tools) with tasks updating it.
-		 */
-		zone_unusable = bg->zone_unusable;
 		spin_unlock(&bg->lock);
 		spin_unlock(&space_info->lock);
 
 		/*
 		 * Get out fast, in case we're read-only or unmounting the
 		 * filesystem. It is OK to drop block groups from the list even
-		 * for the read-only case. As we did sb_start_write(),
+		 * for the read-only case. As we did take the super write lock,
 		 * "mount -o remount,ro" won't happen and read-only filesystem
 		 * means it is forced read-only due to a fatal error. So, it
 		 * never gets back to read-write to let us reclaim again.
@@ -1953,7 +1966,7 @@
 		 * called, which is where we will transfer a reserved extent's
 		 * size from the "reserved" counter to the "used" counter - this
 		 * happens when running delayed references. When we relocate the
-		 * chunk below, relocation first flushes dellaloc, waits for
+		 * chunk below, relocation first flushes delalloc, waits for
 		 * ordered extent completion (which is where we create delayed
 		 * references for data extents) and commits the current
 		 * transaction (which runs delayed references), and only after
@@ -1966,14 +1979,8 @@
 		reserved = bg->reserved;
 		spin_unlock(&bg->lock);
 
-		btrfs_info(fs_info,
-	"reclaiming chunk %llu with %llu%% used %llu%% reserved %llu%% unusable",
-				bg->start,
-				div64_u64(used * 100, bg->length),
-				div64_u64(reserved * 100, bg->length),
-				div64_u64(zone_unusable * 100, bg->length));
 		trace_btrfs_reclaim_block_group(bg);
-		ret = btrfs_relocate_chunk(fs_info, bg->start);
+		ret = btrfs_relocate_chunk(fs_info, bg->start, false);
 		if (ret) {
 			btrfs_dec_block_group_ro(bg);
 			btrfs_err(fs_info, "error relocating chunk %llu",
@@ -2018,7 +2025,6 @@ end:
 	list_splice_tail(&retry_list, &fs_info->reclaim_bgs);
 	spin_unlock(&fs_info->unused_bgs_lock);
 	btrfs_exclop_finish(fs_info);
-	sb_end_write(fs_info->sb);
 }
 
 void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
@@ -2026,7 +2032,7 @@
 	btrfs_reclaim_sweep(fs_info);
 	spin_lock(&fs_info->unused_bgs_lock);
 	if (!list_empty(&fs_info->reclaim_bgs))
-		queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
+		queue_work(system_dfl_wq, &fs_info->reclaim_bgs_work);
 	spin_unlock(&fs_info->unused_bgs_lock);
 }
 
@@ -2059,7 +2065,7 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key
 		return -ENOENT;
 	}
 
-	if (map->start != key->objectid || map->chunk_len != key->offset) {
+	if (unlikely(map->start != key->objectid || map->chunk_len != key->offset)) {
 		btrfs_err(fs_info,
 "block group %llu len %llu mismatch with chunk %llu len %llu",
			  key->objectid, key->offset, map->start, map->chunk_len);
@@ -2072,7 +2078,7 @@
 	flags = btrfs_stack_block_group_flags(&bg) &
 		BTRFS_BLOCK_GROUP_TYPE_MASK;
 
-	if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+	if (unlikely(flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) {
 		btrfs_err(fs_info,
 "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
			  key->objectid, key->offset, flags,
@@ -2218,9 +2224,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
 	if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
 		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
 		cache->bytes_super += stripe_len;
-		ret = set_extent_bit(&fs_info->excluded_extents, cache->start,
-				     cache->start + stripe_len - 1,
-				     EXTENT_UPTODATE, NULL);
+		ret = btrfs_set_extent_bit(&fs_info->excluded_extents, cache->start,
+					   cache->start + stripe_len - 1,
+					   EXTENT_DIRTY, NULL);
 		if (ret)
 			return ret;
 	}
@@ -2233,7 +2239,7 @@
 			return ret;
 
 		/* Shouldn't have super stripes in sequential zones */
-		if (zoned && nr) {
+		if (unlikely(zoned && nr)) {
 			kfree(logical);
 			btrfs_err(fs_info,
 			"zoned: block group %llu must not contain super block",
@@ -2246,9 +2252,9 @@
 			       cache->start + cache->length - logical[nr]);
 
 		cache->bytes_super += len;
-		ret = set_extent_bit(&fs_info->excluded_extents, logical[nr],
-				     logical[nr] + len - 1,
-				     EXTENT_UPTODATE, NULL);
+		ret = btrfs_set_extent_bit(&fs_info->excluded_extents,
+					   logical[nr], logical[nr] + len - 1,
+					   EXTENT_DIRTY, NULL);
 		if (ret) {
 			kfree(logical);
 			return ret;
@@ -2324,7 +2330,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
 			break;
 
 		bg = btrfs_lookup_block_group(fs_info, map->start);
-		if (!bg) {
+		if (unlikely(!bg)) {
 			btrfs_err(fs_info,
 	"chunk start=%llu len=%llu doesn't have corresponding block group",
				  map->start, map->chunk_len);
@@ -2332,9 +2338,9 @@
 			btrfs_free_chunk_map(map);
 			break;
 		}
-		if (bg->start != map->start || bg->length != map->chunk_len ||
-		    (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
-		    (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+		if (unlikely(bg->start != map->start || bg->length != map->chunk_len ||
+			     (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
+			     (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) {
 			btrfs_err(fs_info,
 "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
				  map->start, map->chunk_len,
@@ -2373,8 +2379,9 @@ static int read_one_block_group(struct btrfs_fs_info *info,
 	cache->commit_used = cache->used;
 	cache->flags = btrfs_stack_block_group_flags(bgi);
 	cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
+	cache->space_info = btrfs_find_space_info(info, cache->flags);
 
-	set_free_space_tree_thresholds(cache);
+	btrfs_set_free_space_tree_thresholds(cache);
 
 	if (need_clear) {
 		/*
@@ -2451,6 +2458,7 @@
 		btrfs_remove_free_space_cache(cache);
 		goto error;
 	}
+
 	trace_btrfs_add_block_group(info, cache, 0);
 	btrfs_add_bg_to_space_info(info, cache);
 
@@ -2495,6 +2503,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
 		bg->cached = BTRFS_CACHE_FINISHED;
 		bg->used = map->chunk_len;
 		bg->flags = map->type;
+		bg->space_info = btrfs_find_space_info(fs_info, bg->flags);
 		ret = btrfs_add_block_group_cache(bg);
 		/*
 		 * We may have some valid block group cache added already, in
@@ -2791,7 +2800,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
 							 block_group->length);
 		if (ret)
 			btrfs_abort_transaction(trans, ret);
-		add_block_group_free_space(trans, block_group);
+		btrfs_add_block_group_free_space(trans, block_group);
 
 		/*
 		 * If we restriped during balance, we may have added a new raid
@@ -2824,7 +2833,7 @@ next:
 	 * space or none at all (due to no need to COW, extent buffers
 	 * were already COWed in the current transaction and still
 	 * unwritten, tree heights lower than the maximum possible
-	 * height, etc). For data we generally reserve the axact amount
+	 * height, etc). For data we generally reserve the exact amount
 	 * of space we are going to allocate later, the exception is
 	 * when using compression, as we must reserve space based on the
 	 * uncompressed data size, because the compression is only done
@@ -2868,8 +2877,8 @@ static u64 calculate_global_root_id(const struct btrfs_fs_info *fs_info, u64 off
 }
 
 struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
-						 u64 type,
-						 u64 chunk_offset, u64 size)
+						 struct btrfs_space_info *space_info,
+						 u64 type, u64 chunk_offset, u64 size)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_block_group *cache;
@@ -2889,7 +2898,7 @@
 	set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags);
 	cache->length = size;
-	set_free_space_tree_thresholds(cache);
+	btrfs_set_free_space_tree_thresholds(cache);
 	cache->flags = type;
 	cache->cached = BTRFS_CACHE_FINISHED;
 	cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
@@ -2923,7 +2932,7 @@
 	 * assigned to our block group. We want our bg to be added to the rbtree
 	 * with its ->space_info set.
 	 */
-	cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
+	cache->space_info = space_info;
 	ASSERT(cache->space_info);
 
 	ret = btrfs_add_block_group_cache(cache);
@@ -2968,6 +2977,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
 			     bool do_chunk_alloc)
 {
 	struct btrfs_fs_info *fs_info = cache->fs_info;
+	struct btrfs_space_info *space_info = cache->space_info;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = btrfs_block_group_root(fs_info);
 	u64 alloc_flags;
@@ -3020,7 +3030,7 @@
 	 */
 	alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
 	if (alloc_flags != cache->flags) {
-		ret = btrfs_chunk_alloc(trans, alloc_flags,
+		ret = btrfs_chunk_alloc(trans, space_info, alloc_flags,
					CHUNK_ALLOC_FORCE);
 		/*
 		 * ENOSPC is allowed here, we may have enough space
@@ -3048,15 +3058,15 @@
 	    (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM))
 		goto unlock_out;
 
-	alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
-	ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
+	alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
+	ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
 	if (ret < 0)
 		goto out;
 
 	/*
 	 * We have allocated a new chunk. We also need to activate that chunk to
 	 * grant metadata tickets for zoned filesystem.
 	 */
-	ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true);
+	ret = btrfs_zoned_activate_one_bg(space_info, true);
 	if (ret < 0)
 		goto out;
@@ -3232,7 +3242,7 @@ again:
 	 */
 	BTRFS_I(inode)->generation = 0;
 	ret = btrfs_update_inode(trans, BTRFS_I(inode));
-	if (ret) {
+	if (unlikely(ret)) {
 		/*
 		 * So theoretically we could recover from this, simply set the
 		 * super cache generation to 0 so we know to invalidate the
@@ -3635,9 +3645,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
 			wait_event(cur_trans->writer_wait,
				   atomic_read(&cur_trans->num_writers) == 1);
 			ret = update_block_group_item(trans, path, cache);
-		}
-		if (ret)
+			if (ret)
+				btrfs_abort_transaction(trans, ret);
+		} else if (ret) {
 			btrfs_abort_transaction(trans, ret);
+		}
 	}
 
 	/* If its not on the io list, we need to put the block group */
@@ -3738,8 +3750,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
 		spin_unlock(&cache->lock);
 		spin_unlock(&space_info->lock);
 
-		set_extent_bit(&trans->transaction->pinned_extents, bytenr,
-			       bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
+		btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr,
+				     bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
 	}
 
 	spin_lock(&trans->transaction->dirty_bgs_lock);
@@ -3785,7 +3797,7 @@
 * reservation and return -EAGAIN, otherwise this function always succeeds.
 */
 int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
-			     u64 ram_bytes, u64 num_bytes, int delalloc,
+			     u64 ram_bytes, u64 num_bytes, bool delalloc,
			     bool force_wrong_size_class)
 {
 	struct btrfs_space_info *space_info = cache->space_info;
@@ -3796,30 +3808,38 @@
 	spin_lock(&cache->lock);
 	if (cache->ro) {
 		ret = -EAGAIN;
-		goto out;
+		goto out_error;
 	}
 
 	if (btrfs_block_group_should_use_size_class(cache)) {
 		size_class = btrfs_calc_block_group_size_class(num_bytes);
 		ret = btrfs_use_block_group_size_class(cache, size_class,
						       force_wrong_size_class);
 		if (ret)
-			goto out;
+			goto out_error;
 	}
+
 	cache->reserved += num_bytes;
-	space_info->bytes_reserved += num_bytes;
+	if (delalloc)
+		cache->delalloc_bytes += num_bytes;
+
 	trace_btrfs_space_reservation(cache->fs_info, "space_info",
				      space_info->flags, num_bytes, 1);
+	spin_unlock(&cache->lock);
+
+	space_info->bytes_reserved += num_bytes;
 	btrfs_space_info_update_bytes_may_use(space_info, -ram_bytes);
-	if (delalloc)
-		cache->delalloc_bytes += num_bytes;
 
 	/*
 	 * Compression can use less space than we reserved, so wake tickets if
 	 * that happens.
 	 */
 	if (num_bytes < ram_bytes)
-		btrfs_try_granting_tickets(cache->fs_info, space_info);
-out:
+		btrfs_try_granting_tickets(space_info);
+	spin_unlock(&space_info->lock);
+
+	return 0;
+
+out_error:
 	spin_unlock(&cache->lock);
 	spin_unlock(&space_info->lock);
 	return ret;
@@ -3828,35 +3848,38 @@ out:
 /*
  * Update the block_group and space info counters.
  *
- * @cache: The cache we are manipulating
- * @num_bytes: The number of bytes in question
- * @delalloc: The blocks are allocated for the delalloc write
+ * @cache: The cache we are manipulating.
+ * @num_bytes: The number of bytes in question.
+ * @is_delalloc: Whether the blocks are allocated for a delalloc write.
  *
  * This is called by somebody who is freeing space that was never actually used
  * on disk. For example if you reserve some space for a new leaf in transaction
  * A and before transaction A commits you free that leaf, you call this with
  * reserve set to 0 in order to clear the reservation.
  */
-void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
-			       u64 num_bytes, int delalloc)
+void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes,
+			       bool is_delalloc)
 {
 	struct btrfs_space_info *space_info = cache->space_info;
+	bool bg_ro;
 
 	spin_lock(&space_info->lock);
 	spin_lock(&cache->lock);
-	if (cache->ro)
+	bg_ro = cache->ro;
+	cache->reserved -= num_bytes;
+	if (is_delalloc)
+		cache->delalloc_bytes -= num_bytes;
+	spin_unlock(&cache->lock);
+
+	if (bg_ro)
 		space_info->bytes_readonly += num_bytes;
 	else if (btrfs_is_zoned(cache->fs_info))
 		space_info->bytes_zone_unusable += num_bytes;
-	cache->reserved -= num_bytes;
+
 	space_info->bytes_reserved -= num_bytes;
 	space_info->max_extent_size = 0;
 
-	if (delalloc)
-		cache->delalloc_bytes -= num_bytes;
-	spin_unlock(&cache->lock);
-
-	btrfs_try_granting_tickets(cache->fs_info, space_info);
+	btrfs_try_granting_tickets(space_info);
 	spin_unlock(&space_info->lock);
 }
@@ -3871,14 +3894,14 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
 	}
 }
 
-static int should_alloc_chunk(const struct btrfs_fs_info *fs_info,
-			      const struct btrfs_space_info *sinfo, int force)
+static bool should_alloc_chunk(const struct btrfs_fs_info *fs_info,
+			       const struct btrfs_space_info *sinfo, int force)
 {
 	u64 bytes_used = btrfs_space_info_used(sinfo, false);
 	u64 thresh;
 
 	if (force == CHUNK_ALLOC_FORCE)
-		return 1;
+		return true;
 
 	/*
 	 * in limited mode, we want to have some free space up to
@@ -3889,22 +3912,31 @@
 		thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1));
 
 		if (sinfo->total_bytes - bytes_used < thresh)
-			return 1;
+			return true;
 	}
 
 	if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80))
-		return 0;
-	return 1;
+		return false;
+	return true;
 }
 
 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
 {
 	u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
+	struct btrfs_space_info *space_info;
 
-	return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
+	space_info = btrfs_find_space_info(trans->fs_info, type);
+	if (!space_info) {
+		DEBUG_WARN();
+		return -EINVAL;
+	}
+
+	return btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
 }
 
-static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
+static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans,
+						struct btrfs_space_info *space_info,
+						u64 flags)
 {
 	struct btrfs_block_group *bg;
 	int ret;
@@ -3917,7 +3949,7 @@
 	 */
 	check_system_chunk(trans, flags);
 
-	bg = btrfs_create_chunk(trans, flags);
+	bg = btrfs_create_chunk(trans, space_info, flags);
 	if (IS_ERR(bg)) {
 		ret = PTR_ERR(bg);
 		goto out;
@@ -3965,8 +3997,16 @@
 	if (ret == -ENOSPC) {
 		const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
 		struct btrfs_block_group *sys_bg;
+		struct btrfs_space_info *sys_space_info;
 
-		sys_bg = btrfs_create_chunk(trans, sys_flags);
+		sys_space_info = btrfs_find_space_info(trans->fs_info, sys_flags);
+		if (unlikely(!sys_space_info)) {
+			ret = -EINVAL;
+			btrfs_abort_transaction(trans, ret);
+			goto out;
+		}
+
+		sys_bg = btrfs_create_chunk(trans, sys_space_info, sys_flags);
 		if (IS_ERR(sys_bg)) {
 			ret = PTR_ERR(sys_bg);
 			btrfs_abort_transaction(trans, ret);
			goto out;
		}
 
 		ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
-		if (ret) {
+		if (unlikely(ret)) {
 			btrfs_abort_transaction(trans, ret);
 			goto out;
 		}
 
 		ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
-		if (ret) {
+		if (unlikely(ret)) {
 			btrfs_abort_transaction(trans, ret);
 			goto out;
 		}
-	} else if (ret) {
+	} else if (unlikely(ret)) {
 		btrfs_abort_transaction(trans, ret);
 		goto out;
 	}
@@ -4097,6 +4137,8 @@ out:
  *
 * This function, btrfs_chunk_alloc(), belongs to phase 1.
 *
+ * @space_info: specify which space_info the new chunk should belong to.
+ *
 * If @force is CHUNK_ALLOC_FORCE:
 * - return 1 if it successfully allocates a chunk,
 * - return errors including -ENOSPC otherwise.
@@ -4105,11 +4147,11 @@
 * - return 1 if it successfully allocates a chunk,
 * - return errors including -ENOSPC otherwise.
 */
-int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
+int btrfs_chunk_alloc(struct btrfs_trans_handle *trans,
+		      struct btrfs_space_info *space_info, u64 flags,
		      enum btrfs_chunk_alloc_enum force)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_space_info *space_info;
 	struct btrfs_block_group *ret_bg;
 	bool wait_for_alloc = false;
 	bool should_alloc = false;
@@ -4148,9 +4190,6 @@
 	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
 		return -ENOSPC;
 
-	space_info = btrfs_find_space_info(fs_info, flags);
-	ASSERT(space_info);
-
 	do {
 		spin_lock(&space_info->lock);
 		if (force < space_info->force_alloc)
@@ -4158,11 +4197,11 @@
 		should_alloc = should_alloc_chunk(fs_info, space_info, force);
 		if (space_info->full) {
 			/* No more free physical space */
+			spin_unlock(&space_info->lock);
 			if (should_alloc)
 				ret = -ENOSPC;
 			else
 				ret = 0;
-			spin_unlock(&space_info->lock);
 			return ret;
 		} else if (!should_alloc) {
 			spin_unlock(&space_info->lock);
@@ -4174,16 +4213,16 @@
 			 * recheck if we should continue with our allocation
 			 * attempt.
 			 */
+			spin_unlock(&space_info->lock);
 			wait_for_alloc = true;
 			force = CHUNK_ALLOC_NO_FORCE;
-			spin_unlock(&space_info->lock);
 			mutex_lock(&fs_info->chunk_mutex);
 			mutex_unlock(&fs_info->chunk_mutex);
 		} else {
 			/* Proceed with allocation */
-			space_info->chunk_alloc = 1;
-			wait_for_alloc = false;
+			space_info->chunk_alloc = true;
 			spin_unlock(&space_info->lock);
+			wait_for_alloc = false;
 		}
 
 		cond_resched();
@@ -4211,7 +4250,7 @@
 			force_metadata_allocation(fs_info);
 	}
 
-	ret_bg = do_chunk_alloc(trans, flags);
+	ret_bg = do_chunk_alloc(trans, space_info, flags);
 	trans->allocating_chunk = false;
 
 	if (IS_ERR(ret_bg)) {
@@ -4230,7 +4269,7 @@
 	spin_lock(&space_info->lock);
 	if (ret < 0) {
 		if (ret == -ENOSPC)
-			space_info->full = 1;
+			space_info->full = true;
 		else
 			goto out;
 	} else {
@@ -4240,7 +4279,7 @@
 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
 out:
-	space_info->chunk_alloc = 0;
+	space_info->chunk_alloc = false;
 	spin_unlock(&space_info->lock);
 	mutex_unlock(&fs_info->chunk_mutex);
 
@@ -4281,12 +4320,16 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans,
 	if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
 		btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
			   left, bytes, type);
-		btrfs_dump_space_info(fs_info, info, 0, 0);
+		btrfs_dump_space_info(info, 0, false);
 	}
 
 	if (left < bytes) {
 		u64 flags = btrfs_system_alloc_profile(fs_info);
 		struct btrfs_block_group *bg;
+		struct btrfs_space_info *space_info;
+
+		space_info = btrfs_find_space_info(fs_info, flags);
+		ASSERT(space_info);
 
 		/*
 		 * Ignore failure to create system chunk. We might end up not
@@ -4294,7 +4337,7 @@
 		 * the paths we visit in the chunk tree (they were already COWed
 		 * or created in the current transaction for example).
 		 */
-		bg = btrfs_create_chunk(trans, flags);
+		bg = btrfs_create_chunk(trans, space_info, flags);
 		if (IS_ERR(bg)) {
 			ret = PTR_ERR(bg);
 		} else {
@@ -4302,7 +4345,7 @@
 			 * We have a new chunk. We also need to activate it for
 			 * zoned filesystem.
 			 */
-			ret = btrfs_zoned_activate_one_bg(fs_info, info, true);
+			ret = btrfs_zoned_activate_one_bg(info, true);
 			if (ret < 0)
 				return;
 
@@ -4402,6 +4445,43 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
 	}
 }
 
+static void check_removing_space_info(struct btrfs_space_info *space_info)
+{
+	struct btrfs_fs_info *info = space_info->fs_info;
+
+	if (space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY) {
+		/* This is a top space_info, proceed with its children first. */
+		for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) {
+			if (space_info->sub_group[i]) {
+				check_removing_space_info(space_info->sub_group[i]);
+				kfree(space_info->sub_group[i]);
+				space_info->sub_group[i] = NULL;
+			}
+		}
+	}
+
+	/*
+	 * Do not hide this behind enospc_debug, this is actually important and
+	 * indicates a real bug if this happens.
+	 */
+	if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_may_use > 0))
+		btrfs_dump_space_info(space_info, 0, false);
+
+	/*
+	 * If there was a failure to cleanup a log tree, very likely due to an
+	 * IO failure on a writeback attempt of one or more of its extent
+	 * buffers, we could not do proper (and cheap) unaccounting of their
+	 * reserved space, so don't warn on bytes_reserved > 0 in that case.
+	 */
+	if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+	    !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
+		if (WARN_ON(space_info->bytes_reserved > 0))
+			btrfs_dump_space_info(space_info, 0, false);
+	}
+
+	WARN_ON(space_info->reclaim_size > 0);
+}
+
 /*
 * Must be called only after stopping all workers, since we could have block
 * group caching kthreads running, and therefore they could race with us if we
@@ -4427,8 +4507,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 	write_lock(&info->block_group_cache_lock);
 	while (!list_empty(&info->caching_block_groups)) {
-		caching_ctl = list_entry(info->caching_block_groups.next,
-					 struct btrfs_caching_control, list);
+		caching_ctl = list_first_entry(&info->caching_block_groups,
+					       struct btrfs_caching_control, list);
 		list_del(&caching_ctl->list);
 		btrfs_put_caching_control(caching_ctl);
 	}
@@ -4499,32 +4579,10 @@
 	btrfs_release_global_block_rsv(info);
 
 	while (!list_empty(&info->space_info)) {
-		space_info = list_entry(info->space_info.next,
-					struct btrfs_space_info,
-					list);
-
-		/*
-		 * Do not hide this behind enospc_debug, this is actually
-		 * important and indicates a real bug if this happens.
-		 */
-		if (WARN_ON(space_info->bytes_pinned > 0 ||
-			    space_info->bytes_may_use > 0))
-			btrfs_dump_space_info(info, space_info, 0, 0);
-
-		/*
-		 * If there was a failure to cleanup a log tree, very likely due
-		 * to an IO failure on a writeback attempt of one or more of its
-		 * extent buffers, we could not do proper (and cheap) unaccounting
-		 * of their reserved space, so don't warn on bytes_reserved > 0 in
-		 * that case.
-		 */
-		if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
-		    !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
-			if (WARN_ON(space_info->bytes_reserved > 0))
-				btrfs_dump_space_info(info, space_info, 0, 0);
-		}
+		space_info = list_first_entry(&info->space_info,
+					      struct btrfs_space_info, list);
 
-		WARN_ON(space_info->reclaim_size > 0);
+		check_removing_space_info(space_info);
 		list_del(&space_info->list);
 		btrfs_sysfs_remove_space_info(space_info);
 	}
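Note: as a quick illustration of the write-pointer check introduced at the top of this diff, here is a minimal, self-contained C sketch of the same decision. The struct and function names below are invented for the example; in the kernel the fields live in struct btrfs_block_group and the zoned-mode test is btrfs_is_zoned(block_group->fs_info).

#include <stdbool.h>
#include <stdint.h>

/* Simplified stand-in for struct btrfs_block_group (illustration only). */
struct example_bg {
	uint64_t start;              /* logical start of the block group */
	uint64_t alloc_offset;       /* bytes handed out by the allocator */
	uint64_t meta_write_pointer; /* highest byte actually written to the zone */
	bool     zoned;              /* btrfs_is_zoned(fs_info) in the kernel */
	bool     is_data;            /* BTRFS_BLOCK_GROUP_DATA flag */
};

/*
 * Same logic as the new has_unwritten_metadata() helper: on a zoned
 * filesystem, a metadata block group still has unwritten extent buffers
 * whenever allocation has advanced past the device write pointer.
 */
static bool example_has_unwritten_metadata(const struct example_bg *bg)
{
	if (!bg->zoned)
		return false;	/* the write pointer only exists on zoned setups */
	if (bg->is_data)
		return false;	/* only metadata block groups are checked */
	return bg->start + bg->alloc_offset > bg->meta_write_pointer;
}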
