summaryrefslogtreecommitdiff
path: root/fs/btrfs/disk-io.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r--fs/btrfs/disk-io.c316
1 files changed, 131 insertions, 185 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f09db62e61a1..1beb9458f622 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -182,22 +182,22 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
int mirror_num)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int num_folios = num_extent_folios(eb);
int ret = 0;
if (sb_rdonly(fs_info->sb))
return -EROFS;
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio = eb->folios[i];
u64 start = max_t(u64, eb->start, folio_pos(folio));
u64 end = min_t(u64, eb->start + eb->len,
folio_pos(folio) + eb->folio_size);
u32 len = end - start;
+ phys_addr_t paddr = PFN_PHYS(folio_pfn(folio)) +
+ offset_in_folio(folio, start);
- ret = btrfs_repair_io_failure(fs_info, 0, start, len,
- start, folio, offset_in_folio(folio, start),
- mirror_num);
+ ret = btrfs_repair_io_failure(fs_info, 0, start, len, start,
+ paddr, mirror_num);
if (ret)
break;
}
@@ -225,7 +225,6 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb,
ASSERT(check);
while (1) {
- clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
ret = read_extent_buffer_pages(eb, mirror_num, check);
if (!ret)
break;
@@ -257,7 +256,7 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb,
/*
* Checksum a dirty tree block before IO.
*/
-blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
+int btree_csum_one_bio(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
struct btrfs_fs_info *fs_info = eb->fs_info;
@@ -268,9 +267,9 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
/* Btree blocks are always contiguous on disk. */
if (WARN_ON_ONCE(bbio->file_offset != eb->start))
- return BLK_STS_IOERR;
+ return -EIO;
if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len))
- return BLK_STS_IOERR;
+ return -EIO;
/*
* If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't
@@ -279,14 +278,13 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
*/
if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) {
memzero_extent_buffer(eb, 0, eb->len);
- return BLK_STS_OK;
+ return 0;
}
if (WARN_ON_ONCE(found_start != eb->start))
- return BLK_STS_IOERR;
- if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0],
- eb->start, eb->len)))
- return BLK_STS_IOERR;
+ return -EIO;
+ if (WARN_ON(!btrfs_meta_folio_test_uptodate(eb->folios[0], eb)))
+ return -EIO;
ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
offsetof(struct btrfs_header, fsid),
@@ -314,7 +312,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
goto error;
}
write_extent_buffer(eb, result, 0, fs_info->csum_size);
- return BLK_STS_OK;
+ return 0;
error:
btrfs_print_tree(eb, 0);
@@ -328,7 +326,7 @@ error:
*/
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID);
- return errno_to_blk_status(ret);
+ return ret;
}
static bool check_tree_block_fsid(struct extent_buffer *eb)
@@ -454,15 +452,9 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
goto out;
}
- /*
- * If this is a leaf block and it is corrupt, set the corrupt bit so
- * that we don't try and read the other copies of this block, just
- * return -EIO.
- */
- if (found_level == 0 && btrfs_check_leaf(eb)) {
- set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
+ /* If this is a leaf block and it is corrupt, just return -EIO. */
+ if (found_level == 0 && btrfs_check_leaf(eb))
ret = -EIO;
- }
if (found_level > 0 && btrfs_check_node(eb))
ret = -EIO;
@@ -643,11 +635,16 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
}
-static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
- u64 objectid)
+static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
+ u64 objectid, gfp_t flags)
{
+ struct btrfs_root *root;
bool dummy = btrfs_is_testing(fs_info);
+ root = kzalloc(sizeof(*root), flags);
+ if (!root)
+ return NULL;
+
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
@@ -700,10 +697,10 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
btrfs_set_root_last_log_commit(root, 0);
root->anon_dev = 0;
if (!dummy) {
- extent_io_tree_init(fs_info, &root->dirty_log_pages,
- IO_TREE_ROOT_DIRTY_LOG_PAGES);
- extent_io_tree_init(fs_info, &root->log_csum_range,
- IO_TREE_LOG_CSUM_RANGE);
+ btrfs_extent_io_tree_init(fs_info, &root->dirty_log_pages,
+ IO_TREE_ROOT_DIRTY_LOG_PAGES);
+ btrfs_extent_io_tree_init(fs_info, &root->log_csum_range,
+ IO_TREE_LOG_CSUM_RANGE);
}
spin_lock_init(&root->root_item_lock);
@@ -714,14 +711,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
list_add_tail(&root->leak_list, &fs_info->allocated_roots);
spin_unlock(&fs_info->fs_roots_radix_lock);
#endif
-}
-static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
- u64 objectid, gfp_t flags)
-{
- struct btrfs_root *root = kzalloc(sizeof(*root), flags);
- if (root)
- __setup_root(root, fs_info, objectid);
return root;
}
@@ -1089,21 +1079,22 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
const struct btrfs_key *key)
{
struct btrfs_root *root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
path = btrfs_alloc_path();
if (!path)
return ERR_PTR(-ENOMEM);
root = read_tree_root_path(tree_root, path, key);
- btrfs_free_path(path);
return root;
}
/*
- * Initialize subvolume root in-memory structure
+ * Initialize subvolume root in-memory structure.
*
* @anon_dev: anonymous device to attach to the root, if zero, allocate new
+ *
+ * In case of failure the caller is responsible to call btrfs_free_fs_root()
*/
static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
{
@@ -1127,7 +1118,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
if (!anon_dev) {
ret = get_anon_bdev(&root->anon_dev);
if (ret)
- goto fail;
+ return ret;
} else {
root->anon_dev = anon_dev;
}
@@ -1137,7 +1128,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
ret = btrfs_init_root_free_objectid(root);
if (ret) {
mutex_unlock(&root->objectid_mutex);
- goto fail;
+ return ret;
}
ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
@@ -1145,9 +1136,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
mutex_unlock(&root->objectid_mutex);
return 0;
-fail:
- /* The caller is responsible to call btrfs_free_fs_root */
- return ret;
}
static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
@@ -1568,7 +1556,7 @@ static int transaction_kthread(void *arg)
do {
cannot_commit = false;
- delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
+ delay = secs_to_jiffies(fs_info->commit_interval);
mutex_lock(&fs_info->transaction_kthread_mutex);
spin_lock(&fs_info->trans_lock);
@@ -1583,9 +1571,9 @@ static int transaction_kthread(void *arg)
cur->state < TRANS_STATE_COMMIT_PREP &&
delta < fs_info->commit_interval) {
spin_unlock(&fs_info->trans_lock);
- delay -= msecs_to_jiffies((delta - 1) * 1000);
+ delay -= secs_to_jiffies(delta - 1);
delay = min(delay,
- msecs_to_jiffies(fs_info->commit_interval * 1000));
+ secs_to_jiffies(fs_info->commit_interval));
goto sleep;
}
transid = cur->transid;
@@ -1867,8 +1855,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
int i;
while (!list_empty(&fs_info->dead_roots)) {
- gang[0] = list_entry(fs_info->dead_roots.next,
- struct btrfs_root, root_list);
+ gang[0] = list_first_entry(&fs_info->dead_roots,
+ struct btrfs_root, root_list);
list_del(&gang[0]->root_list);
if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
@@ -1931,9 +1919,9 @@ static int btrfs_init_btree_inode(struct super_block *sb)
inode->i_mapping->a_ops = &btree_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
- extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
- IO_TREE_BTREE_INODE_IO);
- extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
+ btrfs_extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
+ IO_TREE_BTREE_INODE_IO);
+ btrfs_extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
@@ -2006,7 +1994,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan",
ordered_flags);
fs_info->discard_ctl.discard_workers =
- alloc_ordered_workqueue("btrfs_discard", WQ_FREEZABLE);
+ alloc_ordered_workqueue("btrfs-discard", WQ_FREEZABLE);
if (!(fs_info->workers &&
fs_info->delalloc_workers && fs_info->flush_workers &&
@@ -2200,8 +2188,8 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
static int load_global_roots(struct btrfs_root *tree_root)
{
- struct btrfs_path *path;
- int ret = 0;
+ BTRFS_PATH_AUTO_FREE(path);
+ int ret;
path = btrfs_alloc_path();
if (!path)
@@ -2210,18 +2198,17 @@ static int load_global_roots(struct btrfs_root *tree_root)
ret = load_global_roots_objectid(tree_root, path,
BTRFS_EXTENT_TREE_OBJECTID, "extent");
if (ret)
- goto out;
+ return ret;
ret = load_global_roots_objectid(tree_root, path,
BTRFS_CSUM_TREE_OBJECTID, "csum");
if (ret)
- goto out;
+ return ret;
if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE))
- goto out;
+ return ret;
ret = load_global_roots_objectid(tree_root, path,
BTRFS_FREE_SPACE_TREE_OBJECTID,
"free space");
-out:
- btrfs_free_path(path);
+
return ret;
}
@@ -2447,21 +2434,27 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
* Check sectorsize and nodesize first, other check will need it.
* Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
*/
- if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
+ if (!is_power_of_2(sectorsize) || sectorsize < BTRFS_MIN_BLOCKSIZE ||
sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
ret = -EINVAL;
}
/*
- * We only support at most two sectorsizes: 4K and PAGE_SIZE.
+ * We only support at most 3 sectorsizes: 4K, PAGE_SIZE, MIN_BLOCKSIZE.
+ *
+ * For 4K page sized systems with non-debug builds, all 3 matches (4K).
+ * For 4K page sized systems with debug builds, there are two block sizes
+ * supported. (4K and 2K)
*
* We can support 16K sectorsize with 64K page size without problem,
* but such sectorsize/pagesize combination doesn't make much sense.
* 4K will be our future standard, PAGE_SIZE is supported from the very
* beginning.
*/
- if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) {
+ if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K &&
+ sectorsize != PAGE_SIZE &&
+ sectorsize != BTRFS_MIN_BLOCKSIZE)) {
btrfs_err(fs_info,
"sectorsize %llu not yet supported for page size %lu",
sectorsize, PAGE_SIZE);
@@ -2561,6 +2554,9 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
+ if (ret)
+ return ret;
+
ret = validate_sys_chunk_array(fs_info, sb);
/*
@@ -2765,10 +2761,21 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
return ret;
}
+/*
+ * Lockdep gets confused between our buffer_tree which requires IRQ locking because
+ * we modify marks in the IRQ context, and our delayed inode xarray which doesn't
+ * have these requirements. Use a class key so lockdep doesn't get them mixed up.
+ */
+static struct lock_class_key buffer_xa_class;
+
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
{
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
- INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
+
+ /* Use the same flags as mapping->i_pages. */
+ xa_init_flags(&fs_info->buffer_tree, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
+ lockdep_set_class(&fs_info->buffer_tree.xa_lock, &buffer_xa_class);
+
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->delayed_iputs);
@@ -2780,7 +2787,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
spin_lock_init(&fs_info->super_lock);
- spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
spin_lock_init(&fs_info->treelog_bg_lock);
spin_lock_init(&fs_info->zone_active_bgs_lock);
@@ -2825,6 +2831,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
BTRFS_BLOCK_RSV_GLOBAL);
btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
+ btrfs_init_block_rsv(&fs_info->treelog_rsv, BTRFS_BLOCK_RSV_TREELOG);
btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
BTRFS_BLOCK_RSV_DELOPS);
@@ -2858,8 +2865,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
rwlock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT_CACHED;
- extent_io_tree_init(fs_info, &fs_info->excluded_extents,
- IO_TREE_FS_EXCLUDED_EXTENTS);
+ btrfs_extent_io_tree_init(fs_info, &fs_info->excluded_extents,
+ IO_TREE_FS_EXCLUDED_EXTENTS);
mutex_init(&fs_info->ordered_operations_mutex);
mutex_init(&fs_info->tree_log_mutex);
@@ -3311,7 +3318,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
/*
* Read super block and check the signature bytes only
*/
- disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
+ disk_super = btrfs_read_disk_super(fs_devices->latest_dev->bdev, 0, false);
if (IS_ERR(disk_super)) {
ret = PTR_ERR(disk_super);
goto fail_alloc;
@@ -3390,7 +3397,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->nodesize = nodesize;
fs_info->sectorsize = sectorsize;
fs_info->sectorsize_bits = ilog2(sectorsize);
- fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits);
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
fs_info->fs_devices->fs_info = fs_info;
@@ -3416,11 +3422,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
*/
fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);
- if (sectorsize < PAGE_SIZE)
- btrfs_warn(fs_info,
- "read-write for sector size %u with page size %lu is experimental",
- sectorsize, PAGE_SIZE);
-
ret = btrfs_init_workqueues(fs_info);
if (ret)
goto fail_sb_buffer;
@@ -3712,85 +3713,6 @@ static void btrfs_end_super_write(struct bio *bio)
bio_put(bio);
}
-struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
- int copy_num, bool drop_cache)
-{
- struct btrfs_super_block *super;
- struct page *page;
- u64 bytenr, bytenr_orig;
- struct address_space *mapping = bdev->bd_mapping;
- int ret;
-
- bytenr_orig = btrfs_sb_offset(copy_num);
- ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
- if (ret == -ENOENT)
- return ERR_PTR(-EINVAL);
- else if (ret)
- return ERR_PTR(ret);
-
- if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
- return ERR_PTR(-EINVAL);
-
- if (drop_cache) {
- /* This should only be called with the primary sb. */
- ASSERT(copy_num == 0);
-
- /*
- * Drop the page of the primary superblock, so later read will
- * always read from the device.
- */
- invalidate_inode_pages2_range(mapping,
- bytenr >> PAGE_SHIFT,
- (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT);
- }
-
- page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
- if (IS_ERR(page))
- return ERR_CAST(page);
-
- super = page_address(page);
- if (btrfs_super_magic(super) != BTRFS_MAGIC) {
- btrfs_release_disk_super(super);
- return ERR_PTR(-ENODATA);
- }
-
- if (btrfs_super_bytenr(super) != bytenr_orig) {
- btrfs_release_disk_super(super);
- return ERR_PTR(-EINVAL);
- }
-
- return super;
-}
-
-
-struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
-{
- struct btrfs_super_block *super, *latest = NULL;
- int i;
- u64 transid = 0;
-
- /* we would like to check all the supers, but that would make
- * a btrfs mount succeed after a mkfs from a different FS.
- * So, we need to add a special mount option to scan for
- * later supers, using BTRFS_SUPER_MIRROR_MAX instead
- */
- for (i = 0; i < 1; i++) {
- super = btrfs_read_dev_one_super(bdev, i, false);
- if (IS_ERR(super))
- continue;
-
- if (!latest || btrfs_super_generation(super) > transid) {
- if (latest)
- btrfs_release_disk_super(super);
-
- latest = super;
- transid = btrfs_super_generation(super);
- }
- }
-
- return super;
-}
-
/*
* Write superblock @sb to the @device. Do not wait for completion, all the
* folios we use for writing are locked.
@@ -3830,8 +3752,8 @@ static int write_dev_supers(struct btrfs_device *device,
continue;
} else if (ret < 0) {
btrfs_err(device->fs_info,
- "couldn't get super block location for mirror %d",
- i);
+ "couldn't get super block location for mirror %d error %d",
+ i, ret);
atomic_inc(&device->sb_write_errors);
continue;
}
@@ -3850,12 +3772,11 @@ static int write_dev_supers(struct btrfs_device *device,
GFP_NOFS);
if (IS_ERR(folio)) {
btrfs_err(device->fs_info,
- "couldn't get super block page for bytenr %llu",
- bytenr);
+ "couldn't get super block page for bytenr %llu error %ld",
+ bytenr, PTR_ERR(folio));
atomic_inc(&device->sb_write_errors);
continue;
}
- ASSERT(folio_order(folio) == 0);
offset = offset_in_folio(folio, bytenr);
disk_super = folio_address(folio) + offset;
@@ -3928,7 +3849,6 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
/* If the folio has been removed, then we know it completed. */
if (IS_ERR(folio))
continue;
- ASSERT(folio_order(folio) == 0);
/* Folio will be unlocked once the write completes. */
folio_wait_locked(folio);
@@ -4248,8 +4168,9 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
u64 found_end;
found = true;
- while (find_first_extent_bit(&trans->dirty_pages, cur,
- &found_start, &found_end, EXTENT_DIRTY, &cached)) {
+ while (btrfs_find_first_extent_bit(&trans->dirty_pages, cur,
+ &found_start, &found_end,
+ EXTENT_DIRTY, &cached)) {
dirty_bytes += found_end + 1 - found_start;
cur = found_end + 1;
}
@@ -4326,6 +4247,14 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_cleanup_defrag_inodes(fs_info);
/*
+ * Handle the error fs first, as it will flush and wait for all ordered
+ * extents. This will generate delayed iputs, thus we want to handle
+ * it first.
+ */
+ if (unlikely(BTRFS_FS_ERROR(fs_info)))
+ btrfs_error_commit_super(fs_info);
+
+ /*
* Wait for any fixup workers to complete.
* If we don't wait for them here and they are still running by the time
* we call kthread_stop() against the cleaner kthread further below, we
@@ -4346,6 +4275,31 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_flush_workqueue(fs_info->delalloc_workers);
/*
+ * We can have ordered extents getting their last reference dropped from
+ * the fs_info->workers queue because for async writes for data bios we
+ * queue a work for that queue, at btrfs_wq_submit_bio(), that runs
+ * run_one_async_done() which calls btrfs_bio_end_io() in case the bio
+ * has an error, and that later function can do the final
+ * btrfs_put_ordered_extent() on the ordered extent attached to the bio,
+ * which adds a delayed iput for the inode. So we must flush the queue
+ * so that we don't have delayed iputs after committing the current
+ * transaction below and stopping the cleaner and transaction kthreads.
+ */
+ btrfs_flush_workqueue(fs_info->workers);
+
+ /*
+ * When finishing a compressed write bio we schedule a work queue item
+ * to finish an ordered extent - btrfs_finish_compressed_write_work()
+ * calls btrfs_finish_ordered_extent() which in turns does a call to
+ * btrfs_queue_ordered_fn(), and that queues the ordered extent
+ * completion either in the endio_write_workers work queue or in the
+ * fs_info->endio_freespace_worker work queue. We flush those queues
+ * below, so before we flush them we must flush this queue for the
+ * workers of compressed writes.
+ */
+ flush_workqueue(fs_info->compressed_write_workers);
+
+ /*
* After we parked the cleaner kthread, ordered extents may have
* completed and created new delayed iputs. If one of the async reclaim
* tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
@@ -4369,6 +4323,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
/* Ordered extents for free space inodes. */
btrfs_flush_workqueue(fs_info->endio_freespace_worker);
btrfs_run_delayed_iputs(fs_info);
+ /* There should be no more workload to generate new delayed iputs. */
+ set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state);
cancel_work_sync(&fs_info->async_reclaim_work);
cancel_work_sync(&fs_info->async_data_reclaim_work);
@@ -4403,9 +4359,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_err(fs_info, "commit super ret %d", ret);
}
- if (BTRFS_FS_ERROR(fs_info))
- btrfs_error_commit_super(fs_info);
-
kthread_stop(fs_info->transaction_kthread);
kthread_stop(fs_info->cleaner_kthread);
@@ -4413,7 +4366,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
if (btrfs_check_quota_leak(fs_info)) {
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ DEBUG_WARN("qgroup reserved space leaked");
btrfs_err(fs_info, "qgroup reserved space leaked");
}
@@ -4528,10 +4481,6 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
/* cleanup FS via transaction */
btrfs_cleanup_transaction(fs_info);
- mutex_lock(&fs_info->cleaner_mutex);
- btrfs_run_delayed_iputs(fs_info);
- mutex_unlock(&fs_info->cleaner_mutex);
-
down_write(&fs_info->cleanup_work_sem);
up_write(&fs_info->cleanup_work_sem);
}
@@ -4674,9 +4623,9 @@ static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
u64 start = 0;
u64 end;
- while (find_first_extent_bit(dirty_pages, start, &start, &end,
- mark, NULL)) {
- clear_extent_bits(dirty_pages, start, end, mark);
+ while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end,
+ mark, NULL)) {
+ btrfs_clear_extent_bits(dirty_pages, start, end, mark);
while (start <= end) {
eb = find_extent_buffer(fs_info, start);
start += fs_info->nodesize;
@@ -4709,14 +4658,14 @@ static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
* the same extent range.
*/
mutex_lock(&fs_info->unused_bg_unpin_mutex);
- if (!find_first_extent_bit(unpin, 0, &start, &end,
- EXTENT_DIRTY, &cached_state)) {
+ if (!btrfs_find_first_extent_bit(unpin, 0, &start, &end,
+ EXTENT_DIRTY, &cached_state)) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
}
- clear_extent_dirty(unpin, start, end, &cached_state);
- free_extent_state(cached_state);
+ btrfs_clear_extent_dirty(unpin, start, end, &cached_state);
+ btrfs_free_extent_state(cached_state);
btrfs_error_unpin_extent_range(fs_info, start, end);
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
cond_resched();
@@ -4902,7 +4851,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
int btrfs_init_root_free_objectid(struct btrfs_root *root)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
struct extent_buffer *l;
struct btrfs_key search_key;
@@ -4918,14 +4867,13 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
search_key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
- goto error;
+ return ret;
if (ret == 0) {
/*
* Key with offset -1 found, there would have to exist a root
* with such id, but this is out of valid range.
*/
- ret = -EUCLEAN;
- goto error;
+ return -EUCLEAN;
}
if (path->slots[0] > 0) {
slot = path->slots[0] - 1;
@@ -4936,10 +4884,8 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
} else {
root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
}
- ret = 0;
-error:
- btrfs_free_path(path);
- return ret;
+
+ return 0;
}
int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)