summaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/block-group.h2
-rw-r--r--fs/btrfs/delayed-inode.c5
-rw-r--r--fs/btrfs/disk-io.c27
-rw-r--r--fs/btrfs/extent_io.c2
-rw-r--r--fs/btrfs/free-space-tree.c56
-rw-r--r--fs/btrfs/inode.c123
-rw-r--r--fs/btrfs/ioctl.c6
-rw-r--r--fs/btrfs/scrub.c53
-rw-r--r--fs/btrfs/tree-log.c150
-rw-r--r--fs/btrfs/volumes.c6
-rw-r--r--fs/btrfs/zoned.c86
11 files changed, 348 insertions, 168 deletions
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 9de356bcb411..aa176cc9a324 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -83,6 +83,8 @@ enum btrfs_block_group_flags {
BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
/* Does the block group need to be added to the free space tree? */
BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
+ /* Set after we add a new block group to the free space tree. */
+ BLOCK_GROUP_FLAG_FREE_SPACE_ADDED,
/* Indicate that the block group is placed on a sequential zone */
BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE,
/*
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index c7cc24a5dd5e..8c597fa60523 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1377,7 +1377,10 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info)
{
- WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root));
+ struct btrfs_delayed_node *node = btrfs_first_delayed_node(fs_info->delayed_root);
+
+ if (WARN_ON(node))
+ refcount_dec(&node->refs);
}
static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1beb9458f622..0d6ad7512f21 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1835,6 +1835,8 @@ void btrfs_put_root(struct btrfs_root *root)
if (refcount_dec_and_test(&root->refs)) {
if (WARN_ON(!xa_empty(&root->inodes)))
xa_destroy(&root->inodes);
+ if (WARN_ON(!xa_empty(&root->delayed_nodes)))
+ xa_destroy(&root->delayed_nodes);
WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
if (root->anon_dev)
free_anon_bdev(root->anon_dev);
@@ -2156,8 +2158,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
found = true;
root = read_tree_root_path(tree_root, path, &key);
if (IS_ERR(root)) {
- if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
- ret = PTR_ERR(root);
+ ret = PTR_ERR(root);
break;
}
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
@@ -4310,8 +4311,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
*
* So wait for all ongoing ordered extents to complete and then run
* delayed iputs. This works because once we reach this point no one
- * can either create new ordered extents nor create delayed iputs
- * through some other means.
+ * can create new ordered extents, but delayed iputs can still be added
+ * by a reclaim worker (see comments further below).
*
* Also note that btrfs_wait_ordered_roots() is not safe here, because
* it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
@@ -4322,15 +4323,29 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_flush_workqueue(fs_info->endio_write_workers);
/* Ordered extents for free space inodes. */
btrfs_flush_workqueue(fs_info->endio_freespace_worker);
+ /*
+ * Run delayed iputs in case an async reclaim worker is waiting for them
+ * to be run as mentioned above.
+ */
btrfs_run_delayed_iputs(fs_info);
- /* There should be no more workload to generate new delayed iputs. */
- set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state);
cancel_work_sync(&fs_info->async_reclaim_work);
cancel_work_sync(&fs_info->async_data_reclaim_work);
cancel_work_sync(&fs_info->preempt_reclaim_work);
cancel_work_sync(&fs_info->em_shrinker_work);
+ /*
+ * Run delayed iputs again because an async reclaim worker may have
+ * added new ones if it was flushing delalloc:
+ *
+ * shrink_delalloc() -> btrfs_start_delalloc_roots() ->
+ * start_delalloc_inodes() -> btrfs_add_delayed_iput()
+ */
+ btrfs_run_delayed_iputs(fs_info);
+
+ /* There should be no more workload to generate new delayed iputs. */
+ set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state);
+
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 849199768664..1dc931c4937f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4312,7 +4312,6 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
spin_unlock(&eb->refs_lock);
continue;
}
- xa_unlock_irq(&fs_info->buffer_tree);
/*
* If tree ref isn't set then we know the ref on this eb is a
@@ -4329,6 +4328,7 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
* check the folio private at the end. And
* release_extent_buffer() will release the refs_lock.
*/
+ xa_unlock_irq(&fs_info->buffer_tree);
release_extent_buffer(eb);
xa_lock_irq(&fs_info->buffer_tree);
}
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 0c573d46639a..a83c268f7f87 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -1115,11 +1115,21 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0);
if (ret < 0)
goto out_locked;
- ASSERT(ret == 0);
+ /*
+ * If ret is 1 (no key found), it means this is an empty block group,
+ * without any extents allocated from it and there's no block group
+ * item (key BTRFS_BLOCK_GROUP_ITEM_KEY) located in the extent tree
+ * because we are using the block group tree feature, so block group
+ * items are stored in the block group tree. It also means there are no
+ * extents allocated for block groups with a start offset beyond this
+ * block group's end offset (this is the last, highest, block group).
+ */
+ if (!btrfs_fs_compat_ro(trans->fs_info, BLOCK_GROUP_TREE))
+ ASSERT(ret == 0);
start = block_group->start;
end = block_group->start + block_group->length;
- while (1) {
+ while (ret == 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.type == BTRFS_EXTENT_ITEM_KEY ||
@@ -1149,8 +1159,6 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
ret = btrfs_next_item(extent_root, path);
if (ret < 0)
goto out_locked;
- if (ret)
- break;
}
if (start < end) {
ret = __add_to_free_space_tree(trans, block_group, path2,
@@ -1233,6 +1241,7 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans,
{
BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
+ struct rb_node *node;
int nr;
int ret;
@@ -1261,6 +1270,16 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
}
+ node = rb_first_cached(&trans->fs_info->block_group_cache_tree);
+ while (node) {
+ struct btrfs_block_group *bg;
+
+ bg = rb_entry(node, struct btrfs_block_group, cache_node);
+ clear_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &bg->runtime_flags);
+ node = rb_next(node);
+ cond_resched();
+ }
+
return 0;
}
@@ -1350,12 +1369,18 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
+
+ if (test_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED,
+ &block_group->runtime_flags))
+ goto next;
+
ret = populate_free_space_tree(trans, block_group);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
}
+next:
if (btrfs_should_end_transaction(trans)) {
btrfs_end_transaction(trans);
trans = btrfs_start_transaction(free_space_root, 1);
@@ -1382,6 +1407,29 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags);
+ /*
+ * While rebuilding the free space tree we may allocate new metadata
+ * block groups while modifying the free space tree.
+ *
+ * Because during the rebuild (at btrfs_rebuild_free_space_tree()) we
+ * can use multiple transactions, every time btrfs_end_transaction() is
+ * called at btrfs_rebuild_free_space_tree() we finish the creation of
+ * new block groups by calling btrfs_create_pending_block_groups(), and
+ * that in turn calls us, through add_block_group_free_space(), to add
+ * a free space info item and a free space extent item for the block
+ * group.
+ *
+ * Then later btrfs_rebuild_free_space_tree() may find such new block
+ * groups and processes them with populate_free_space_tree(), which can
+ * fail with EEXIST since there are already items for the block group in
+ * the free space tree. Notice that we say "may find" because a new
+ * block group may be added to the block groups rbtree in a node before
+ * or after the block group currently being processed by the rebuild
+ * process. So signal the rebuild process to skip such new block groups
+ * if it finds them.
+ */
+ set_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &block_group->runtime_flags);
+
ret = add_new_free_space_info(trans, block_group, path);
if (ret)
return ret;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c0c778243bf1..fc66872b4c74 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4250,9 +4250,9 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
if (ret) {
- btrfs_info(fs_info,
- "failed to delete reference to %.*s, inode %llu parent %llu",
- name->len, name->name, ino, dir_ino);
+ btrfs_crit(fs_info,
+ "failed to delete reference to %.*s, root %llu inode %llu parent %llu",
+ name->len, name->name, btrfs_root_id(root), ino, dir_ino);
btrfs_abort_transaction(trans, ret);
goto err;
}
@@ -4710,7 +4710,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
int ret = 0;
struct btrfs_trans_handle *trans;
- u64 last_unlink_trans;
struct fscrypt_name fname;
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
@@ -4736,6 +4735,23 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
goto out_notrans;
}
+ /*
+ * Propagate the last_unlink_trans value of the deleted dir to its
+ * parent directory. This is to prevent an unrecoverable log tree in the
+ * case we do something like this:
+ * 1) create dir foo
+ * 2) create snapshot under dir foo
+ * 3) delete the snapshot
+ * 4) rmdir foo
+ * 5) mkdir foo
+ * 6) fsync foo or some file inside foo
+ *
+ * This is because we can't unlink other roots when replaying the dir
+ * deletes for directory foo.
+ */
+ if (BTRFS_I(inode)->last_unlink_trans >= trans->transid)
+ btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
+
if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
goto out;
@@ -4745,27 +4761,11 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
if (ret)
goto out;
- last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
-
/* now the directory is empty */
ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
&fname.disk_name);
- if (!ret) {
+ if (!ret)
btrfs_i_size_write(BTRFS_I(inode), 0);
- /*
- * Propagate the last_unlink_trans value of the deleted dir to
- * its parent directory. This is to prevent an unrecoverable
- * log tree in the case we do something like this:
- * 1) create dir foo
- * 2) create snapshot under dir foo
- * 3) delete the snapshot
- * 4) rmdir foo
- * 5) mkdir foo
- * 6) fsync foo or some file inside foo
- */
- if (last_unlink_trans >= trans->transid)
- BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
- }
out:
btrfs_end_transaction(trans);
out_notrans:
@@ -8059,6 +8059,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
int ret;
int ret2;
bool need_abort = false;
+ bool logs_pinned = false;
struct fscrypt_name old_fname, new_fname;
struct fscrypt_str *old_name, *new_name;
@@ -8182,6 +8183,31 @@ static int btrfs_rename_exchange(struct inode *old_dir,
inode_inc_iversion(new_inode);
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID &&
+ new_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * If we are renaming in the same directory (and it's not for
+ * root entries) pin the log early to prevent any concurrent
+ * task from logging the directory after we removed the old
+ * entries and before we add the new entries, otherwise that
+ * task can sync a log without any entry for the inodes we are
+ * renaming and therefore replaying that log, if a power failure
+ * happens after syncing the log, would result in deleting the
+ * inodes.
+ *
+ * If the rename affects two different directories, we want to
+ * make sure the that there's no log commit that contains
+ * updates for only one of the directories but not for the
+ * other.
+ *
+ * If we are renaming an entry for a root, we don't care about
+ * log updates since we called btrfs_set_log_full_commit().
+ */
+ btrfs_pin_log_trans(root);
+ btrfs_pin_log_trans(dest);
+ logs_pinned = true;
+ }
+
if (old_dentry->d_parent != new_dentry->d_parent) {
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
BTRFS_I(old_inode), true);
@@ -8253,30 +8279,23 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode)->dir_index = new_idx;
/*
- * Now pin the logs of the roots. We do it to ensure that no other task
- * can sync the logs while we are in progress with the rename, because
- * that could result in an inconsistency in case any of the inodes that
- * are part of this rename operation were logged before.
+ * Do the log updates for all inodes.
+ *
+ * If either entry is for a root we don't need to update the logs since
+ * we've called btrfs_set_log_full_commit() before.
*/
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
- btrfs_pin_log_trans(root);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
- btrfs_pin_log_trans(dest);
-
- /* Do the log updates for all inodes. */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ if (logs_pinned) {
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
old_rename_ctx.index, new_dentry->d_parent);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
new_rename_ctx.index, old_dentry->d_parent);
+ }
- /* Now unpin the logs. */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+out_fail:
+ if (logs_pinned) {
btrfs_end_log_trans(root);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
btrfs_end_log_trans(dest);
-out_fail:
+ }
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
@@ -8326,6 +8345,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
struct fscrypt_name old_fname, new_fname;
+ bool logs_pinned = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -8460,6 +8480,29 @@ static int btrfs_rename(struct mnt_idmap *idmap,
inode_inc_iversion(old_inode);
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * If we are renaming in the same directory (and it's not a
+ * root entry) pin the log to prevent any concurrent task from
+ * logging the directory after we removed the old entry and
+ * before we add the new entry, otherwise that task can sync
+ * a log without any entry for the inode we are renaming and
+ * therefore replaying that log, if a power failure happens
+ * after syncing the log, would result in deleting the inode.
+ *
+ * If the rename affects two different directories, we want to
+ * make sure the that there's no log commit that contains
+ * updates for only one of the directories but not for the
+ * other.
+ *
+ * If we are renaming an entry for a root, we don't care about
+ * log updates since we called btrfs_set_log_full_commit().
+ */
+ btrfs_pin_log_trans(root);
+ btrfs_pin_log_trans(dest);
+ logs_pinned = true;
+ }
+
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
BTRFS_I(old_inode), true);
@@ -8524,7 +8567,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (old_inode->i_nlink == 1)
BTRFS_I(old_inode)->dir_index = index;
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ if (logs_pinned)
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
rename_ctx.index, new_dentry->d_parent);
@@ -8540,6 +8583,10 @@ static int btrfs_rename(struct mnt_idmap *idmap,
}
}
out_fail:
+ if (logs_pinned) {
+ btrfs_end_log_trans(root);
+ btrfs_end_log_trans(dest);
+ }
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 913acef3f0a9..8a60983a697c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -666,14 +666,14 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
goto out;
}
+ btrfs_record_new_subvolume(trans, BTRFS_I(dir));
+
ret = btrfs_create_new_inode(trans, &new_inode_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- btrfs_record_new_subvolume(trans, BTRFS_I(dir));
-
d_instantiate_new(dentry, new_inode_args.inode);
new_inode_args.inode = NULL;
@@ -3139,7 +3139,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
return -EPERM;
if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
- btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet");
+ btrfs_err(fs_info, "scrub: extent tree v2 not yet supported");
return -EINVAL;
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ce36fafc771e..7cd5e76a783c 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -557,7 +557,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
*/
for (i = 0; i < ipath->fspath->elem_cnt; ++i)
btrfs_warn_in_rcu(fs_info,
-"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
+"scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu length %u links %u (path: %s)",
swarn->errstr, swarn->logical,
btrfs_dev_name(swarn->dev),
swarn->physical,
@@ -571,7 +571,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
err:
btrfs_warn_in_rcu(fs_info,
- "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
+ "scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu: path resolving failed with ret=%d",
swarn->errstr, swarn->logical,
btrfs_dev_name(swarn->dev),
swarn->physical,
@@ -596,7 +596,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
/* Super block error, no need to search extent tree. */
if (is_super) {
- btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
+ btrfs_warn_in_rcu(fs_info, "scrub: %s on device %s, physical %llu",
errstr, btrfs_dev_name(dev), physical);
return;
}
@@ -631,14 +631,14 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
&ref_level);
if (ret < 0) {
btrfs_warn(fs_info,
- "failed to resolve tree backref for logical %llu: %d",
- swarn.logical, ret);
+ "scrub: failed to resolve tree backref for logical %llu: %d",
+ swarn.logical, ret);
break;
}
if (ret > 0)
break;
btrfs_warn_in_rcu(fs_info,
-"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
+"scrub: %s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
errstr, swarn.logical, btrfs_dev_name(dev),
swarn.physical, (ref_level ? "node" : "leaf"),
ref_level, ref_root);
@@ -718,7 +718,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad bytenr, has %llu want %llu",
+ "scrub: tree block %llu mirror %u has bad bytenr, has %llu want %llu",
logical, stripe->mirror_num,
btrfs_stack_header_bytenr(header), logical);
return;
@@ -728,7 +728,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad fsid, has %pU want %pU",
+ "scrub: tree block %llu mirror %u has bad fsid, has %pU want %pU",
logical, stripe->mirror_num,
header->fsid, fs_info->fs_devices->fsid);
return;
@@ -738,7 +738,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
+ "scrub: tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
logical, stripe->mirror_num,
header->chunk_tree_uuid, fs_info->chunk_tree_uuid);
return;
@@ -760,7 +760,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
+"scrub: tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
logical, stripe->mirror_num,
CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
@@ -771,7 +771,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
scrub_bitmap_set_meta_gen_error(stripe, sector_nr, sectors_per_tree);
scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad generation, has %llu want %llu",
+ "scrub: tree block %llu mirror %u has bad generation, has %llu want %llu",
logical, stripe->mirror_num,
btrfs_stack_header_generation(header),
stripe->sectors[sector_nr].generation);
@@ -814,7 +814,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
*/
if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) {
btrfs_warn_rl(fs_info,
- "tree block at %llu crosses stripe boundary %llu",
+ "scrub: tree block at %llu crosses stripe boundary %llu",
stripe->logical +
(sector_nr << fs_info->sectorsize_bits),
stripe->logical);
@@ -1046,12 +1046,12 @@ skip:
if (repaired) {
if (dev) {
btrfs_err_rl_in_rcu(fs_info,
- "fixed up error at logical %llu on dev %s physical %llu",
+ "scrub: fixed up error at logical %llu on dev %s physical %llu",
stripe->logical, btrfs_dev_name(dev),
physical);
} else {
btrfs_err_rl_in_rcu(fs_info,
- "fixed up error at logical %llu on mirror %u",
+ "scrub: fixed up error at logical %llu on mirror %u",
stripe->logical, stripe->mirror_num);
}
continue;
@@ -1060,12 +1060,12 @@ skip:
/* The remaining are all for unrepaired. */
if (dev) {
btrfs_err_rl_in_rcu(fs_info,
- "unable to fixup (regular) error at logical %llu on dev %s physical %llu",
+"scrub: unable to fixup (regular) error at logical %llu on dev %s physical %llu",
stripe->logical, btrfs_dev_name(dev),
physical);
} else {
btrfs_err_rl_in_rcu(fs_info,
- "unable to fixup (regular) error at logical %llu on mirror %u",
+ "scrub: unable to fixup (regular) error at logical %llu on mirror %u",
stripe->logical, stripe->mirror_num);
}
@@ -1593,8 +1593,7 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
physical,
sctx->write_pointer);
if (ret)
- btrfs_err(fs_info,
- "zoned: failed to recover write pointer");
+ btrfs_err(fs_info, "scrub: zoned: failed to recover write pointer");
}
mutex_unlock(&sctx->wr_lock);
btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
@@ -1658,7 +1657,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
int ret;
if (unlikely(!extent_root || !csum_root)) {
- btrfs_err(fs_info, "no valid extent or csum root for scrub");
+ btrfs_err(fs_info, "scrub: no valid extent or csum root found");
return -EUCLEAN;
}
memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
@@ -1907,7 +1906,7 @@ static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
btrfs_err(fs_info,
- "stripe %llu has unrepaired metadata sector at %llu",
+ "scrub: stripe %llu has unrepaired metadata sector at logical %llu",
stripe->logical,
stripe->logical + (i << fs_info->sectorsize_bits));
return true;
@@ -2167,7 +2166,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
bitmap_and(&error, &error, &has_extent, stripe->nr_sectors);
if (!bitmap_empty(&error, stripe->nr_sectors)) {
btrfs_err(fs_info,
-"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
+"scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
full_stripe_start, i, stripe->nr_sectors,
&error);
ret = -EIO;
@@ -2789,14 +2788,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
ro_set = 0;
} else if (ret == -ETXTBSY) {
btrfs_warn(fs_info,
- "skipping scrub of block group %llu due to active swapfile",
+ "scrub: skipping scrub of block group %llu due to active swapfile",
cache->start);
scrub_pause_off(fs_info);
ret = 0;
goto skip_unfreeze;
} else {
- btrfs_warn(fs_info,
- "failed setting block group ro: %d", ret);
+ btrfs_warn(fs_info, "scrub: failed setting block group ro: %d",
+ ret);
btrfs_unfreeze_block_group(cache);
btrfs_put_block_group(cache);
scrub_pause_off(fs_info);
@@ -2892,13 +2891,13 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
ret = btrfs_check_super_csum(fs_info, sb);
if (ret != 0) {
btrfs_err_rl(fs_info,
- "super block at physical %llu devid %llu has bad csum",
+ "scrub: super block at physical %llu devid %llu has bad csum",
physical, dev->devid);
return -EIO;
}
if (btrfs_super_generation(sb) != generation) {
btrfs_err_rl(fs_info,
-"super block at physical %llu devid %llu has bad generation %llu expect %llu",
+"scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu",
physical, dev->devid,
btrfs_super_generation(sb), generation);
return -EUCLEAN;
@@ -3059,7 +3058,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
btrfs_err_in_rcu(fs_info,
- "scrub on devid %llu: filesystem on %s is not writable",
+ "scrub: devid %llu: filesystem on %s is not writable",
devid, btrfs_dev_name(dev));
ret = -EROFS;
goto out;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 97e933113b82..cea8a7e9d6d3 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -143,6 +143,9 @@ static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *r
unsigned int nofs_flag;
struct btrfs_inode *inode;
+ /* Only meant to be called for subvolume roots and not for log roots. */
+ ASSERT(is_fstree(btrfs_root_id(root)));
+
/*
* We're holding a transaction handle whether we are logging or
* replaying a log tree, so we must make sure NOFS semantics apply
@@ -604,21 +607,6 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
return 0;
}
-/*
- * simple helper to read an inode off the disk from a given root
- * This can only be called for subvolume roots and not for the log
- */
-static noinline struct btrfs_inode *read_one_inode(struct btrfs_root *root,
- u64 objectid)
-{
- struct btrfs_inode *inode;
-
- inode = btrfs_iget_logging(objectid, root);
- if (IS_ERR(inode))
- return NULL;
- return inode;
-}
-
/* replays a single extent in 'eb' at 'slot' with 'key' into the
* subvolume 'root'. path is released on entry and should be released
* on exit.
@@ -668,15 +656,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
extent_end = ALIGN(start + size,
fs_info->sectorsize);
} else {
- ret = 0;
- goto out;
+ btrfs_err(fs_info,
+ "unexpected extent type=%d root=%llu inode=%llu offset=%llu",
+ found_type, btrfs_root_id(root), key->objectid, key->offset);
+ return -EUCLEAN;
}
- inode = read_one_inode(root, key->objectid);
- if (!inode) {
- ret = -EIO;
- goto out;
- }
+ inode = btrfs_iget_logging(key->objectid, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
/*
* first check to see if we already have this extent in the
@@ -948,9 +936,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
- inode = read_one_inode(root, location.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(location.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
goto out;
}
@@ -961,7 +950,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
out:
kfree(name.name);
- iput(&inode->vfs_inode);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
@@ -1072,7 +1062,9 @@ again:
search_key.type = BTRFS_INODE_REF_KEY;
search_key.offset = parent_objectid;
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
- if (ret == 0) {
+ if (ret < 0) {
+ return ret;
+ } else if (ret == 0) {
struct btrfs_inode_ref *victim_ref;
unsigned long ptr;
unsigned long ptr_end;
@@ -1145,13 +1137,13 @@ again:
struct fscrypt_str victim_name;
extref = (struct btrfs_inode_extref *)(base + cur_offset);
+ victim_name.len = btrfs_inode_extref_name_len(leaf, extref);
if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
goto next;
ret = read_alloc_one_name(leaf, &extref->name,
- btrfs_inode_extref_name_len(leaf, extref),
- &victim_name);
+ victim_name.len, &victim_name);
if (ret)
return ret;
@@ -1166,18 +1158,18 @@ again:
kfree(victim_name.name);
return ret;
} else if (!ret) {
- ret = -ENOENT;
- victim_parent = read_one_inode(root,
- parent_objectid);
- if (victim_parent) {
+ victim_parent = btrfs_iget_logging(parent_objectid, root);
+ if (IS_ERR(victim_parent)) {
+ ret = PTR_ERR(victim_parent);
+ } else {
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
ret = unlink_inode_for_log_replay(trans,
victim_parent,
inode, &victim_name);
+ iput(&victim_parent->vfs_inode);
}
- iput(&victim_parent->vfs_inode);
kfree(victim_name.name);
if (ret)
return ret;
@@ -1314,9 +1306,9 @@ again:
struct btrfs_inode *dir;
btrfs_release_path(path);
- dir = read_one_inode(root, parent_id);
- if (!dir) {
- ret = -ENOENT;
+ dir = btrfs_iget_logging(parent_id, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
kfree(name.name);
goto out;
}
@@ -1388,15 +1380,17 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* copy the back ref in. The link count fixup code will take
* care of the rest
*/
- dir = read_one_inode(root, parent_objectid);
- if (!dir) {
- ret = -ENOENT;
+ dir = btrfs_iget_logging(parent_objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ dir = NULL;
goto out;
}
- inode = read_one_inode(root, inode_objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(inode_objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
goto out;
}
@@ -1408,11 +1402,13 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* parent object can change from one array
* item to another.
*/
- if (!dir)
- dir = read_one_inode(root, parent_objectid);
if (!dir) {
- ret = -ENOENT;
- goto out;
+ dir = btrfs_iget_logging(parent_objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ dir = NULL;
+ goto out;
+ }
}
} else {
ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
@@ -1681,9 +1677,9 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
break;
btrfs_release_path(path);
- inode = read_one_inode(root, key.offset);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(key.offset, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
break;
}
@@ -1719,9 +1715,9 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode;
struct inode *vfs_inode;
- inode = read_one_inode(root, objectid);
- if (!inode)
- return -EIO;
+ inode = btrfs_iget_logging(objectid, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
vfs_inode = &inode->vfs_inode;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
@@ -1760,14 +1756,14 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir;
int ret;
- inode = read_one_inode(root, location->objectid);
- if (!inode)
- return -ENOENT;
+ inode = btrfs_iget_logging(location->objectid, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
- dir = read_one_inode(root, dirid);
- if (!dir) {
+ dir = btrfs_iget_logging(dirid, root);
+ if (IS_ERR(dir)) {
iput(&inode->vfs_inode);
- return -EIO;
+ return PTR_ERR(dir);
}
ret = btrfs_add_link(trans, dir, inode, name, 1, index);
@@ -1844,9 +1840,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
bool update_size = true;
bool name_added = false;
- dir = read_one_inode(root, key->objectid);
- if (!dir)
- return -EIO;
+ dir = btrfs_iget_logging(key->objectid, root);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
if (ret)
@@ -2146,9 +2142,10 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
btrfs_dir_item_key_to_cpu(eb, di, &location);
btrfs_release_path(path);
btrfs_release_path(log_path);
- inode = read_one_inode(root, location.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(location.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
goto out;
}
@@ -2300,14 +2297,17 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
if (!log_path)
return -ENOMEM;
- dir = read_one_inode(root, dirid);
- /* it isn't an error if the inode isn't there, that can happen
- * because we replay the deletes before we copy in the inode item
- * from the log
+ dir = btrfs_iget_logging(dirid, root);
+ /*
+ * It isn't an error if the inode isn't there, that can happen because
+ * we replay the deletes before we copy in the inode item from the log.
*/
- if (!dir) {
+ if (IS_ERR(dir)) {
btrfs_free_path(log_path);
- return 0;
+ ret = PTR_ERR(dir);
+ if (ret == -ENOENT)
+ ret = 0;
+ return ret;
}
range_start = 0;
@@ -2466,9 +2466,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
struct btrfs_inode *inode;
u64 from;
- inode = read_one_inode(root, key.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(key.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
break;
}
from = ALIGN(i_size_read(&inode->vfs_inode),
@@ -7447,6 +7447,8 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
* full log sync.
* Also we don't need to worry with renames, since btrfs_rename() marks the log
* for full commit when renaming a subvolume.
+ *
+ * Must be called before creating the subvolume entry in its parent directory.
*/
void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
struct btrfs_inode *dir)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 89835071cfea..f475b4b7c457 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3282,6 +3282,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
device->bytes_used - dev_extent_len);
atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
btrfs_clear_space_info_full(fs_info);
+
+ if (list_empty(&device->post_commit_list)) {
+ list_add_tail(&device->post_commit_list,
+ &trans->transaction->dev_update_list);
+ }
+
mutex_unlock(&fs_info->chunk_mutex);
}
}
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index b5b0156d5b95..9430b34d3cbb 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1403,7 +1403,8 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
@@ -1426,6 +1427,13 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
zone_info[1].physical);
return -EIO;
}
+
+ if (zone_info[0].alloc_offset == WP_CONVENTIONAL)
+ zone_info[0].alloc_offset = last_alloc;
+
+ if (zone_info[1].alloc_offset == WP_CONVENTIONAL)
+ zone_info[1].alloc_offset = last_alloc;
+
if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) {
btrfs_err(bg->fs_info,
"zoned: write pointer offset mismatch of zones in DUP profile");
@@ -1446,7 +1454,8 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
int i;
@@ -1461,10 +1470,12 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);
for (i = 0; i < map->num_stripes; i++) {
- if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
- zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ zone_info[i].alloc_offset = last_alloc;
+
if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
!btrfs_test_opt(fs_info, DEGRADED)) {
btrfs_err(fs_info,
@@ -1494,7 +1505,8 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
@@ -1505,10 +1517,29 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
}
for (int i = 0; i < map->num_stripes; i++) {
- if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
- zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
+ u64 stripe_nr, full_stripe_nr;
+ u64 stripe_offset;
+ int stripe_index;
+
+ stripe_nr = div64_u64(last_alloc, map->stripe_size);
+ stripe_offset = stripe_nr * map->stripe_size;
+ full_stripe_nr = div_u64(stripe_nr, map->num_stripes);
+ div_u64_rem(stripe_nr, map->num_stripes, &stripe_index);
+
+ zone_info[i].alloc_offset =
+ full_stripe_nr * map->stripe_size;
+
+ if (stripe_index > i)
+ zone_info[i].alloc_offset += map->stripe_size;
+ else if (stripe_index == i)
+ zone_info[i].alloc_offset +=
+ (last_alloc - stripe_offset);
+ }
+
if (test_bit(0, active) != test_bit(i, active)) {
if (!btrfs_zone_activate(bg))
return -EIO;
@@ -1526,7 +1557,8 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
@@ -1537,8 +1569,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
}
for (int i = 0; i < map->num_stripes; i++) {
- if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
- zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
if (test_bit(0, active) != test_bit(i, active)) {
@@ -1549,6 +1580,29 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
}
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
+ u64 stripe_nr, full_stripe_nr;
+ u64 stripe_offset;
+ int stripe_index;
+
+ stripe_nr = div64_u64(last_alloc, map->stripe_size);
+ stripe_offset = stripe_nr * map->stripe_size;
+ full_stripe_nr = div_u64(stripe_nr,
+ map->num_stripes / map->sub_stripes);
+ div_u64_rem(stripe_nr,
+ (map->num_stripes / map->sub_stripes),
+ &stripe_index);
+
+ zone_info[i].alloc_offset =
+ full_stripe_nr * map->stripe_size;
+
+ if (stripe_index > (i / map->sub_stripes))
+ zone_info[i].alloc_offset += map->stripe_size;
+ else if (stripe_index == (i / map->sub_stripes))
+ zone_info[i].alloc_offset +=
+ (last_alloc - stripe_offset);
+ }
+
if ((i % map->sub_stripes) == 0) {
bg->zone_capacity += zone_info[i].capacity;
bg->alloc_offset += zone_info[i].alloc_offset;
@@ -1637,18 +1691,22 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
ret = btrfs_load_block_group_single(cache, &zone_info[0], active);
break;
case BTRFS_BLOCK_GROUP_DUP:
- ret = btrfs_load_block_group_dup(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_dup(cache, map, zone_info, active,
+ last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID1:
case BTRFS_BLOCK_GROUP_RAID1C3:
case BTRFS_BLOCK_GROUP_RAID1C4:
- ret = btrfs_load_block_group_raid1(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_raid1(cache, map, zone_info,
+ active, last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID0:
- ret = btrfs_load_block_group_raid0(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_raid0(cache, map, zone_info,
+ active, last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID10:
- ret = btrfs_load_block_group_raid10(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_raid10(cache, map, zone_info,
+ active, last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID5:
case BTRFS_BLOCK_GROUP_RAID6: