diff options
| author | Qu Wenruo <wqu@suse.com> | 2025-09-09 12:38:47 +0930 |
|---|---|---|
| committer | David Sterba <dsterba@suse.com> | 2025-09-23 08:49:25 +0200 |
| commit | 98077f7f2180fa996710452564ebe71adc66af59 (patch) | |
| tree | 675217b15ece31bb721ce4013ef49a5d27cf3d63 | |
| parent | e9bed72e883e7c6e6a2057ea29fcd4ba69225f91 (diff) | |
btrfs: enable experimental bs > ps support
With all the preparation patches, we're able to finally enable btrfs
block size (sector size) larger than page size support and give it a
full fstests run.
And obviously this new feature is hidden behind experimental flags, and
should not be considered as a core feature yet as btrfs' default block
size is still 4K.
But this is still a feature that will shine in the future where 16K
block sized devices are widely adopted.
For now there are some features explicitly disabled:
- Direct IO
This is the most complex part to support; the root reason is that we
can not control the pages of the iov_iter passed in.
User space programs can only ensure the virtual addresses are
contiguous, but have no control on their physical addresses.
Our bs > ps support heavily relies on large folios, and direct IO
memory can easily break it.
So direct IO is disabled and will always fall back to buffered IO.
- RAID56
In theory we can convert RAID56 to use large folios, but it will need
to be converted back to page based if we want to support direct IO in
the future.
So just reject it for now.
- Encoded send
- Encoded read
Both are utilizing btrfs_encoded_read_regular_fill_pages(), and send
is utilizing vmallocated memory.
Unfortunately for vmallocated memory we can not guarantee the minimal
folio order.
For send, it will just always fall back to regular writes, which read
from page cache and will follow the existing folio order requirement.
- Encoded write
Encoded write allocates its pages by itself, and we can easily change
it to follow the minimal order.
But since encoded read is already disabled, there is no need to enable
encoded write alone.
Finally just like what we did for bs < ps support in the past, add a
warning message for bs > ps mounts.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
| -rw-r--r-- | fs/btrfs/direct-io.c | 12 | ||||
| -rw-r--r-- | fs/btrfs/disk-io.c | 14 | ||||
| -rw-r--r-- | fs/btrfs/fs.c | 3 | ||||
| -rw-r--r-- | fs/btrfs/ioctl.c | 35 | ||||
| -rw-r--r-- | fs/btrfs/send.c | 9 |
5 files changed, 58 insertions, 15 deletions
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index fe9a4bd7e6e6..802d4dbe5b38 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -786,6 +786,18 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, if (iov_iter_alignment(iter) & blocksize_mask) return -EINVAL; + /* + * For bs > ps support, we heavily rely on large folios to make sure no + * block will cross large folio boundaries. + * + * But memory provided by direct IO is only virtually contiguous, not + * physically contiguous, and will break the btrfs' large folio requirement. + * + * So for bs > ps support, all direct IOs should fallback to buffered ones. + */ + if (fs_info->sectorsize > PAGE_SIZE) + return -EINVAL; + return 0; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5c57f523f449..0aee3239518d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3242,18 +3242,24 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount) } /* - * Subpage runtime limitation on v1 cache. + * Subpage/bs > ps runtime limitation on v1 cache. * * V1 space cache still has some hard coded PAGE_SIZE usage, while * we're already defaulting to v2 cache, no need to bother v1 as it's * going to be deprecated anyway. */ - if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { + if (fs_info->sectorsize != PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { btrfs_warn(fs_info, "v1 space cache is not supported for page size %lu with sectorsize %u", PAGE_SIZE, fs_info->sectorsize); return -EINVAL; } + if (fs_info->sectorsize > PAGE_SIZE && btrfs_fs_incompat(fs_info, RAID56)) { + btrfs_err(fs_info, + "RAID56 is not supported for page size %lu with sectorsize %u", + PAGE_SIZE, fs_info->sectorsize); + return -EINVAL; + } /* This can be called by remount, we need to protect the super block. 
*/ spin_lock(&fs_info->super_lock); @@ -3388,6 +3394,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->stripesize = stripesize; fs_info->fs_devices->fs_info = fs_info; + if (fs_info->sectorsize > PAGE_SIZE) + btrfs_warn(fs_info, + "support for block size %u with page size %zu is experimental, some features may be missing", + fs_info->sectorsize, PAGE_SIZE); /* * Handle the space caching options appropriately now that we have the * super block loaded and validated. diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index 29ad1c859194..feb0a2faa837 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -97,8 +97,7 @@ bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize) */ if (IS_ENABLED(CONFIG_HIGHMEM) && blocksize > PAGE_SIZE) return false; - if (blocksize <= PAGE_SIZE) - return true; + return true; #endif return false; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 063291519b36..0e9e2b999392 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4418,6 +4418,10 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, goto out_acct; } + if (fs_info->sectorsize > PAGE_SIZE) { + ret = -ENOTTY; + goto out_acct; + } if (compat) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) struct btrfs_ioctl_encoded_io_args_32 args32; @@ -4509,6 +4513,7 @@ out_acct: static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat) { + struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode); struct btrfs_ioctl_encoded_io_args args; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -4522,6 +4527,11 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool goto out_acct; } + if (fs_info->sectorsize > PAGE_SIZE) { + ret = -ENOTTY; + goto out_acct; + } + if (!(file->f_mode & FMODE_WRITE)) { ret = -EBADF; goto out_acct; @@ -4780,14 +4790,14 @@ out_fail: static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int 
issue_flags) { + struct file *file = cmd->file; + struct btrfs_inode *inode = BTRFS_I(file->f_inode); + struct extent_io_tree *io_tree = &inode->io_tree; + struct btrfs_fs_info *fs_info = inode->root->fs_info; size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags); size_t copy_end; int ret; u64 disk_bytenr, disk_io_size; - struct file *file; - struct btrfs_inode *inode; - struct btrfs_fs_info *fs_info; - struct extent_io_tree *io_tree; loff_t pos; struct kiocb kiocb; struct extent_state *cached_state = NULL; @@ -4803,10 +4813,11 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue ret = -EPERM; goto out_acct; } - file = cmd->file; - inode = BTRFS_I(file->f_inode); - fs_info = inode->root->fs_info; - io_tree = &inode->io_tree; + if (fs_info->sectorsize > PAGE_SIZE) { + ret = -ENOTTY; + goto out_acct; + } + sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); if (issue_flags & IO_URING_F_COMPAT) { @@ -4933,9 +4944,10 @@ out_acct: static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags) { + struct file *file = cmd->file; + struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode); loff_t pos; struct kiocb kiocb; - struct file *file; ssize_t ret; void __user *sqe_addr; struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); @@ -4948,8 +4960,11 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu ret = -EPERM; goto out_acct; } + if (fs_info->sectorsize > PAGE_SIZE) { + ret = -ENOTTY; + goto out_acct; + } - file = cmd->file; sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr)); if (!(file->f_mode & FMODE_WRITE)) { diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 32653fc44a75..5e073502b9e8 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5654,7 +5654,14 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - if 
((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && + /* + * Do not go through encoded read for bs > ps cases. + * + * Encoded send is using vmallocated pages as buffer, which we can + * not ensure every folio is large enough to contain a block. + */ + if (sctx->send_root->fs_info->sectorsize <= PAGE_SIZE && + (sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { bool is_inline = (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE); |
