author     Carlos Maiolino <cem@kernel.org>    2025-03-10 10:35:39 +0100
committer  Carlos Maiolino <cem@kernel.org>    2025-03-10 10:35:39 +0100
commit     32f6987f938433bcd93e6c04aecedfe079460ceb (patch)
tree       9310a6a0d70671e5c881f3364ea4d85e8ec647e3
parent     5d138b6fb4da63b46981bca744f8f262d2524281 (diff)
parent     358cab79dd025fa434681f8c3b0961eeb3446ffe (diff)
Merge branch 'xfs-6.15-merge' into for-next
XFS code for 6.15 to be merged into linux-next
Signed-off-by: Carlos Maiolino <cem@kernel.org>
92 files changed, 6397 insertions, 1393 deletions
diff --git a/Documentation/filesystems/iomap/design.rst b/Documentation/filesystems/iomap/design.rst index b0d0188a095e..e29651a42eec 100644 --- a/Documentation/filesystems/iomap/design.rst +++ b/Documentation/filesystems/iomap/design.rst @@ -246,6 +246,10 @@ The fields are as follows: * **IOMAP_F_PRIVATE**: Starting with this value, the upper bits can be set by the filesystem for its own purposes. + * **IOMAP_F_ANON_WRITE**: Indicates that (write) I/O does not have a target + block assigned to it yet and the file system will do that in the bio + submission handler, splitting the I/O as needed. + These flags can be set by iomap itself during file operations. The filesystem should supply an ``->iomap_end`` function if it needs to observe these flags: @@ -352,6 +356,11 @@ operations: ``IOMAP_NOWAIT`` is often set on behalf of ``IOCB_NOWAIT`` or ``RWF_NOWAIT``. + * ``IOMAP_DONTCACHE`` is set when the caller wishes to perform a + buffered file I/O and would like the kernel to drop the pagecache + after the I/O completes, if it isn't already being used by another + thread. + If it is necessary to read existing file contents from a `different <https://lore.kernel.org/all/20191008071527.29304-9-hch@lst.de/>`_ device or address range on a device, the filesystem should return that diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index 2c7f5df9d8b0..b08a79d11d9f 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -131,6 +131,8 @@ These ``struct kiocb`` flags are significant for buffered I/O with iomap: * ``IOCB_NOWAIT``: Turns on ``IOMAP_NOWAIT``. + * ``IOCB_DONTCACHE``: Turns on ``IOMAP_DONTCACHE``. + Internal per-Folio State ------------------------ @@ -283,7 +285,7 @@ The ``ops`` structure must be specified and is as follows: struct iomap_writeback_ops { int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode, loff_t offset, unsigned len); - int (*prepare_ioend)(struct iomap_ioend *ioend, int status); + int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status); void (*discard_folio)(struct folio *folio, loff_t pos); }; @@ -306,13 +308,12 @@ The fields are as follows: purpose. This function must be supplied by the filesystem. - - ``prepare_ioend``: Enables filesystems to transform the writeback - ioend or perform any other preparatory work before the writeback I/O - is submitted. + - ``submit_ioend``: Allows the file systems to hook into writeback bio + submission. This might include pre-write space accounting updates, or installing a custom ``->bi_end_io`` function for internal purposes, such as deferring the ioend completion to a workqueue to run metadata update - transactions from process context. + transactions from process context before submitting the bio. This function is optional. - ``discard_folio``: iomap calls this function after ``->map_blocks`` @@ -341,7 +342,7 @@ This can happen in interrupt or process context, depending on the storage device. Filesystems that need to update internal bookkeeping (e.g. unwritten -extent conversions) should provide a ``->prepare_ioend`` function to +extent conversions) should provide a ``->submit_ioend`` function to set ``struct iomap_end::bio::bi_end_io`` to its own function. This function should call ``iomap_finish_ioends`` after finishing its own work (e.g. unwritten extent conversion). 
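As a reading aid for the ``->submit_ioend`` documentation change above: the following is a minimal, hypothetical sketch (not part of this commit) of how a filesystem might implement the new hook, submitting the bio itself and deferring ioend completion to a workqueue so metadata updates run in process context. All ``example_*`` names are assumptions; only the iomap/bio calls come from this series.

    #include <linux/bio.h>
    #include <linux/iomap.h>
    #include <linux/slab.h>
    #include <linux/workqueue.h>

    /* Hypothetical sketch; "example_*" names and the workqueue are assumed. */
    struct example_ioend_work {
            struct work_struct      work;
            struct iomap_ioend      *ioend;
    };

    static struct workqueue_struct *example_end_io_wq;  /* assumed to exist */

    static void example_end_io_work(struct work_struct *work)
    {
            struct example_ioend_work *ew =
                    container_of(work, struct example_ioend_work, work);
            struct iomap_ioend *ioend = ew->ioend;

            /*
             * Run filesystem completion work (e.g. unwritten extent
             * conversion) here in process context, then let iomap finish
             * the folios and release the bio.
             */
            iomap_finish_ioends(ioend,
                            blk_status_to_errno(ioend->io_bio.bi_status));
            kfree(ew);
    }

    static void example_writeback_end_bio(struct bio *bio)
    {
            struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
            struct example_ioend_work *ew = ioend->io_private;

            /* Defer completion: iomap_finish_ioends() may sleep. */
            queue_work(example_end_io_wq, &ew->work);
    }

    static int example_submit_ioend(struct iomap_writepage_ctx *wpc, int status)
    {
            struct example_ioend_work *ew;

            if (status)
                    return status;  /* iomap core errors out the bio */

            ew = kmalloc(sizeof(*ew), GFP_NOFS);
            if (!ew)
                    return -ENOMEM;
            INIT_WORK(&ew->work, example_end_io_work);
            ew->ioend = wpc->ioend;
            wpc->ioend->io_private = ew;

            /* With ->submit_ioend set, the filesystem owns bio submission. */
            wpc->ioend->io_bio.bi_end_io = example_writeback_end_bio;
            submit_bio(&wpc->ioend->io_bio);
            return 0;
    }

The remaining hunks of the commit follow.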
@@ -513,8 +514,8 @@ IOMAP_WRITE`` with any combination of the following enhancements: if the mapping is unwritten and the filesystem cannot handle zeroing the unaligned regions without exposing stale contents. - * ``IOMAP_ATOMIC``: This write is being issued with torn-write - protection. + * ``IOMAP_ATOMIC_HW``: This write is being issued with torn-write + protection based on HW-offload support. Only a single bio can be created for the write, and the write must not be split into multiple I/O requests, i.e. flag REQ_ATOMIC must be set. @@ -525,8 +526,20 @@ IOMAP_WRITE`` with any combination of the following enhancements: conversion or copy on write), all updates for the entire file range must be committed atomically as well. Only one space mapping is allowed per untorn write. - Untorn writes must be aligned to, and must not be longer than, a - single file block. + Untorn writes may be longer than a single file block. In all cases, + the mapping start disk block must have at least the same alignment as + the write offset. + + * ``IOMAP_ATOMIC_SW``: This write is being issued with torn-write + protection via a software mechanism provided by the filesystem. + All the disk block alignment and single bio restrictions which apply + to IOMAP_ATOMIC_HW do not apply here. + SW-based untorn writes would typically be used as a fallback when + HW-based untorn writes may not be issued, e.g. the range of the write + covers multiple extents, meaning that it is not possible to issue + a single bio. + All filesystem metadata updates for the entire file range must be + committed atomically as well. Callers commonly hold ``i_rwsem`` in shared or exclusive mode before calling this function. @@ -1258,7 +1258,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, } #endif /* CONFIG_FS_DAX_PMD */ -static s64 dax_unshare_iter(struct iomap_iter *iter) +static int dax_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); @@ -1266,11 +1266,11 @@ static s64 dax_unshare_iter(struct iomap_iter *iter) u64 copy_len = iomap_length(iter); u32 mod; int id = 0; - s64 ret = 0; + s64 ret; void *daddr = NULL, *saddr = NULL; if (!iomap_want_unshare_iter(iter)) - return iomap_length(iter); + return iomap_iter_advance_full(iter); /* * Extend the file range to be aligned to fsblock/pagesize, because @@ -1300,14 +1300,14 @@ static s64 dax_unshare_iter(struct iomap_iter *iter) if (ret < 0) goto out_unlock; - if (copy_mc_to_kernel(daddr, saddr, copy_len) == 0) - ret = iomap_length(iter); - else + if (copy_mc_to_kernel(daddr, saddr, copy_len) != 0) ret = -EIO; out_unlock: dax_read_unlock(id); - return dax_mem2blk_err(ret); + if (ret < 0) + return dax_mem2blk_err(ret); + return iomap_iter_advance_full(iter); } int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, @@ -1326,7 +1326,7 @@ int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, iter.len = min(len, size - pos); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = dax_unshare_iter(&iter); + iter.status = dax_unshare_iter(&iter); return ret; } EXPORT_SYMBOL_GPL(dax_file_unshare); @@ -1354,17 +1354,16 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) return ret; } -static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) +static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero) { const struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); - loff_t pos = iter->pos; 
u64 length = iomap_length(iter); - s64 written = 0; + int ret; /* already zeroed? we're done. */ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) - return length; + return iomap_iter_advance(iter, &length); /* * invalidate the pages whose sharing state is to be changed @@ -1372,33 +1371,35 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) */ if (iomap->flags & IOMAP_F_SHARED) invalidate_inode_pages2_range(iter->inode->i_mapping, - pos >> PAGE_SHIFT, - (pos + length - 1) >> PAGE_SHIFT); + iter->pos >> PAGE_SHIFT, + (iter->pos + length - 1) >> PAGE_SHIFT); do { + loff_t pos = iter->pos; unsigned offset = offset_in_page(pos); - unsigned size = min_t(u64, PAGE_SIZE - offset, length); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); - long rc; int id; + length = min_t(u64, PAGE_SIZE - offset, length); + id = dax_read_lock(); - if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE) - rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1); + if (IS_ALIGNED(pos, PAGE_SIZE) && length == PAGE_SIZE) + ret = dax_zero_page_range(iomap->dax_dev, pgoff, 1); else - rc = dax_memzero(iter, pos, size); + ret = dax_memzero(iter, pos, length); dax_read_unlock(id); - if (rc < 0) - return rc; - pos += size; - length -= size; - written += size; + if (ret < 0) + return ret; + + ret = iomap_iter_advance(iter, &length); + if (ret) + return ret; } while (length > 0); if (did_zero) *did_zero = true; - return written; + return ret; } int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, @@ -1413,7 +1414,7 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, int ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = dax_zero_iter(&iter, did_zero); + iter.status = dax_zero_iter(&iter, did_zero); return ret; } EXPORT_SYMBOL_GPL(dax_zero_range); @@ -1431,8 +1432,7 @@ int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, } EXPORT_SYMBOL_GPL(dax_truncate_page); -static loff_t dax_iomap_iter(const struct iomap_iter *iomi, - struct iov_iter *iter) +static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter) { const struct iomap *iomap = &iomi->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iomi); @@ -1451,8 +1451,10 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, if (pos >= end) return 0; - if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) - return iov_iter_zero(min(length, end - pos), iter); + if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) { + done = iov_iter_zero(min(length, end - pos), iter); + return iomap_iter_advance(iomi, &done); + } } /* @@ -1485,7 +1487,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, } id = dax_read_lock(); - while (pos < end) { + while ((pos = iomi->pos) < end) { unsigned offset = pos & (PAGE_SIZE - 1); const size_t size = ALIGN(length + offset, PAGE_SIZE); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); @@ -1535,18 +1537,16 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr, map_len, iter); - pos += xfer; - length -= xfer; - done += xfer; - - if (xfer == 0) + length = xfer; + ret = iomap_iter_advance(iomi, &length); + if (!ret && xfer == 0) ret = -EFAULT; if (xfer < map_len) break; } dax_read_unlock(id); - return done ? 
done : ret; + return ret; } /** @@ -1586,7 +1586,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, iomi.flags |= IOMAP_NOWAIT; while ((ret = iomap_iter(&iomi, ops)) > 0) - iomi.processed = dax_iomap_iter(&iomi, iter); + iomi.status = dax_iomap_iter(&iomi, iter); done = iomi.pos - iocb->ki_pos; iocb->ki_pos = iomi.pos; @@ -1757,7 +1757,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, while ((error = iomap_iter(&iter, ops)) > 0) { if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) { - iter.processed = -EIO; /* fs corruption? */ + iter.status = -EIO; /* fs corruption? */ continue; } @@ -1769,8 +1769,10 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, ret |= VM_FAULT_MAJOR; } - if (!(ret & VM_FAULT_ERROR)) - iter.processed = PAGE_SIZE; + if (!(ret & VM_FAULT_ERROR)) { + u64 length = PAGE_SIZE; + iter.status = iomap_iter_advance(&iter, &length); + } } if (iomap_errp) @@ -1883,8 +1885,10 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, continue; /* actually breaks out of the loop */ ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true); - if (ret != VM_FAULT_FALLBACK) - iter.processed = PMD_SIZE; + if (ret != VM_FAULT_FALLBACK) { + u64 length = PMD_SIZE; + iter.status = iomap_iter_advance(&iter, &length); + } } unlock_entry: @@ -1999,12 +2003,13 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order, } EXPORT_SYMBOL_GPL(dax_finish_sync_fault); -static loff_t dax_range_compare_iter(struct iomap_iter *it_src, +static int dax_range_compare_iter(struct iomap_iter *it_src, struct iomap_iter *it_dest, u64 len, bool *same) { const struct iomap *smap = &it_src->iomap; const struct iomap *dmap = &it_dest->iomap; loff_t pos1 = it_src->pos, pos2 = it_dest->pos; + u64 dest_len; void *saddr, *daddr; int id, ret; @@ -2012,7 +2017,7 @@ static loff_t dax_range_compare_iter(struct iomap_iter *it_src, if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) { *same = true; - return len; + goto advance; } if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) { @@ -2035,7 +2040,13 @@ static loff_t dax_range_compare_iter(struct iomap_iter *it_src, if (!*same) len = 0; dax_read_unlock(id); - return len; + +advance: + dest_len = len; + ret = iomap_iter_advance(it_src, &len); + if (!ret) + ret = iomap_iter_advance(it_dest, &dest_len); + return ret; out_unlock: dax_read_unlock(id); @@ -2058,15 +2069,15 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, .len = len, .flags = IOMAP_DAX, }; - int ret, compared = 0; + int ret, status; while ((ret = iomap_iter(&src_iter, ops)) > 0 && (ret = iomap_iter(&dst_iter, ops)) > 0) { - compared = dax_range_compare_iter(&src_iter, &dst_iter, + status = dax_range_compare_iter(&src_iter, &dst_iter, min(src_iter.len, dst_iter.len), same); - if (compared < 0) + if (status < 0) return ret; - src_iter.processed = dst_iter.processed = compared; + src_iter.status = dst_iter.status = status; } return ret; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7c54ae5fcbd4..ba2f1e3db7c7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3467,7 +3467,7 @@ static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written) return false; /* atomic writes are all-or-nothing */ - if (flags & IOMAP_ATOMIC) + if (flags & IOMAP_ATOMIC_HW) return false; /* can only try again if we wrote nothing */ diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 1795c4e8dbf6..366516b98b3f 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1300,7 +1300,8 @@ 
static int gfs2_block_zero_range(struct inode *inode, loff_t from, unsigned int length) { BUG_ON(current->journal_info); - return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops); + return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops, + NULL); } #define GFS2_JTRUNC_REVOKES 8192 diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile index 381d76c5c232..69e8ebb41302 100644 --- a/fs/iomap/Makefile +++ b/fs/iomap/Makefile @@ -12,6 +12,7 @@ iomap-y += trace.o \ iter.o iomap-$(CONFIG_BLOCK) += buffered-io.o \ direct-io.o \ + ioend.o \ fiemap.o \ seek.o iomap-$(CONFIG_SWAP) += swapfile.o diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d303e6c8900c..d52cfdc299c4 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -12,17 +12,15 @@ #include <linux/buffer_head.h> #include <linux/dax.h> #include <linux/writeback.h> -#include <linux/list_sort.h> #include <linux/swap.h> #include <linux/bio.h> #include <linux/sched/signal.h> #include <linux/migrate.h> +#include "internal.h" #include "trace.h" #include "../internal.h" -#define IOEND_BATCH_SIZE 4096 - /* * Structure allocated for each folio to track per-block uptodate, dirty state * and I/O completions. @@ -40,8 +38,6 @@ struct iomap_folio_state { unsigned long state[]; }; -static struct bio_set iomap_ioend_bioset; - static inline bool ifs_is_fully_uptodate(struct folio *folio, struct iomap_folio_state *ifs) { @@ -366,15 +362,14 @@ static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter, pos >= i_size_read(iter->inode); } -static loff_t iomap_readpage_iter(const struct iomap_iter *iter, - struct iomap_readpage_ctx *ctx, loff_t offset) +static int iomap_readpage_iter(struct iomap_iter *iter, + struct iomap_readpage_ctx *ctx) { const struct iomap *iomap = &iter->iomap; - loff_t pos = iter->pos + offset; - loff_t length = iomap_length(iter) - offset; + loff_t pos = iter->pos; + loff_t length = iomap_length(iter); struct folio *folio = ctx->cur_folio; struct iomap_folio_state *ifs; - loff_t orig_pos = pos; size_t poff, plen; sector_t sector; @@ -438,25 +433,22 @@ done: * we can skip trailing ones as they will be handled in the next * iteration. 
*/ - return pos - orig_pos + plen; + length = pos - iter->pos + plen; + return iomap_iter_advance(iter, &length); } -static loff_t iomap_read_folio_iter(const struct iomap_iter *iter, +static int iomap_read_folio_iter(struct iomap_iter *iter, struct iomap_readpage_ctx *ctx) { - struct folio *folio = ctx->cur_folio; - size_t offset = offset_in_folio(folio, iter->pos); - loff_t length = min_t(loff_t, folio_size(folio) - offset, - iomap_length(iter)); - loff_t done, ret; - - for (done = 0; done < length; done += ret) { - ret = iomap_readpage_iter(iter, ctx, done); - if (ret <= 0) + int ret; + + while (iomap_length(iter)) { + ret = iomap_readpage_iter(iter, ctx); + if (ret) return ret; } - return done; + return 0; } int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) @@ -474,7 +466,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) trace_iomap_readpage(iter.inode, 1); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_read_folio_iter(&iter, &ctx); + iter.status = iomap_read_folio_iter(&iter, &ctx); if (ctx.bio) { submit_bio(ctx.bio); @@ -493,15 +485,14 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) } EXPORT_SYMBOL_GPL(iomap_read_folio); -static loff_t iomap_readahead_iter(const struct iomap_iter *iter, +static int iomap_readahead_iter(struct iomap_iter *iter, struct iomap_readpage_ctx *ctx) { - loff_t length = iomap_length(iter); - loff_t done, ret; + int ret; - for (done = 0; done < length; done += ret) { + while (iomap_length(iter)) { if (ctx->cur_folio && - offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) { + offset_in_folio(ctx->cur_folio, iter->pos) == 0) { if (!ctx->cur_folio_in_bio) folio_unlock(ctx->cur_folio); ctx->cur_folio = NULL; @@ -510,12 +501,12 @@ static loff_t iomap_readahead_iter(const struct iomap_iter *iter, ctx->cur_folio = readahead_folio(ctx->rac); ctx->cur_folio_in_bio = false; } - ret = iomap_readpage_iter(iter, ctx, done); - if (ret <= 0) + ret = iomap_readpage_iter(iter, ctx); + if (ret) return ret; } - return done; + return 0; } /** @@ -547,7 +538,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) trace_iomap_readahead(rac->mapping->host, readahead_count(rac)); while (iomap_iter(&iter, ops) > 0) - iter.processed = iomap_readahead_iter(&iter, &ctx); + iter.status = iomap_readahead_iter(&iter, &ctx); if (ctx.bio) submit_bio(ctx.bio); @@ -603,6 +594,8 @@ struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len) if (iter->flags & IOMAP_NOWAIT) fgp |= FGP_NOWAIT; + if (iter->flags & IOMAP_DONTCACHE) + fgp |= FGP_DONTCACHE; fgp |= fgf_set_order(len); return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, @@ -907,12 +900,10 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, return __iomap_write_end(iter->inode, pos, len, copied, folio); } -static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) +static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) { - loff_t length = iomap_length(iter); - loff_t pos = iter->pos; ssize_t total_written = 0; - long status = 0; + int status = 0; struct address_space *mapping = iter->inode->i_mapping; size_t chunk = mapping_max_folio_size(mapping); unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? 
BDP_ASYNC : 0; @@ -923,7 +914,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) size_t offset; /* Offset into folio */ size_t bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ - size_t written; /* Bytes have been written */ + u64 written; /* Bytes have been written */ + loff_t pos = iter->pos; bytes = iov_iter_count(i); retry: @@ -934,8 +926,8 @@ retry: if (unlikely(status)) break; - if (bytes > length) - bytes = length; + if (bytes > iomap_length(iter)) + bytes = iomap_length(iter); /* * Bring in the user page that we'll copy from _first_. @@ -1006,17 +998,12 @@ retry: goto retry; } } else { - pos += written; total_written += written; - length -= written; + iomap_iter_advance(iter, &written); } - } while (iov_iter_count(i) && length); + } while (iov_iter_count(i) && iomap_length(iter)); - if (status == -EAGAIN) { - iov_iter_revert(i, total_written); - return -EAGAIN; - } - return total_written ? total_written : status; + return total_written ? 0 : status; } ssize_t @@ -1034,9 +1021,11 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, if (iocb->ki_flags & IOCB_NOWAIT) iter.flags |= IOMAP_NOWAIT; + if (iocb->ki_flags & IOCB_DONTCACHE) + iter.flags |= IOMAP_DONTCACHE; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_write_iter(&iter, i); + iter.status = iomap_write_iter(&iter, i); if (unlikely(iter.pos == iocb->ki_pos)) return ret; @@ -1270,23 +1259,22 @@ void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, } EXPORT_SYMBOL_GPL(iomap_write_delalloc_release); -static loff_t iomap_unshare_iter(struct iomap_iter *iter) +static int iomap_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; - loff_t pos = iter->pos; - loff_t length = iomap_length(iter); - loff_t written = 0; + u64 bytes = iomap_length(iter); + int status; if (!iomap_want_unshare_iter(iter)) - return length; + return iomap_iter_advance(iter, &bytes); do { struct folio *folio; - int status; size_t offset; - size_t bytes = min_t(u64, SIZE_MAX, length); + loff_t pos = iter->pos; bool ret; + bytes = min_t(u64, SIZE_MAX, bytes); status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) return status; @@ -1304,14 +1292,14 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) cond_resched(); - pos += bytes; - written += bytes; - length -= bytes; - balance_dirty_pages_ratelimited(iter->inode->i_mapping); - } while (length > 0); - return written; + status = iomap_iter_advance(iter, &bytes); + if (status) + break; + } while (bytes > 0); + + return status; } int @@ -1331,7 +1319,7 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, iter.len = min(len, size - pos); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_unshare_iter(&iter); + iter.status = iomap_unshare_iter(&iter); return ret; } EXPORT_SYMBOL_GPL(iomap_file_unshare); @@ -1350,19 +1338,18 @@ static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i) return filemap_write_and_wait_range(mapping, i->pos, end); } -static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) +static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) { - loff_t pos = iter->pos; - loff_t length = iomap_length(iter); - loff_t written = 0; + u64 bytes = iomap_length(iter); + int status; do { struct folio *folio; - int status; size_t offset; - size_t bytes = min_t(u64, SIZE_MAX, length); + loff_t pos = iter->pos; bool ret; + bytes = min_t(u64, SIZE_MAX, bytes); status = 
iomap_write_begin(iter, pos, bytes, &folio); if (status) return status; @@ -1383,25 +1370,26 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) if (WARN_ON_ONCE(!ret)) return -EIO; - pos += bytes; - length -= bytes; - written += bytes; - } while (length > 0); + status = iomap_iter_advance(iter, &bytes); + if (status) + break; + } while (bytes > 0); if (did_zero) *did_zero = true; - return written; + return status; } int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, - const struct iomap_ops *ops) + const struct iomap_ops *ops, void *private) { struct iomap_iter iter = { .inode = inode, .pos = pos, .len = len, .flags = IOMAP_ZERO, + .private = private, }; struct address_space *mapping = inode->i_mapping; unsigned int blocksize = i_blocksize(inode); @@ -1424,7 +1412,7 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) { iter.len = plen; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_zero_iter(&iter, did_zero); + iter.status = iomap_zero_iter(&iter, did_zero); iter.len = len - (iter.pos - pos); if (ret || !iter.len) @@ -1443,17 +1431,19 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) { - loff_t proc = iomap_length(&iter); + s64 status; if (range_dirty) { range_dirty = false; - proc = iomap_zero_iter_flush_and_stale(&iter); + status = iomap_zero_iter_flush_and_stale(&iter); + } else { + status = iomap_iter_advance_full(&iter); } - iter.processed = proc; + iter.status = status; continue; } - iter.processed = iomap_zero_iter(&iter, did_zero); + iter.status = iomap_zero_iter(&iter, did_zero); } return ret; } @@ -1461,7 +1451,7 @@ EXPORT_SYMBOL_GPL(iomap_zero_range); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - const struct iomap_ops *ops) + const struct iomap_ops *ops, void *private) { unsigned int blocksize = i_blocksize(inode); unsigned int off = pos & (blocksize - 1); @@ -1469,11 +1459,12 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, /* Block boundary? 
Nothing to do */ if (!off) return 0; - return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops); + return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops, + private); } EXPORT_SYMBOL_GPL(iomap_truncate_page); -static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, +static int iomap_folio_mkwrite_iter(struct iomap_iter *iter, struct folio *folio) { loff_t length = iomap_length(iter); @@ -1490,14 +1481,16 @@ static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, folio_mark_dirty(folio); } - return length; + return iomap_iter_advance(iter, &length); } -vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, + void *private) { struct iomap_iter iter = { .inode = file_inode(vmf->vma->vm_file), .flags = IOMAP_WRITE | IOMAP_FAULT, + .private = private, }; struct folio *folio = page_folio(vmf->page); ssize_t ret; @@ -1509,7 +1502,7 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) iter.pos = folio_pos(folio); iter.len = ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_folio_mkwrite_iter(&iter, folio); + iter.status = iomap_folio_mkwrite_iter(&iter, folio); if (ret < 0) goto out_unlock; @@ -1538,16 +1531,15 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, * state, release holds on bios, and finally free up memory. Do not use the * ioend after this. */ -static u32 -iomap_finish_ioend(struct iomap_ioend *ioend, int error) +u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend) { struct inode *inode = ioend->io_inode; struct bio *bio = &ioend->io_bio; struct folio_iter fi; u32 folio_count = 0; - if (error) { - mapping_set_error(inode->i_mapping, error); + if (ioend->io_error) { + mapping_set_error(inode->i_mapping, ioend->io_error); if (!bio_flagged(bio, BIO_QUIET)) { pr_err_ratelimited( "%s: writeback error on inode %lu, offset %lld, sector %llu", @@ -1566,116 +1558,16 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) return folio_count; } -/* - * Ioend completion routine for merged bios. This can only be called from task - * contexts as merged ioends can be of unbound length. Hence we have to break up - * the writeback completions into manageable chunks to avoid long scheduler - * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get - * good batch processing throughput without creating adverse scheduler latency - * conditions. - */ -void -iomap_finish_ioends(struct iomap_ioend *ioend, int error) -{ - struct list_head tmp; - u32 completions; - - might_sleep(); - - list_replace_init(&ioend->io_list, &tmp); - completions = iomap_finish_ioend(ioend, error); - - while (!list_empty(&tmp)) { - if (completions > IOEND_BATCH_SIZE * 8) { - cond_resched(); - completions = 0; - } - ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); - list_del_init(&ioend->io_list); - completions += iomap_finish_ioend(ioend, error); - } -} -EXPORT_SYMBOL_GPL(iomap_finish_ioends); - -/* - * We can merge two adjacent ioends if they have the same set of work to do. 
- */ -static bool -iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) -{ - if (ioend->io_bio.bi_status != next->io_bio.bi_status) - return false; - if (next->io_flags & IOMAP_F_BOUNDARY) - return false; - if ((ioend->io_flags & IOMAP_F_SHARED) ^ - (next->io_flags & IOMAP_F_SHARED)) - return false; - if ((ioend->io_type == IOMAP_UNWRITTEN) ^ - (next->io_type == IOMAP_UNWRITTEN)) - return false; - if (ioend->io_offset + ioend->io_size != next->io_offset) - return false; - /* - * Do not merge physically discontiguous ioends. The filesystem - * completion functions will have to iterate the physical - * discontiguities even if we merge the ioends at a logical level, so - * we don't gain anything by merging physical discontiguities here. - * - * We cannot use bio->bi_iter.bi_sector here as it is modified during - * submission so does not point to the start sector of the bio at - * completion. - */ - if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector) - return false; - return true; -} - -void -iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends) -{ - struct iomap_ioend *next; - - INIT_LIST_HEAD(&ioend->io_list); - - while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend, - io_list))) { - if (!iomap_ioend_can_merge(ioend, next)) - break; - list_move_tail(&next->io_list, &ioend->io_list); - ioend->io_size += next->io_size; - } -} -EXPORT_SYMBOL_GPL(iomap_ioend_try_merge); - -static int -iomap_ioend_compare(void *priv, const struct list_head *a, - const struct list_head *b) -{ - struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list); - struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list); - - if (ia->io_offset < ib->io_offset) - return -1; - if (ia->io_offset > ib->io_offset) - return 1; - return 0; -} - -void -iomap_sort_ioends(struct list_head *ioend_list) -{ - list_sort(NULL, ioend_list, iomap_ioend_compare); -} -EXPORT_SYMBOL_GPL(iomap_sort_ioends); - static void iomap_writepage_end_bio(struct bio *bio) { - iomap_finish_ioend(iomap_ioend_from_bio(bio), - blk_status_to_errno(bio->bi_status)); + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); + + ioend->io_error = blk_status_to_errno(bio->bi_status); + iomap_finish_ioend_buffered(ioend); } /* - * Submit the final bio for an ioend. + * Submit an ioend. * * If @error is non-zero, it means that we have a situation where some part of * the submission process has failed after we've marked pages for writeback. @@ -1694,14 +1586,18 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) * failure happened so that the file system end I/O handler gets called * to clean up. 
*/ - if (wpc->ops->prepare_ioend) - error = wpc->ops->prepare_ioend(wpc->ioend, error); + if (wpc->ops->submit_ioend) { + error = wpc->ops->submit_ioend(wpc, error); + } else { + if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE)) + error = -EIO; + if (!error) + submit_bio(&wpc->ioend->io_bio); + } if (error) { wpc->ioend->io_bio.bi_status = errno_to_blk_status(error); bio_endio(&wpc->ioend->io_bio); - } else { - submit_bio(&wpc->ioend->io_bio); } wpc->ioend = NULL; @@ -1709,9 +1605,9 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) } static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, - struct writeback_control *wbc, struct inode *inode, loff_t pos) + struct writeback_control *wbc, struct inode *inode, loff_t pos, + u16 ioend_flags) { - struct iomap_ioend *ioend; struct bio *bio; bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS, @@ -1719,36 +1615,24 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, GFP_NOFS, &iomap_ioend_bioset); bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos); bio->bi_end_io = iomap_writepage_end_bio; - wbc_init_bio(wbc, bio); bio->bi_write_hint = inode->i_write_hint; - - ioend = iomap_ioend_from_bio(bio); - INIT_LIST_HEAD(&ioend->io_list); - ioend->io_type = wpc->iomap.type; - ioend->io_flags = wpc->iomap.flags; - if (pos > wpc->iomap.offset) - wpc->iomap.flags &= ~IOMAP_F_BOUNDARY; - ioend->io_inode = inode; - ioend->io_size = 0; - ioend->io_offset = pos; - ioend->io_sector = bio->bi_iter.bi_sector; - + wbc_init_bio(wbc, bio); wpc->nr_folios = 0; - return ioend; + return iomap_init_ioend(inode, bio, pos, ioend_flags); } -static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) +static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos, + u16 ioend_flags) { - if (wpc->iomap.offset == pos && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) + if (ioend_flags & IOMAP_IOEND_BOUNDARY) return false; - if ((wpc->iomap.flags & IOMAP_F_SHARED) != - (wpc->ioend->io_flags & IOMAP_F_SHARED)) - return false; - if (wpc->iomap.type != wpc->ioend->io_type) + if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) != + (wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) return false; if (pos != wpc->ioend->io_offset + wpc->ioend->io_size) return false; - if (iomap_sector(&wpc->iomap, pos) != + if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) && + iomap_sector(&wpc->iomap, pos) != bio_end_sector(&wpc->ioend->io_bio)) return false; /* @@ -1779,14 +1663,23 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, { struct iomap_folio_state *ifs = folio->private; size_t poff = offset_in_folio(folio, pos); + unsigned int ioend_flags = 0; int error; - if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) { + if (wpc->iomap.type == IOMAP_UNWRITTEN) + ioend_flags |= IOMAP_IOEND_UNWRITTEN; + if (wpc->iomap.flags & IOMAP_F_SHARED) + ioend_flags |= IOMAP_IOEND_SHARED; + if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) + ioend_flags |= IOMAP_IOEND_BOUNDARY; + + if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) { new_ioend: error = iomap_submit_ioend(wpc, 0); if (error) return error; - wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos); + wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos, + ioend_flags); } if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff)) @@ -2062,11 +1955,3 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, return iomap_submit_ioend(wpc, error); } 
EXPORT_SYMBOL_GPL(iomap_writepages); - -static int __init iomap_buffered_init(void) -{ - return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), - offsetof(struct iomap_ioend, io_bio), - BIOSET_NEED_BVECS); -} -fs_initcall(iomap_buffered_init); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b521eb15759e..5299f70428ef 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. - * Copyright (c) 2016-2021 Christoph Hellwig. + * Copyright (c) 2016-2025 Christoph Hellwig. */ #include <linux/module.h> #include <linux/compiler.h> @@ -12,6 +12,7 @@ #include <linux/backing-dev.h> #include <linux/uio.h> #include <linux/task_io_accounting_ops.h> +#include "internal.h" #include "trace.h" #include "../internal.h" @@ -20,6 +21,7 @@ * Private flags for iomap_dio, must not overlap with the public ones in * iomap.h: */ +#define IOMAP_DIO_NO_INVALIDATE (1U << 25) #define IOMAP_DIO_CALLER_COMP (1U << 26) #define IOMAP_DIO_INLINE_COMP (1U << 27) #define IOMAP_DIO_WRITE_THROUGH (1U << 28) @@ -81,10 +83,12 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter, WRITE_ONCE(iocb->private, bio); } - if (dio->dops && dio->dops->submit_io) + if (dio->dops && dio->dops->submit_io) { dio->dops->submit_io(iter, bio, pos); - else + } else { + WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_ANON_WRITE); submit_bio(bio); + } } ssize_t iomap_dio_complete(struct iomap_dio *dio) @@ -117,7 +121,8 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) * ->end_io() when necessary, otherwise a racing buffer read would cache * zeros from unwritten extents. */ - if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE)) + if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) && + !(dio->flags & IOMAP_DIO_NO_INVALIDATE)) kiocb_invalidate_post_direct_write(iocb, dio->size); inode_dio_end(file_inode(iocb->ki_filp)); @@ -163,43 +168,31 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret) cmpxchg(&dio->error, 0, ret); } -void iomap_dio_bio_end_io(struct bio *bio) +/* + * Called when dio->ref reaches zero from an I/O completion. + */ +static void iomap_dio_done(struct iomap_dio *dio) { - struct iomap_dio *dio = bio->bi_private; - bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); struct kiocb *iocb = dio->iocb; - if (bio->bi_status) - iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); - if (!atomic_dec_and_test(&dio->ref)) - goto release_bio; - - /* - * Synchronous dio, task itself will handle any completion work - * that needs after IO. All we need to do is wake the task. - */ if (dio->wait_for_completion) { + /* + * Synchronous I/O, task itself will handle any completion work + * that needs after IO. All we need to do is wake the task. + */ struct task_struct *waiter = dio->submit.waiter; WRITE_ONCE(dio->submit.waiter, NULL); blk_wake_io_task(waiter); - goto release_bio; - } - - /* - * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline - */ - if (dio->flags & IOMAP_DIO_INLINE_COMP) { + } else if (dio->flags & IOMAP_DIO_INLINE_COMP) { WRITE_ONCE(iocb->private, NULL); iomap_dio_complete_work(&dio->aio.work); - goto release_bio; - } - - /* - * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule - * our completion that way to avoid an async punt to a workqueue. 
- */ - if (dio->flags & IOMAP_DIO_CALLER_COMP) { + } else if (dio->flags & IOMAP_DIO_CALLER_COMP) { + /* + * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then + * schedule our completion that way to avoid an async punt to a + * workqueue. + */ /* only polled IO cares about private cleared */ iocb->private = dio; iocb->dio_complete = iomap_dio_deferred_complete; @@ -217,19 +210,31 @@ void iomap_dio_bio_end_io(struct bio *bio) * issuer. */ iocb->ki_complete(iocb, 0); - goto release_bio; + } else { + struct inode *inode = file_inode(iocb->ki_filp); + + /* + * Async DIO completion that requires filesystem level + * completion work gets punted to a work queue to complete as + * the operation may require more IO to be issued to finalise + * filesystem metadata changes or guarantee data integrity. + */ + INIT_WORK(&dio->aio.work, iomap_dio_complete_work); + queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); } +} + +void iomap_dio_bio_end_io(struct bio *bio) +{ + struct iomap_dio *dio = bio->bi_private; + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + + if (bio->bi_status) + iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); + + if (atomic_dec_and_test(&dio->ref)) + iomap_dio_done(dio); - /* - * Async DIO completion that requires filesystem level completion work - * gets punted to a work queue to complete as the operation may require - * more IO to be issued to finalise filesystem metadata changes or - * guarantee data integrity. - */ - INIT_WORK(&dio->aio.work, iomap_dio_complete_work); - queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq, - &dio->aio.work); -release_bio: if (should_dirty) { bio_check_pages_dirty(bio); } else { @@ -239,6 +244,47 @@ release_bio: } EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io); +u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend) +{ + struct iomap_dio *dio = ioend->io_bio.bi_private; + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + u32 vec_count = ioend->io_bio.bi_vcnt; + + if (ioend->io_error) + iomap_dio_set_error(dio, ioend->io_error); + + if (atomic_dec_and_test(&dio->ref)) { + /* + * Try to avoid another context switch for the completion given + * that we are already called from the ioend completion + * workqueue, but never invalidate pages from this thread to + * avoid deadlocks with buffered I/O completions. Tough luck if + * you hit the tiny race with someone dirtying the range now + * between this check and the actual completion. + */ + if (!dio->iocb->ki_filp->f_mapping->nrpages) { + dio->flags |= IOMAP_DIO_INLINE_COMP; + dio->flags |= IOMAP_DIO_NO_INVALIDATE; + } + dio->flags &= ~IOMAP_DIO_CALLER_COMP; + iomap_dio_done(dio); + } + + if (should_dirty) { + bio_check_pages_dirty(&ioend->io_bio); + } else { + bio_release_pages(&ioend->io_bio, false); + bio_put(&ioend->io_bio); + } + + /* + * Return the number of bvecs completed as even direct I/O completions + * do significant per-folio work and we'll still want to give up the + * CPU after a lot of completions. + */ + return vec_count; +} + static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, loff_t pos, unsigned len) { @@ -271,7 +317,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, * clearing the WRITE_THROUGH flag in the dio request. 
*/ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, - const struct iomap *iomap, bool use_fua, bool atomic) + const struct iomap *iomap, bool use_fua, bool atomic_hw) { blk_opf_t opflags = REQ_SYNC | REQ_IDLE; @@ -283,30 +329,29 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, opflags |= REQ_FUA; else dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; - if (atomic) + if (atomic_hw) opflags |= REQ_ATOMIC; return opflags; } -static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, - struct iomap_dio *dio) +static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) { const struct iomap *iomap = &iter->iomap; struct inode *inode = iter->inode; unsigned int fs_block_size = i_blocksize(inode), pad; + bool atomic_hw = iter->flags & IOMAP_ATOMIC_HW; const loff_t length = iomap_length(iter); - bool atomic = iter->flags & IOMAP_ATOMIC; loff_t pos = iter->pos; blk_opf_t bio_opf; struct bio *bio; bool need_zeroout = false; bool use_fua = false; int nr_pages, ret = 0; - size_t copied = 0; + u64 copied = 0; size_t orig_count; - if (atomic && length != fs_block_size) + if (atomic_hw && length != iter->len) return -EINVAL; if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || @@ -383,7 +428,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, goto out; } - bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic); + bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic_hw); nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); do { @@ -416,7 +461,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, } n = bio->bi_iter.bi_size; - if (WARN_ON_ONCE(atomic && n != length)) { + if (WARN_ON_ONCE(atomic_hw && n != length)) { /* * This bio should have covered the complete length, * which it doesn't, so error. 
We may need to zero out @@ -467,30 +512,28 @@ out: /* Undo iter limitation to current extent */ iov_iter_reexpand(dio->submit.iter, orig_count - copied); if (copied) - return copied; + return iomap_iter_advance(iter, &copied); return ret; } -static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter, - struct iomap_dio *dio) +static int iomap_dio_hole_iter(struct iomap_iter *iter, struct iomap_dio *dio) { loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter); dio->size += length; if (!length) return -EFAULT; - return length; + return iomap_iter_advance(iter, &length); } -static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi, - struct iomap_dio *dio) +static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio) { const struct iomap *iomap = &iomi->iomap; struct iov_iter *iter = dio->submit.iter; void *inline_data = iomap_inline_data(iomap, iomi->pos); loff_t length = iomap_length(iomi); loff_t pos = iomi->pos; - size_t copied; + u64 copied; if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap))) return -EIO; @@ -512,11 +555,10 @@ static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi, dio->size += copied; if (!copied) return -EFAULT; - return copied; + return iomap_iter_advance(iomi, &copied); } -static loff_t iomap_dio_iter(const struct iomap_iter *iter, - struct iomap_dio *dio) +static int iomap_dio_iter(struct iomap_iter *iter, struct iomap_dio *dio) { switch (iter->iomap.type) { case IOMAP_HOLE: @@ -610,9 +652,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; - if (iocb->ki_flags & IOCB_ATOMIC) - iomi.flags |= IOMAP_ATOMIC; - if (iov_iter_rw(iter) == READ) { /* reads can always complete inline */ dio->flags |= IOMAP_DIO_INLINE_COMP; @@ -647,6 +686,11 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, iomi.flags |= IOMAP_OVERWRITE_ONLY; } + if (dio_flags & IOMAP_DIO_ATOMIC_SW) + iomi.flags |= IOMAP_ATOMIC_SW; + else if (iocb->ki_flags & IOCB_ATOMIC) + iomi.flags |= IOMAP_ATOMIC_HW; + /* for data sync or sync, we need sync completion processing */ if (iocb_is_dsync(iocb)) { dio->flags |= IOMAP_DIO_NEED_SYNC; @@ -700,7 +744,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, blk_start_plug(&plug); while ((ret = iomap_iter(&iomi, ops)) > 0) { - iomi.processed = iomap_dio_iter(&iomi, dio); + iomi.status = iomap_dio_iter(&iomi, dio); /* * We can only poll for single bio I/Os. 
diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index 610ca6f1ec9b..80675c42e94e 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -39,24 +39,23 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, iomap->length, flags); } -static loff_t iomap_fiemap_iter(const struct iomap_iter *iter, +static int iomap_fiemap_iter(struct iomap_iter *iter, struct fiemap_extent_info *fi, struct iomap *prev) { int ret; if (iter->iomap.type == IOMAP_HOLE) - return iomap_length(iter); + goto advance; ret = iomap_to_fiemap(fi, prev, 0); *prev = iter->iomap; - switch (ret) { - case 0: /* success */ - return iomap_length(iter); - case 1: /* extent array full */ - return 0; - default: /* error */ + if (ret < 0) return ret; - } + if (ret == 1) /* extent array full */ + return 0; + +advance: + return iomap_iter_advance_full(iter); } int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, @@ -78,7 +77,7 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, return ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_fiemap_iter(&iter, fi, &prev); + iter.status = iomap_fiemap_iter(&iter, fi, &prev); if (prev.type != IOMAP_HOLE) { ret = iomap_to_fiemap(fi, &prev, FIEMAP_EXTENT_LAST); @@ -114,7 +113,7 @@ iomap_bmap(struct address_space *mapping, sector_t bno, while ((ret = iomap_iter(&iter, ops)) > 0) { if (iter.iomap.type == IOMAP_MAPPED) bno = iomap_sector(&iter.iomap, iter.pos) >> blkshift; - /* leave iter.processed unset to abort loop */ + /* leave iter.status unset to abort loop */ } if (ret) return 0; diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h new file mode 100644 index 000000000000..f6992a3bf66a --- /dev/null +++ b/fs/iomap/internal.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _IOMAP_INTERNAL_H +#define _IOMAP_INTERNAL_H 1 + +#define IOEND_BATCH_SIZE 4096 + +u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend); +u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend); + +#endif /* _IOMAP_INTERNAL_H */ diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c new file mode 100644 index 000000000000..18894ebba6db --- /dev/null +++ b/fs/iomap/ioend.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2024-2025 Christoph Hellwig. + */ +#include <linux/iomap.h> +#include <linux/list_sort.h> +#include "internal.h" + +struct bio_set iomap_ioend_bioset; +EXPORT_SYMBOL_GPL(iomap_ioend_bioset); + +struct iomap_ioend *iomap_init_ioend(struct inode *inode, + struct bio *bio, loff_t file_offset, u16 ioend_flags) +{ + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); + + atomic_set(&ioend->io_remaining, 1); + ioend->io_error = 0; + ioend->io_parent = NULL; + INIT_LIST_HEAD(&ioend->io_list); + ioend->io_flags = ioend_flags; + ioend->io_inode = inode; + ioend->io_offset = file_offset; + ioend->io_size = bio->bi_iter.bi_size; + ioend->io_sector = bio->bi_iter.bi_sector; + ioend->io_private = NULL; + return ioend; +} +EXPORT_SYMBOL_GPL(iomap_init_ioend); + +static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) +{ + if (ioend->io_parent) { + struct bio *bio = &ioend->io_bio; + + ioend = ioend->io_parent; + bio_put(bio); + } + + if (error) + cmpxchg(&ioend->io_error, 0, error); + + if (!atomic_dec_and_test(&ioend->io_remaining)) + return 0; + if (ioend->io_flags & IOMAP_IOEND_DIRECT) + return iomap_finish_ioend_direct(ioend); + return iomap_finish_ioend_buffered(ioend); +} + +/* + * Ioend completion routine for merged bios. 
This can only be called from task + * contexts as merged ioends can be of unbound length. Hence we have to break up + * the writeback completions into manageable chunks to avoid long scheduler + * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get + * good batch processing throughput without creating adverse scheduler latency + * conditions. + */ +void iomap_finish_ioends(struct iomap_ioend *ioend, int error) +{ + struct list_head tmp; + u32 completions; + + might_sleep(); + + list_replace_init(&ioend->io_list, &tmp); + completions = iomap_finish_ioend(ioend, error); + + while (!list_empty(&tmp)) { + if (completions > IOEND_BATCH_SIZE * 8) { + cond_resched(); + completions = 0; + } + ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); + list_del_init(&ioend->io_list); + completions += iomap_finish_ioend(ioend, error); + } +} +EXPORT_SYMBOL_GPL(iomap_finish_ioends); + +/* + * We can merge two adjacent ioends if they have the same set of work to do. + */ +static bool iomap_ioend_can_merge(struct iomap_ioend *ioend, + struct iomap_ioend *next) +{ + if (ioend->io_bio.bi_status != next->io_bio.bi_status) + return false; + if (next->io_flags & IOMAP_IOEND_BOUNDARY) + return false; + if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) != + (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) + return false; + if (ioend->io_offset + ioend->io_size != next->io_offset) + return false; + /* + * Do not merge physically discontiguous ioends. The filesystem + * completion functions will have to iterate the physical + * discontiguities even if we merge the ioends at a logical level, so + * we don't gain anything by merging physical discontiguities here. + * + * We cannot use bio->bi_iter.bi_sector here as it is modified during + * submission so does not point to the start sector of the bio at + * completion. + */ + if (ioend->io_sector + (ioend->io_size >> SECTOR_SHIFT) != + next->io_sector) + return false; + return true; +} + +void iomap_ioend_try_merge(struct iomap_ioend *ioend, + struct list_head *more_ioends) +{ + struct iomap_ioend *next; + + INIT_LIST_HEAD(&ioend->io_list); + + while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend, + io_list))) { + if (!iomap_ioend_can_merge(ioend, next)) + break; + list_move_tail(&next->io_list, &ioend->io_list); + ioend->io_size += next->io_size; + } +} +EXPORT_SYMBOL_GPL(iomap_ioend_try_merge); + +static int iomap_ioend_compare(void *priv, const struct list_head *a, + const struct list_head *b) +{ + struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list); + struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list); + + if (ia->io_offset < ib->io_offset) + return -1; + if (ia->io_offset > ib->io_offset) + return 1; + return 0; +} + +void iomap_sort_ioends(struct list_head *ioend_list) +{ + list_sort(NULL, ioend_list, iomap_ioend_compare); +} +EXPORT_SYMBOL_GPL(iomap_sort_ioends); + +/* + * Split up to the first @max_len bytes from @ioend if the ioend covers more + * than @max_len bytes. + * + * If @is_append is set, the split will be based on the hardware limits for + * REQ_OP_ZONE_APPEND commands and can be less than @max_len if the hardware + * limits don't allow the entire @max_len length. + * + * The bio embedded into @ioend must be a REQ_OP_WRITE because the block layer + * does not allow splitting REQ_OP_ZONE_APPEND bios. The file systems has to + * switch the operation after this call, but before submitting the bio. 
+ */ +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, + unsigned int max_len, bool is_append) +{ + struct bio *bio = &ioend->io_bio; + struct iomap_ioend *split_ioend; + unsigned int nr_segs; + int sector_offset; + struct bio *split; + + if (is_append) { + struct queue_limits *lim = bdev_limits(bio->bi_bdev); + + max_len = min(max_len, + lim->max_zone_append_sectors << SECTOR_SHIFT); + + sector_offset = bio_split_rw_at(bio, lim, &nr_segs, max_len); + if (unlikely(sector_offset < 0)) + return ERR_PTR(sector_offset); + if (!sector_offset) + return NULL; + } else { + if (bio->bi_iter.bi_size <= max_len) + return NULL; + sector_offset = max_len >> SECTOR_SHIFT; + } + + /* ensure the split ioend is still block size aligned */ + sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT, + i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT; + + split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset); + if (IS_ERR(split)) + return ERR_CAST(split); + split->bi_private = bio->bi_private; + split->bi_end_io = bio->bi_end_io; + + split_ioend = iomap_init_ioend(ioend->io_inode, split, ioend->io_offset, + ioend->io_flags); + split_ioend->io_parent = ioend; + + atomic_inc(&ioend->io_remaining); + ioend->io_offset += split_ioend->io_size; + ioend->io_size -= split_ioend->io_size; + + split_ioend->io_sector = ioend->io_sector; + if (!is_append) + ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT); + return split_ioend; +} +EXPORT_SYMBOL_GPL(iomap_split_ioend); + +static int __init iomap_ioend_init(void) +{ + return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), + offsetof(struct iomap_ioend, io_bio), + BIOSET_NEED_BVECS); +} +fs_initcall(iomap_ioend_init); diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index 3790918646af..6ffc6a7b9ba5 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -7,40 +7,25 @@ #include <linux/iomap.h> #include "trace.h" -/* - * Advance to the next range we need to map. - * - * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully - * processed - it was aborted because the extent the iomap spanned may have been - * changed during the operation. In this case, the iteration behaviour is to - * remap the unprocessed range of the iter, and that means we may need to remap - * even when we've made no progress (i.e. iter->processed = 0). Hence the - * "finished iterating" case needs to distinguish between - * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we - * need to remap the entire remaining range. - */ -static inline int iomap_iter_advance(struct iomap_iter *iter) +static inline void iomap_iter_reset_iomap(struct iomap_iter *iter) { - bool stale = iter->iomap.flags & IOMAP_F_STALE; - int ret = 1; - - /* handle the previous iteration (if any) */ - if (iter->iomap.length) { - if (iter->processed < 0) - return iter->processed; - if (WARN_ON_ONCE(iter->processed > iomap_length(iter))) - return -EIO; - iter->pos += iter->processed; - iter->len -= iter->processed; - if (!iter->len || (!iter->processed && !stale)) - ret = 0; - } - - /* clear the per iteration state */ - iter->processed = 0; + iter->status = 0; memset(&iter->iomap, 0, sizeof(iter->iomap)); memset(&iter->srcmap, 0, sizeof(iter->srcmap)); - return ret; +} + +/* + * Advance the current iterator position and output the length remaining for the + * current mapping. 
+ */ +int iomap_iter_advance(struct iomap_iter *iter, u64 *count) +{ + if (WARN_ON_ONCE(*count > iomap_length(iter))) + return -EIO; + iter->pos += *count; + iter->len -= *count; + *count = iomap_length(iter); + return 0; } static inline void iomap_iter_done(struct iomap_iter *iter) @@ -50,6 +35,8 @@ static inline void iomap_iter_done(struct iomap_iter *iter) WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos); WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE); + iter->iter_start_pos = iter->pos; + trace_iomap_iter_dstmap(iter->inode, &iter->iomap); if (iter->srcmap.type != IOMAP_HOLE) trace_iomap_iter_srcmap(iter->inode, &iter->srcmap); @@ -67,26 +54,58 @@ static inline void iomap_iter_done(struct iomap_iter *iter) * function must be called in a loop that continues as long it returns a * positive value. If 0 or a negative value is returned, the caller must not * return to the loop body. Within a loop body, there are two ways to break out - * of the loop body: leave @iter.processed unchanged, or set it to a negative + * of the loop body: leave @iter.status unchanged, or set it to a negative * errno. */ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) { + bool stale = iter->iomap.flags & IOMAP_F_STALE; + ssize_t advanced; + u64 olen; int ret; - if (iter->iomap.length && ops->iomap_end) { - ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter), - iter->processed > 0 ? iter->processed : 0, - iter->flags, &iter->iomap); - if (ret < 0 && !iter->processed) + trace_iomap_iter(iter, ops, _RET_IP_); + + if (!iter->iomap.length) + goto begin; + + /* + * Calculate how far the iter was advanced and the original length bytes + * for ->iomap_end(). + */ + advanced = iter->pos - iter->iter_start_pos; + olen = iter->len + advanced; + + if (ops->iomap_end) { + ret = ops->iomap_end(iter->inode, iter->iter_start_pos, + iomap_length_trim(iter, iter->iter_start_pos, + olen), + advanced, iter->flags, &iter->iomap); + if (ret < 0 && !advanced) return ret; } - trace_iomap_iter(iter, ops, _RET_IP_); - ret = iomap_iter_advance(iter); + /* detect old return semantics where this would advance */ + if (WARN_ON_ONCE(iter->status > 0)) + iter->status = -EIO; + + /* + * Use iter->len to determine whether to continue onto the next mapping. + * Explicitly terminate on error status or if the current iter has not + * advanced at all (i.e. no work was done for some reason) unless the + * mapping has been marked stale and needs to be reprocessed. 
+ */ + if (iter->status < 0) + ret = iter->status; + else if (iter->len == 0 || (!advanced && !stale)) + ret = 0; + else + ret = 1; + iomap_iter_reset_iomap(iter); if (ret <= 0) return ret; +begin: ret = ops->iomap_begin(iter->inode, iter->pos, iter->len, iter->flags, &iter->iomap, &iter->srcmap); if (ret < 0) diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c index a845c012b50c..04d7919636c1 100644 --- a/fs/iomap/seek.c +++ b/fs/iomap/seek.c @@ -10,7 +10,7 @@ #include <linux/pagemap.h> #include <linux/pagevec.h> -static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter, +static int iomap_seek_hole_iter(struct iomap_iter *iter, loff_t *hole_pos) { loff_t length = iomap_length(iter); @@ -20,13 +20,13 @@ static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter, *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, iter->pos, iter->pos + length, SEEK_HOLE); if (*hole_pos == iter->pos + length) - return length; + return iomap_iter_advance(iter, &length); return 0; case IOMAP_HOLE: *hole_pos = iter->pos; return 0; default: - return length; + return iomap_iter_advance(iter, &length); } } @@ -47,7 +47,7 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops) iter.len = size - pos; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_seek_hole_iter(&iter, &pos); + iter.status = iomap_seek_hole_iter(&iter, &pos); if (ret < 0) return ret; if (iter.len) /* found hole before EOF */ @@ -56,19 +56,19 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops) } EXPORT_SYMBOL_GPL(iomap_seek_hole); -static loff_t iomap_seek_data_iter(const struct iomap_iter *iter, +static int iomap_seek_data_iter(struct iomap_iter *iter, loff_t *hole_pos) { loff_t length = iomap_length(iter); switch (iter->iomap.type) { case IOMAP_HOLE: - return length; + return iomap_iter_advance(iter, &length); case IOMAP_UNWRITTEN: *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, iter->pos, iter->pos + length, SEEK_DATA); if (*hole_pos < 0) - return length; + return iomap_iter_advance(iter, &length); return 0; default: *hole_pos = iter->pos; @@ -93,7 +93,7 @@ iomap_seek_data(struct inode *inode, loff_t pos, const struct iomap_ops *ops) iter.len = size - pos; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_seek_data_iter(&iter, &pos); + iter.status = iomap_seek_data_iter(&iter, &pos); if (ret < 0) return ret; if (iter.len) /* found data before EOF */ diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index b90d0eda9e51..c1a762c10ce4 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -94,7 +94,7 @@ static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str) * swap only cares about contiguous page-aligned physical extents and makes no * distinction between written and unwritten extents. 
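The seek and swapfile conversions above illustrate the new iomap iteration contract: loop bodies no longer report a byte count through iter.processed, they set an int status in iter.status and advance the iterator themselves via iomap_iter_advance() (or iomap_iter_advance_full() for the whole mapping, as in the swapfile case). The sketch below is a minimal iterator written against that contract; the example_* names and the hole-counting purpose are invented for illustration, while iomap_iter(), iomap_iter_advance(), iomap_length() and the IOMAP_REPORT flag are existing iomap interfaces.

#include <linux/iomap.h>

static int example_count_holes_iter(struct iomap_iter *iter, u64 *holes)
{
	u64 length = iomap_length(iter);

	if (iter->iomap.type == IOMAP_HOLE)
		(*holes)++;

	/* consume the whole mapping; on return *length is what remains */
	return iomap_iter_advance(iter, &length);
}

static int example_count_holes(struct inode *inode, loff_t pos, loff_t len,
		const struct iomap_ops *ops, u64 *holes)
{
	struct iomap_iter iter = {
		.inode	= inode,
		.pos	= pos,
		.len	= len,
		.flags	= IOMAP_REPORT,
	};
	int ret;

	*holes = 0;
	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.status = example_count_holes_iter(&iter, holes);
	return ret;
}

If the loop body makes no progress and does not set a negative status, iomap_iter() now ends the walk unless the mapping was marked IOMAP_F_STALE and needs to be reprocessed.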
*/ -static loff_t iomap_swapfile_iter(const struct iomap_iter *iter, +static int iomap_swapfile_iter(struct iomap_iter *iter, struct iomap *iomap, struct iomap_swapfile_info *isi) { switch (iomap->type) { @@ -132,7 +132,8 @@ static loff_t iomap_swapfile_iter(const struct iomap_iter *iter, return error; memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); } - return iomap_length(iter); + + return iomap_iter_advance_full(iter); } /* @@ -166,7 +167,7 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, return ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_swapfile_iter(&iter, &iter.iomap, &isi); + iter.status = iomap_swapfile_iter(&iter, &iter.iomap, &isi); if (ret < 0) return ret; diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 4118a42cdab0..69af89044ebd 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -99,7 +99,7 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued); { IOMAP_FAULT, "FAULT" }, \ { IOMAP_DIRECT, "DIRECT" }, \ { IOMAP_NOWAIT, "NOWAIT" }, \ - { IOMAP_ATOMIC, "ATOMIC" } + { IOMAP_ATOMIC_HW, "ATOMIC_HW" } #define IOMAP_F_FLAGS_STRINGS \ { IOMAP_F_NEW, "NEW" }, \ @@ -207,7 +207,7 @@ TRACE_EVENT(iomap_iter, __field(u64, ino) __field(loff_t, pos) __field(u64, length) - __field(s64, processed) + __field(int, status) __field(unsigned int, flags) __field(const void *, ops) __field(unsigned long, caller) @@ -217,17 +217,17 @@ TRACE_EVENT(iomap_iter, __entry->ino = iter->inode->i_ino; __entry->pos = iter->pos; __entry->length = iomap_length(iter); - __entry->processed = iter->processed; + __entry->status = iter->status; __entry->flags = iter->flags; __entry->ops = ops; __entry->caller = caller; ), - TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx processed %lld flags %s (0x%x) ops %ps caller %pS", + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx status %d flags %s (0x%x) ops %ps caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pos, __entry->length, - __entry->processed, + __entry->status, __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS), __entry->flags, __entry->ops, diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 7afa51e41427..5bf501cf8271 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -64,6 +64,7 @@ xfs-y += $(addprefix libxfs/, \ xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \ xfs_rtbitmap.o \ xfs_rtgroup.o \ + xfs_zones.o \ ) # highlevel code @@ -136,7 +137,11 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ xfs_quotaops.o # xfs_rtbitmap is shared with libxfs -xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o +xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \ + xfs_zone_alloc.o \ + xfs_zone_gc.o \ + xfs_zone_info.o \ + xfs_zone_space_resv.o xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 0ef19f1469ec..63255820b58a 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -34,13 +34,13 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_refcount.h" -#include "xfs_icache.h" #include "xfs_iomap.h" #include "xfs_health.h" #include "xfs_bmap_item.h" #include "xfs_symlink_remote.h" #include "xfs_inode_util.h" #include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" struct kmem_cache *xfs_bmap_intent_cache; @@ -171,18 +171,16 @@ xfs_bmbt_update( * Compute the worst-case number of indirect blocks that will be used * for ip's delayed extent of length "len". 
*/ -STATIC xfs_filblks_t +xfs_filblks_t xfs_bmap_worst_indlen( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_filblks_t len) /* delayed extent length */ + struct xfs_inode *ip, /* incore inode pointer */ + xfs_filblks_t len) /* delayed extent length */ { - int level; /* btree level number */ - int maxrecs; /* maximum record count at this level */ - xfs_mount_t *mp; /* mount structure */ - xfs_filblks_t rval; /* return value */ + struct xfs_mount *mp = ip->i_mount; + int maxrecs = mp->m_bmap_dmxr[0]; + int level; + xfs_filblks_t rval; - mp = ip->i_mount; - maxrecs = mp->m_bmap_dmxr[0]; for (level = 0, rval = 0; level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); level++) { @@ -2572,146 +2570,6 @@ done: } /* - * Convert a hole to a delayed allocation. - */ -STATIC void -xfs_bmap_add_extent_hole_delay( - xfs_inode_t *ip, /* incore inode pointer */ - int whichfork, - struct xfs_iext_cursor *icur, - xfs_bmbt_irec_t *new) /* new data to add to file extents */ -{ - struct xfs_ifork *ifp; /* inode fork pointer */ - xfs_bmbt_irec_t left; /* left neighbor extent entry */ - xfs_filblks_t newlen=0; /* new indirect size */ - xfs_filblks_t oldlen=0; /* old indirect size */ - xfs_bmbt_irec_t right; /* right neighbor extent entry */ - uint32_t state = xfs_bmap_fork_to_state(whichfork); - xfs_filblks_t temp; /* temp for indirect calculations */ - - ifp = xfs_ifork_ptr(ip, whichfork); - ASSERT(isnullstartblock(new->br_startblock)); - - /* - * Check and set flags if this segment has a left neighbor - */ - if (xfs_iext_peek_prev_extent(ifp, icur, &left)) { - state |= BMAP_LEFT_VALID; - if (isnullstartblock(left.br_startblock)) - state |= BMAP_LEFT_DELAY; - } - - /* - * Check and set flags if the current (right) segment exists. - * If it doesn't exist, we're converting the hole at end-of-file. - */ - if (xfs_iext_get_extent(ifp, icur, &right)) { - state |= BMAP_RIGHT_VALID; - if (isnullstartblock(right.br_startblock)) - state |= BMAP_RIGHT_DELAY; - } - - /* - * Set contiguity flags on the left and right neighbors. - * Don't let extents get too large, even if the pieces are contiguous. - */ - if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && - left.br_startoff + left.br_blockcount == new->br_startoff && - left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) - state |= BMAP_LEFT_CONTIG; - - if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && - new->br_startoff + new->br_blockcount == right.br_startoff && - new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && - (!(state & BMAP_LEFT_CONTIG) || - (left.br_blockcount + new->br_blockcount + - right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))) - state |= BMAP_RIGHT_CONTIG; - - /* - * Switch out based on the contiguity flags. - */ - switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { - case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: - /* - * New allocation is contiguous with delayed allocations - * on the left and on the right. - * Merge all three into a single extent record. 
- */ - temp = left.br_blockcount + new->br_blockcount + - right.br_blockcount; - - oldlen = startblockval(left.br_startblock) + - startblockval(new->br_startblock) + - startblockval(right.br_startblock); - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - oldlen); - left.br_startblock = nullstartblock(newlen); - left.br_blockcount = temp; - - xfs_iext_remove(ip, icur, state); - xfs_iext_prev(ifp, icur); - xfs_iext_update_extent(ip, state, icur, &left); - break; - - case BMAP_LEFT_CONTIG: - /* - * New allocation is contiguous with a delayed allocation - * on the left. - * Merge the new allocation with the left neighbor. - */ - temp = left.br_blockcount + new->br_blockcount; - - oldlen = startblockval(left.br_startblock) + - startblockval(new->br_startblock); - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - oldlen); - left.br_blockcount = temp; - left.br_startblock = nullstartblock(newlen); - - xfs_iext_prev(ifp, icur); - xfs_iext_update_extent(ip, state, icur, &left); - break; - - case BMAP_RIGHT_CONTIG: - /* - * New allocation is contiguous with a delayed allocation - * on the right. - * Merge the new allocation with the right neighbor. - */ - temp = new->br_blockcount + right.br_blockcount; - oldlen = startblockval(new->br_startblock) + - startblockval(right.br_startblock); - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - oldlen); - right.br_startoff = new->br_startoff; - right.br_startblock = nullstartblock(newlen); - right.br_blockcount = temp; - xfs_iext_update_extent(ip, state, icur, &right); - break; - - case 0: - /* - * New allocation is not contiguous with another - * delayed allocation. - * Insert a new entry. - */ - oldlen = newlen = 0; - xfs_iext_insert(ip, icur, new, state); - break; - } - if (oldlen != newlen) { - ASSERT(oldlen > newlen); - xfs_add_fdblocks(ip->i_mount, oldlen - newlen); - - /* - * Nothing to do for disk quota accounting here. - */ - xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen); - } -} - -/* * Convert a hole to a real allocation. */ STATIC int /* error */ @@ -4039,144 +3897,6 @@ xfs_bmapi_read( return 0; } -/* - * Add a delayed allocation extent to an inode. Blocks are reserved from the - * global pool and the extent inserted into the inode in-core extent tree. - * - * On entry, got refers to the first extent beyond the offset of the extent to - * allocate or eof is specified if no such extent exists. On return, got refers - * to the extent record that was inserted to the inode fork. - * - * Note that the allocated extent may have been merged with contiguous extents - * during insertion into the inode fork. Thus, got does not reflect the current - * state of the inode fork on return. If necessary, the caller can use lastx to - * look up the updated record in the inode fork. - */ -int -xfs_bmapi_reserve_delalloc( - struct xfs_inode *ip, - int whichfork, - xfs_fileoff_t off, - xfs_filblks_t len, - xfs_filblks_t prealloc, - struct xfs_bmbt_irec *got, - struct xfs_iext_cursor *icur, - int eof) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); - xfs_extlen_t alen; - xfs_extlen_t indlen; - uint64_t fdblocks; - int error; - xfs_fileoff_t aoff; - bool use_cowextszhint = - whichfork == XFS_COW_FORK && !prealloc; - -retry: - /* - * Cap the alloc length. Keep track of prealloc so we know whether to - * tag the inode before we return. 
- */ - aoff = off; - alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); - if (!eof) - alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); - if (prealloc && alen >= len) - prealloc = alen - len; - - /* - * If we're targetting the COW fork but aren't creating a speculative - * posteof preallocation, try to expand the reservation to align with - * the COW extent size hint if there's sufficient free space. - * - * Unlike the data fork, the CoW cancellation functions will free all - * the reservations at inactivation, so we don't require that every - * delalloc reservation have a dirty pagecache. - */ - if (use_cowextszhint) { - struct xfs_bmbt_irec prev; - xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip); - - if (!xfs_iext_peek_prev_extent(ifp, icur, &prev)) - prev.br_startoff = NULLFILEOFF; - - error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof, - 1, 0, &aoff, &alen); - ASSERT(!error); - } - - /* - * Make a transaction-less quota reservation for delayed allocation - * blocks. This number gets adjusted later. We return if we haven't - * allocated blocks already inside this loop. - */ - error = xfs_quota_reserve_blkres(ip, alen); - if (error) - goto out; - - /* - * Split changing sb for alen and indlen since they could be coming - * from different places. - */ - indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); - ASSERT(indlen > 0); - - fdblocks = indlen; - if (XFS_IS_REALTIME_INODE(ip)) { - error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); - if (error) - goto out_unreserve_quota; - } else { - fdblocks += alen; - } - - error = xfs_dec_fdblocks(mp, fdblocks, false); - if (error) - goto out_unreserve_frextents; - - ip->i_delayed_blks += alen; - xfs_mod_delalloc(ip, alen, indlen); - - got->br_startoff = aoff; - got->br_startblock = nullstartblock(indlen); - got->br_blockcount = alen; - got->br_state = XFS_EXT_NORM; - - xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got); - - /* - * Tag the inode if blocks were preallocated. Note that COW fork - * preallocation can occur at the start or end of the extent, even when - * prealloc == 0, so we must also check the aligned offset and length. 
- */ - if (whichfork == XFS_DATA_FORK && prealloc) - xfs_inode_set_eofblocks_tag(ip); - if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) - xfs_inode_set_cowblocks_tag(ip); - - return 0; - -out_unreserve_frextents: - if (XFS_IS_REALTIME_INODE(ip)) - xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); -out_unreserve_quota: - if (XFS_IS_QUOTA_ON(mp)) - xfs_quota_unreserve_blkres(ip, alen); -out: - if (error == -ENOSPC || error == -EDQUOT) { - trace_xfs_delalloc_enospc(ip, off, len); - - if (prealloc || use_cowextszhint) { - /* retry without any preallocation */ - use_cowextszhint = false; - prealloc = 0; - goto retry; - } - } - return error; -} - static int xfs_bmapi_allocate( struct xfs_bmalloca *bma) @@ -4948,7 +4668,8 @@ xfs_bmap_del_extent_delay( int whichfork, struct xfs_iext_cursor *icur, struct xfs_bmbt_irec *got, - struct xfs_bmbt_irec *del) + struct xfs_bmbt_irec *del, + uint32_t bflags) /* bmapi flags */ { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); @@ -5068,10 +4789,18 @@ xfs_bmap_del_extent_delay( da_diff = da_old - da_new; fdblocks = da_diff; - if (isrt) - xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount)); - else + if (bflags & XFS_BMAPI_REMAP) { + ; + } else if (isrt) { + xfs_rtbxlen_t rtxlen; + + rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount); + if (xfs_is_zoned_inode(ip)) + xfs_zoned_add_available(mp, rtxlen); + xfs_add_frextents(mp, rtxlen); + } else { fdblocks += del->br_blockcount; + } xfs_add_fdblocks(mp, fdblocks); xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff); @@ -5670,7 +5399,8 @@ __xfs_bunmapi( delete: if (wasdel) { - xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del); + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, + &del, flags); } else { error = xfs_bmap_del_extent_real(ip, tp, &icur, cur, &del, &tmp_logflags, whichfork, diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 4b721d935994..b4d9c6e0f3f9 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -204,7 +204,7 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extnum_t nexts, int *done); void xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, - struct xfs_bmbt_irec *del); + struct xfs_bmbt_irec *del, uint32_t bflags); void xfs_bmap_del_extent_cow(struct xfs_inode *ip, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); @@ -219,10 +219,6 @@ int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, bool *done, xfs_fileoff_t stop_fsb); int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t split_offset); -int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, - xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, - struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, - int eof); int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork, xfs_off_t offset, struct iomap *iomap, unsigned int *seq); int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, @@ -233,6 +229,7 @@ xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip, int fork); int xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap, struct xfs_alloc_arg *args); +xfs_filblks_t xfs_bmap_worst_indlen(struct xfs_inode *ip, xfs_filblks_t len); enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index b1007fb661ba..9566a7623365 
100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -178,9 +178,10 @@ typedef struct xfs_sb { xfs_rgnumber_t sb_rgcount; /* number of realtime groups */ xfs_rtxlen_t sb_rgextents; /* size of a realtime group in rtx */ - uint8_t sb_rgblklog; /* rt group number shift */ uint8_t sb_pad[7]; /* zeroes */ + xfs_rfsblock_t sb_rtstart; /* start of internal RT section (FSB) */ + xfs_filblks_t sb_rtreserved; /* reserved (zoned) RT blocks */ /* must be padded to 64 bit alignment */ } xfs_sb_t; @@ -270,9 +271,10 @@ struct xfs_dsb { __be64 sb_metadirino; /* metadata directory tree root */ __be32 sb_rgcount; /* # of realtime groups */ __be32 sb_rgextents; /* size of rtgroup in rtx */ - __u8 sb_rgblklog; /* rt group number shift */ __u8 sb_pad[7]; /* zeroes */ + __be64 sb_rtstart; /* start of internal RT section (FSB) */ + __be64 sb_rtreserved; /* reserved (zoned) RT blocks */ /* * The size of this structure must be padded to 64 bit alignment. @@ -395,6 +397,9 @@ xfs_sb_has_ro_compat_feature( #define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */ #define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */ #define XFS_SB_FEAT_INCOMPAT_METADIR (1 << 8) /* metadata dir tree */ +#define XFS_SB_FEAT_INCOMPAT_ZONED (1 << 9) /* zoned RT allocator */ +#define XFS_SB_FEAT_INCOMPAT_ZONE_GAPS (1 << 10) /* RTGs have LBA gaps */ + #define XFS_SB_FEAT_INCOMPAT_ALL \ (XFS_SB_FEAT_INCOMPAT_FTYPE | \ XFS_SB_FEAT_INCOMPAT_SPINODES | \ @@ -404,7 +409,9 @@ xfs_sb_has_ro_compat_feature( XFS_SB_FEAT_INCOMPAT_NREXT64 | \ XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \ XFS_SB_FEAT_INCOMPAT_PARENT | \ - XFS_SB_FEAT_INCOMPAT_METADIR) + XFS_SB_FEAT_INCOMPAT_METADIR | \ + XFS_SB_FEAT_INCOMPAT_ZONED | \ + XFS_SB_FEAT_INCOMPAT_ZONE_GAPS) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool @@ -952,7 +959,12 @@ struct xfs_dinode { __be64 di_changecount; /* number of attribute changes */ __be64 di_lsn; /* flush sequence */ __be64 di_flags2; /* more random flags */ - __be32 di_cowextsize; /* basic cow extent size for file */ + union { + /* basic cow extent size for (regular) file */ + __be32 di_cowextsize; + /* used blocks in RTG for (zoned) rtrmap inode */ + __be32 di_used_blocks; + }; __u8 di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 2c3171262b44..12463ba766da 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -189,7 +189,9 @@ struct xfs_fsop_geom { uint32_t checked; /* o: checked fs & rt metadata */ __u32 rgextents; /* rt extents in a realtime group */ __u32 rgcount; /* number of realtime groups */ - __u64 reserved[16]; /* reserved space */ + __u64 rtstart; /* start of internal rt section */ + __u64 rtreserved; /* RT (zoned) reserved blocks */ + __u64 reserved[14]; /* reserved space */ }; #define XFS_FSOP_GEOM_SICK_COUNTERS (1 << 0) /* summary counters */ @@ -247,6 +249,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE (1 << 24) /* exchange range */ #define XFS_FSOP_GEOM_FLAGS_PARENT (1 << 25) /* linux parent pointers */ #define XFS_FSOP_GEOM_FLAGS_METADIR (1 << 26) /* metadata directories */ +#define XFS_FSOP_GEOM_FLAGS_ZONED (1 << 27) /* zoned rt device */ /* * Minimum and maximum sizes need for growth checks. 
@@ -1079,6 +1082,15 @@ struct xfs_rtgroup_geometry { #define XFS_IOC_COMMIT_RANGE _IOW ('X', 131, struct xfs_commit_range) /* XFS_IOC_GETFSUUID ---------- deprecated 140 */ +/* + * Devices supported by a single XFS file system. Reported in fsmaps fmr_device + * when using internal RT devices. + */ +enum xfs_device { + XFS_DEV_DATA = 1, + XFS_DEV_LOG = 2, + XFS_DEV_RT = 3, +}; #ifndef HAVE_BBMACROS /* diff --git a/fs/xfs/libxfs/xfs_group.h b/fs/xfs/libxfs/xfs_group.h index 242b05627c7a..4423932a2313 100644 --- a/fs/xfs/libxfs/xfs_group.h +++ b/fs/xfs/libxfs/xfs_group.h @@ -19,10 +19,23 @@ struct xfs_group { #ifdef __KERNEL__ /* -- kernel only structures below this line -- */ - /* - * Track freed but not yet committed extents. - */ - struct xfs_extent_busy_tree *xg_busy_extents; + union { + /* + * For perags and non-zoned RT groups: + * Track freed but not yet committed extents. + */ + struct xfs_extent_busy_tree *xg_busy_extents; + + /* + * For zoned RT groups: + * List of groups that need a zone reset. + * + * The zonegc code forces a log flush of the rtrmap inode before + * resetting the write pointer, so there is no need for + * individual busy extent tracking. + */ + struct xfs_group *xg_next_reset; + }; /* * Bitsets of per-ag metadata that have been checked and/or are sick. @@ -107,9 +120,15 @@ xfs_gbno_to_daddr( xfs_agblock_t gbno) { struct xfs_mount *mp = xg->xg_mount; - uint32_t blocks = mp->m_groups[xg->xg_type].blocks; + struct xfs_groups *g = &mp->m_groups[xg->xg_type]; + xfs_fsblock_t fsbno; + + if (g->has_daddr_gaps) + fsbno = xfs_gbno_to_fsb(xg, gbno); + else + fsbno = (xfs_fsblock_t)xg->xg_gno * g->blocks + gbno; - return XFS_FSB_TO_BB(mp, (xfs_fsblock_t)xg->xg_gno * blocks + gbno); + return XFS_FSB_TO_BB(mp, g->start_fsb + fsbno); } static inline uint32_t diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index f3a840a425f5..57513ba19d6a 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1927,7 +1927,7 @@ xfs_dialloc( * that we can immediately allocate, but then we allow allocation on the * second pass if we fail to find an AG with free inodes in it. 
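The new rtstart/rtreserved geometry fields and the XFS_FSOP_GEOM_FLAGS_ZONED flag above are exported through the existing XFS_IOC_FSGEOMETRY ioctl. A hedged userspace sketch (not part of this series) that reports them could look like the following; it assumes installed xfsprogs headers that already carry the new fields.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <xfs/xfs.h>	/* XFS_IOC_FSGEOMETRY, struct xfs_fsop_geom */

int main(int argc, char **argv)
{
	struct xfs_fsop_geom geo = { 0 };
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || ioctl(fd, XFS_IOC_FSGEOMETRY, &geo) < 0) {
		perror(argv[1]);
		return 1;
	}
	if (geo.flags & XFS_FSOP_GEOM_FLAGS_ZONED)
		printf("zoned rt: starts at block %llu, %llu reserved blocks\n",
		       (unsigned long long)geo.rtstart,
		       (unsigned long long)geo.rtreserved);
	else
		printf("not a zoned filesystem\n");
	close(fd);
	return 0;
}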
*/ - if (percpu_counter_read_positive(&mp->m_fdblocks) < + if (xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) < mp->m_low_space[XFS_LOWSP_1_PCNT]) { ok_alloc = false; low_space = true; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index f24fa628fecf..992e6d337709 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -252,7 +252,10 @@ xfs_inode_from_disk( be64_to_cpu(from->di_changecount)); ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime); ip->i_diflags2 = be64_to_cpu(from->di_flags2); + /* also covers the di_used_blocks union arm: */ ip->i_cowextsize = be32_to_cpu(from->di_cowextsize); + BUILD_BUG_ON(sizeof(from->di_cowextsize) != + sizeof(from->di_used_blocks)); } error = xfs_iformat_data_fork(ip, from); @@ -349,6 +352,7 @@ xfs_inode_to_disk( to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime); to->di_flags2 = cpu_to_be64(ip->i_diflags2); + /* also covers the di_used_blocks union arm: */ to->di_cowextsize = cpu_to_be32(ip->i_cowextsize); to->di_ino = cpu_to_be64(ip->i_ino); to->di_lsn = cpu_to_be64(lsn); @@ -752,11 +756,18 @@ xfs_dinode_verify( !xfs_has_rtreflink(mp)) return __this_address; - /* COW extent size hint validation */ - fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), - mode, flags, flags2); - if (fa) - return fa; + if (xfs_has_zoned(mp) && + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) { + if (be32_to_cpu(dip->di_used_blocks) > mp->m_sb.sb_rgextents) + return __this_address; + } else { + /* COW extent size hint validation */ + fa = xfs_inode_validate_cowextsize(mp, + be32_to_cpu(dip->di_cowextsize), + mode, flags, flags2); + if (fa) + return fa; + } /* bigtime iflag can only happen on bigtime filesystems */ if (xfs_dinode_has_bigtime(dip) && diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c index deb0b7c00a1f..48fe49a5f050 100644 --- a/fs/xfs/libxfs/xfs_inode_util.c +++ b/fs/xfs/libxfs/xfs_inode_util.c @@ -322,6 +322,7 @@ xfs_inode_init( if (xfs_has_v3inodes(mp)) { inode_set_iversion(inode, 1); + /* also covers the di_used_blocks union arm: */ ip->i_cowextsize = 0; times |= XFS_ICHGTIME_CREATE; } diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index a472ac2e45d0..0d637c276db0 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -475,7 +475,12 @@ struct xfs_log_dinode { xfs_lsn_t di_lsn; uint64_t di_flags2; /* more random flags */ - uint32_t di_cowextsize; /* basic cow extent size for file */ + union { + /* basic cow extent size for (regular) file */ + uint32_t di_cowextsize; + /* used blocks in RTG for (zoned) rtrmap inode */ + uint32_t di_used_blocks; + }; uint8_t di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c index 2f5f554a36d4..225923e463c4 100644 --- a/fs/xfs/libxfs/xfs_metafile.c +++ b/fs/xfs/libxfs/xfs_metafile.c @@ -21,6 +21,9 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_alloc.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" static const struct { enum xfs_metafile_type mtype; @@ -74,12 +77,11 @@ xfs_metafile_clear_iflag( } /* - * Is the amount of space that could be allocated towards a given metadata - * file at or beneath a certain threshold? + * Is the metafile reservations at or beneath a certain threshold? 
*/ static inline bool xfs_metafile_resv_can_cover( - struct xfs_inode *ip, + struct xfs_mount *mp, int64_t rhs) { /* @@ -88,43 +90,38 @@ xfs_metafile_resv_can_cover( * global free block count. Take care of the first case to avoid * touching the per-cpu counter. */ - if (ip->i_delayed_blks >= rhs) + if (mp->m_metafile_resv_avail >= rhs) return true; /* * There aren't enough blocks left in the inode's reservation, but it * isn't critical unless there also isn't enough free space. */ - return __percpu_counter_compare(&ip->i_mount->m_fdblocks, - rhs - ip->i_delayed_blks, 2048) >= 0; + return xfs_compare_freecounter(mp, XC_FREE_BLOCKS, + rhs - mp->m_metafile_resv_avail, 2048) >= 0; } /* - * Is this metadata file critically low on blocks? For now we'll define that - * as the number of blocks we can get our hands on being less than 10% of what - * we reserved or less than some arbitrary number (maximum btree height). + * Is the metafile reservation critically low on blocks? For now we'll define + * that as the number of blocks we can get our hands on being less than 10% of + * what we reserved or less than some arbitrary number (maximum btree height). */ bool xfs_metafile_resv_critical( - struct xfs_inode *ip) + struct xfs_mount *mp) { - uint64_t asked_low_water; + ASSERT(xfs_has_metadir(mp)); - if (!ip) - return false; - - ASSERT(xfs_is_metadir_inode(ip)); - trace_xfs_metafile_resv_critical(ip, 0); + trace_xfs_metafile_resv_critical(mp, 0); - if (!xfs_metafile_resv_can_cover(ip, ip->i_mount->m_rtbtree_maxlevels)) + if (!xfs_metafile_resv_can_cover(mp, mp->m_rtbtree_maxlevels)) return true; - asked_low_water = div_u64(ip->i_meta_resv_asked, 10); - if (!xfs_metafile_resv_can_cover(ip, asked_low_water)) + if (!xfs_metafile_resv_can_cover(mp, + div_u64(mp->m_metafile_resv_target, 10))) return true; - return XFS_TEST_ERROR(false, ip->i_mount, - XFS_ERRTAG_METAFILE_RESV_CRITICAL); + return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL); } /* Allocate a block from the metadata file's reservation. */ @@ -133,22 +130,24 @@ xfs_metafile_resv_alloc_space( struct xfs_inode *ip, struct xfs_alloc_arg *args) { + struct xfs_mount *mp = ip->i_mount; int64_t len = args->len; ASSERT(xfs_is_metadir_inode(ip)); ASSERT(args->resv == XFS_AG_RESV_METAFILE); - trace_xfs_metafile_resv_alloc_space(ip, args->len); + trace_xfs_metafile_resv_alloc_space(mp, args->len); /* * Allocate the blocks from the metadata inode's block reservation * and update the ondisk sb counter. 
*/ - if (ip->i_delayed_blks > 0) { + mutex_lock(&mp->m_metafile_resv_lock); + if (mp->m_metafile_resv_avail > 0) { int64_t from_resv; - from_resv = min_t(int64_t, len, ip->i_delayed_blks); - ip->i_delayed_blks -= from_resv; + from_resv = min_t(int64_t, len, mp->m_metafile_resv_avail); + mp->m_metafile_resv_avail -= from_resv; xfs_mod_delalloc(ip, 0, -from_resv); xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -from_resv); @@ -175,6 +174,9 @@ xfs_metafile_resv_alloc_space( xfs_trans_mod_sb(args->tp, field, -len); } + mp->m_metafile_resv_used += args->len; + mutex_unlock(&mp->m_metafile_resv_lock); + ip->i_nblocks += args->len; xfs_trans_log_inode(args->tp, ip, XFS_ILOG_CORE); } @@ -186,26 +188,33 @@ xfs_metafile_resv_free_space( struct xfs_trans *tp, xfs_filblks_t len) { + struct xfs_mount *mp = ip->i_mount; int64_t to_resv; ASSERT(xfs_is_metadir_inode(ip)); - trace_xfs_metafile_resv_free_space(ip, len); + + trace_xfs_metafile_resv_free_space(mp, len); ip->i_nblocks -= len; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + mutex_lock(&mp->m_metafile_resv_lock); + mp->m_metafile_resv_used -= len; + /* * Add the freed blocks back into the inode's delalloc reservation * until it reaches the maximum size. Update the ondisk fdblocks only. */ - to_resv = ip->i_meta_resv_asked - (ip->i_nblocks + ip->i_delayed_blks); + to_resv = mp->m_metafile_resv_target - + (mp->m_metafile_resv_used + mp->m_metafile_resv_avail); if (to_resv > 0) { to_resv = min_t(int64_t, to_resv, len); - ip->i_delayed_blks += to_resv; + mp->m_metafile_resv_avail += to_resv; xfs_mod_delalloc(ip, 0, to_resv); xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv); len -= to_resv; } + mutex_unlock(&mp->m_metafile_resv_lock); /* * Everything else goes back to the filesystem, so update the in-core @@ -215,61 +224,99 @@ xfs_metafile_resv_free_space( xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len); } -/* Release a metadata file's space reservation. */ +static void +__xfs_metafile_resv_free( + struct xfs_mount *mp) +{ + if (mp->m_metafile_resv_avail) { + xfs_mod_sb_delalloc(mp, -(int64_t)mp->m_metafile_resv_avail); + xfs_add_fdblocks(mp, mp->m_metafile_resv_avail); + } + mp->m_metafile_resv_avail = 0; + mp->m_metafile_resv_used = 0; + mp->m_metafile_resv_target = 0; +} + +/* Release unused metafile space reservation. */ void xfs_metafile_resv_free( - struct xfs_inode *ip) + struct xfs_mount *mp) { - /* Non-btree metadata inodes don't need space reservations. */ - if (!ip || !ip->i_meta_resv_asked) + if (!xfs_has_metadir(mp)) return; - ASSERT(xfs_is_metadir_inode(ip)); - trace_xfs_metafile_resv_free(ip, 0); + trace_xfs_metafile_resv_free(mp, 0); - if (ip->i_delayed_blks) { - xfs_mod_delalloc(ip, 0, -ip->i_delayed_blks); - xfs_add_fdblocks(ip->i_mount, ip->i_delayed_blks); - ip->i_delayed_blks = 0; - } - ip->i_meta_resv_asked = 0; + mutex_lock(&mp->m_metafile_resv_lock); + __xfs_metafile_resv_free(mp); + mutex_unlock(&mp->m_metafile_resv_lock); } -/* Set up a metadata file's space reservation. */ +/* Set up a metafile space reservation. */ int xfs_metafile_resv_init( - struct xfs_inode *ip, - xfs_filblks_t ask) + struct xfs_mount *mp) { + struct xfs_rtgroup *rtg = NULL; + xfs_filblks_t used = 0, target = 0; xfs_filblks_t hidden_space; - xfs_filblks_t used; - int error; + xfs_rfsblock_t dblocks_avail = mp->m_sb.sb_dblocks / 4; + int error = 0; - if (!ip || ip->i_meta_resv_asked > 0) + if (!xfs_has_metadir(mp)) return 0; - ASSERT(xfs_is_metadir_inode(ip)); + /* + * Free any previous reservation to have a clean slate. 
+ */ + mutex_lock(&mp->m_metafile_resv_lock); + __xfs_metafile_resv_free(mp); + + /* + * Currently the only btree metafiles that require reservations are the + * rtrmap and the rtrefcount. Anything new will have to be added here + * as well. + */ + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + if (xfs_has_rtrmapbt(mp)) { + used += rtg_rmap(rtg)->i_nblocks; + target += xfs_rtrmapbt_calc_reserves(mp); + } + if (xfs_has_rtreflink(mp)) { + used += rtg_refcount(rtg)->i_nblocks; + target += xfs_rtrefcountbt_calc_reserves(mp); + } + } + + if (!target) + goto out_unlock; /* - * Space taken by all other metadata btrees are accounted on-disk as + * Space taken by the per-AG metadata btrees are accounted on-disk as * used space. We therefore only hide the space that is reserved but * not used by the trees. */ - used = ip->i_nblocks; - if (used > ask) - ask = used; - hidden_space = ask - used; + if (used > target) + target = used; + else if (target > dblocks_avail) + target = dblocks_avail; + hidden_space = target - used; - error = xfs_dec_fdblocks(ip->i_mount, hidden_space, true); + error = xfs_dec_fdblocks(mp, hidden_space, true); if (error) { - trace_xfs_metafile_resv_init_error(ip, error, _RET_IP_); - return error; + trace_xfs_metafile_resv_init_error(mp, 0); + goto out_unlock; } - xfs_mod_delalloc(ip, 0, hidden_space); - ip->i_delayed_blks = hidden_space; - ip->i_meta_resv_asked = ask; + xfs_mod_sb_delalloc(mp, hidden_space); + + mp->m_metafile_resv_target = target; + mp->m_metafile_resv_used = used; + mp->m_metafile_resv_avail = hidden_space; + + trace_xfs_metafile_resv_init(mp, target); - trace_xfs_metafile_resv_init(ip, ask); - return 0; +out_unlock: + mutex_unlock(&mp->m_metafile_resv_lock); + return error; } diff --git a/fs/xfs/libxfs/xfs_metafile.h b/fs/xfs/libxfs/xfs_metafile.h index 95af4b52e5a7..ae6f9e779b98 100644 --- a/fs/xfs/libxfs/xfs_metafile.h +++ b/fs/xfs/libxfs/xfs_metafile.h @@ -26,13 +26,13 @@ void xfs_metafile_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip); /* Space reservations for metadata inodes. */ struct xfs_alloc_arg; -bool xfs_metafile_resv_critical(struct xfs_inode *ip); +bool xfs_metafile_resv_critical(struct xfs_mount *mp); void xfs_metafile_resv_alloc_space(struct xfs_inode *ip, struct xfs_alloc_arg *args); void xfs_metafile_resv_free_space(struct xfs_inode *ip, struct xfs_trans *tp, xfs_filblks_t len); -void xfs_metafile_resv_free(struct xfs_inode *ip); -int xfs_metafile_resv_init(struct xfs_inode *ip, xfs_filblks_t ask); +void xfs_metafile_resv_free(struct xfs_mount *mp); +int xfs_metafile_resv_init(struct xfs_mount *mp); /* Code specific to kernel/userspace; must be provided externally. 
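To make the reworked accounting easier to follow, here is a small standalone model (an illustration only, not kernel code) of the three per-mount fields used above: m_metafile_resv_target, m_metafile_resv_used and m_metafile_resv_avail. It mirrors the init, alloc and free paths in simplified form; locking, quota, and the fdblocks/superblock counter updates are left out.

#include <assert.h>
#include <stdint.h>

struct metafile_resv {
	uint64_t target;	/* m_metafile_resv_target */
	uint64_t used;		/* m_metafile_resv_used   */
	uint64_t avail;		/* m_metafile_resv_avail  */
};

/* xfs_metafile_resv_init(): hide (target - used) blocks from free space */
static void resv_init(struct metafile_resv *r, uint64_t used, uint64_t target)
{
	if (used > target)
		target = used;
	r->target = target;
	r->used = used;
	r->avail = target - used;
}

/* xfs_metafile_resv_alloc_space(): prefer the reservation, then fdblocks */
static void resv_alloc(struct metafile_resv *r, uint64_t len)
{
	uint64_t from_resv = len < r->avail ? len : r->avail;

	r->avail -= from_resv;	/* the remainder comes out of fdblocks */
	r->used += len;
}

/* xfs_metafile_resv_free_space(): refill up to target, rest to fdblocks */
static void resv_free(struct metafile_resv *r, uint64_t len)
{
	int64_t to_resv;

	r->used -= len;
	to_resv = (int64_t)r->target - (int64_t)(r->used + r->avail);
	if (to_resv > 0) {
		if ((uint64_t)to_resv > len)
			to_resv = (int64_t)len;
		r->avail += (uint64_t)to_resv;
		len -= (uint64_t)to_resv;
	}
	/* whatever is left in 'len' returns to the global free block count */
}

int main(void)
{
	struct metafile_resv r;

	resv_init(&r, 10, 100);		/* 90 blocks hidden as avail */
	resv_alloc(&r, 25);		/* used 35, avail 65 */
	resv_free(&r, 5);		/* used 30, avail 70 */
	assert(r.used + r.avail == r.target);
	return 0;
}

When allocations outgrow the target, the excess is charged to the global free block count, and later frees refill avail only until used + avail reaches the target again; anything beyond that goes straight back to fdblocks.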
*/ diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index a85ecddaa48e..5ed44fdf7491 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -233,8 +233,8 @@ xfs_check_ondisk_structs(void) 16299260424LL); /* superblock field checks we got from xfs/122 */ - XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 288); - XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 288); + XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 304); + XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 304); XFS_CHECK_SB_OFFSET(sb_magicnum, 0); XFS_CHECK_SB_OFFSET(sb_blocksize, 4); XFS_CHECK_SB_OFFSET(sb_dblocks, 8); @@ -295,6 +295,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_SB_OFFSET(sb_rgextents, 276); XFS_CHECK_SB_OFFSET(sb_rgblklog, 280); XFS_CHECK_SB_OFFSET(sb_pad, 281); + XFS_CHECK_SB_OFFSET(sb_rtstart, 288); + XFS_CHECK_SB_OFFSET(sb_rtreserved, 296); } #endif /* __XFS_ONDISK_H */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 770adf60dd73..5057536e586c 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1123,6 +1123,7 @@ xfs_rtfree_blocks( xfs_extlen_t mod; int error; + ASSERT(!xfs_has_zoned(mp)); ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN); mod = xfs_blen_to_rtxoff(mp, rtlen); @@ -1174,6 +1175,9 @@ xfs_rtalloc_query_range( end = min(end, rtg->rtg_extents - 1); + if (xfs_has_zoned(mp)) + return -EINVAL; + /* Iterate the bitmap, looking for discrepancies. */ while (start <= end) { struct xfs_rtalloc_rec rec; @@ -1268,6 +1272,8 @@ xfs_rtbitmap_blockcount_len( struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { + if (xfs_has_zoned(mp)) + return 0; return howmany_64(rtextents, xfs_rtbitmap_rtx_per_rbmblock(mp)); } @@ -1308,6 +1314,11 @@ xfs_rtsummary_blockcount( xfs_rtbxlen_t rextents = xfs_rtbitmap_bitcount(mp); unsigned long long rsumwords; + if (xfs_has_zoned(mp)) { + *rsumlevels = 0; + return 0; + } + *rsumlevels = xfs_compute_rextslog(rextents) + 1; rsumwords = xfs_rtbitmap_blockcount_len(mp, rextents) * (*rsumlevels); return howmany_64(rsumwords, mp->m_blockwsize); diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c index d84d32f1b48f..9186c58e83d5 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.c +++ b/fs/xfs/libxfs/xfs_rtgroup.c @@ -194,15 +194,17 @@ xfs_rtgroup_lock( ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) || !(rtglock_flags & XFS_RTGLOCK_BITMAP)); - if (rtglock_flags & XFS_RTGLOCK_BITMAP) { - /* - * Lock both realtime free space metadata inodes for a freespace - * update. - */ - xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); - xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL); - } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { - xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); + if (!xfs_has_zoned(rtg_mount(rtg))) { + if (rtglock_flags & XFS_RTGLOCK_BITMAP) { + /* + * Lock both realtime free space metadata inodes for a + * freespace update. 
+ */ + xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); + xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL); + } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { + xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); + } } if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) @@ -228,11 +230,13 @@ xfs_rtgroup_unlock( if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_EXCL); - if (rtglock_flags & XFS_RTGLOCK_BITMAP) { - xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL); - xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); - } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { - xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); + if (!xfs_has_zoned(rtg_mount(rtg))) { + if (rtglock_flags & XFS_RTGLOCK_BITMAP) { + xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL); + xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); + } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { + xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); + } } } @@ -249,7 +253,8 @@ xfs_rtgroup_trans_join( ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS)); ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED)); - if (rtglock_flags & XFS_RTGLOCK_BITMAP) { + if (!xfs_has_zoned(rtg_mount(rtg)) && + (rtglock_flags & XFS_RTGLOCK_BITMAP)) { xfs_trans_ijoin(tp, rtg_bitmap(rtg), XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, rtg_summary(rtg), XFS_ILOCK_EXCL); } @@ -270,7 +275,7 @@ xfs_rtgroup_get_geometry( /* Fill out form. */ memset(rgeo, 0, sizeof(*rgeo)); rgeo->rg_number = rtg_rgno(rtg); - rgeo->rg_length = rtg_group(rtg)->xg_block_count; + rgeo->rg_length = rtg_blocks(rtg); xfs_rtgroup_geom_health(rtg, rgeo); return 0; } @@ -354,6 +359,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = { .sick = XFS_SICK_RG_BITMAP, .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) | (1U << XFS_DINODE_FMT_BTREE), + .enabled = xfs_has_nonzoned, .create = xfs_rtbitmap_create, }, [XFS_RTGI_SUMMARY] = { @@ -362,6 +368,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = { .sick = XFS_SICK_RG_SUMMARY, .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) | (1U << XFS_DINODE_FMT_BTREE), + .enabled = xfs_has_nonzoned, .create = xfs_rtsummary_create, }, [XFS_RTGI_RMAP] = { diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h index 03f39d4e43fc..d36a6ae0abe5 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.h +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -37,15 +37,33 @@ struct xfs_rtgroup { xfs_rtxnum_t rtg_extents; /* - * Cache of rt summary level per bitmap block with the invariant that - * rtg_rsum_cache[bbno] > the maximum i for which rsum[i][bbno] != 0, - * or 0 if rsum[i][bbno] == 0 for all i. - * + * For bitmap based RT devices this points to a cache of rt summary + * level per bitmap block with the invariant that rtg_rsum_cache[bbno] + * > the maximum i for which rsum[i][bbno] != 0, or 0 if + * rsum[i][bbno] == 0 for all i. * Reads and writes are serialized by the rsumip inode lock. + * + * For zoned RT devices this points to the open zone structure for + * a group that is open for writers, or is NULL. */ - uint8_t *rtg_rsum_cache; + union { + uint8_t *rtg_rsum_cache; + struct xfs_open_zone *rtg_open_zone; + }; }; +/* + * For zoned RT devices this is set on groups that have no written blocks + * and can be picked by the allocator for opening. + */ +#define XFS_RTG_FREE XA_MARK_0 + +/* + * For zoned RT devices this is set on groups that are fully written and that + * have unused blocks. Used by the garbage collection to pick targets. 
+ */ +#define XFS_RTG_RECLAIMABLE XA_MARK_1 + static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg) { return container_of(xg, struct xfs_rtgroup, rtg_group); @@ -66,6 +84,11 @@ static inline xfs_rgnumber_t rtg_rgno(const struct xfs_rtgroup *rtg) return rtg->rtg_group.xg_gno; } +static inline xfs_rgblock_t rtg_blocks(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_group.xg_block_count; +} + static inline struct xfs_inode *rtg_bitmap(const struct xfs_rtgroup *rtg) { return rtg->rtg_inodes[XFS_RTGI_BITMAP]; @@ -222,10 +245,14 @@ xfs_rtb_to_daddr( xfs_rtblock_t rtbno) { struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; - xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno); - uint64_t start_bno = (xfs_rtblock_t)rgno * g->blocks; - return XFS_FSB_TO_BB(mp, start_bno + (rtbno & g->blkmask)); + if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) { + xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno); + + rtbno = (xfs_rtblock_t)rgno * g->blocks + (rtbno & g->blkmask); + } + + return XFS_FSB_TO_BB(mp, g->start_fsb + rtbno); } static inline xfs_rtblock_t @@ -233,10 +260,11 @@ xfs_daddr_to_rtb( struct xfs_mount *mp, xfs_daddr_t daddr) { - xfs_rfsblock_t bno = XFS_BB_TO_FSBT(mp, daddr); + struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; + xfs_rfsblock_t bno; - if (xfs_has_rtgroups(mp)) { - struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; + bno = XFS_BB_TO_FSBT(mp, daddr) - g->start_fsb; + if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) { xfs_rgnumber_t rgno; uint32_t rgbno; diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c index e4ec36943cb7..9bdc2cbfc113 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.c +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c @@ -1033,3 +1033,22 @@ xfs_rtrmapbt_init_rtsb( xfs_btree_del_cursor(cur, error); return error; } + +/* + * Return the highest rgbno currently tracked by the rmap for this rtg. + */ +xfs_rgblock_t +xfs_rtrmap_highest_rgbno( + struct xfs_rtgroup *rtg) +{ + struct xfs_btree_block *block = rtg_rmap(rtg)->i_df.if_broot; + union xfs_btree_key key = {}; + struct xfs_btree_cur *cur; + + if (block->bb_numrecs == 0) + return NULLRGBLOCK; + cur = xfs_rtrmapbt_init_cursor(NULL, rtg); + xfs_btree_get_keys(cur, block, &key); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return be32_to_cpu(key.__rmap_bigkey[1].rm_startblock); +} diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h index 9d0915089891..e328fd62a149 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.h +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h @@ -207,4 +207,6 @@ struct xfs_btree_cur *xfs_rtrmapbt_mem_cursor(struct xfs_rtgroup *rtg, int xfs_rtrmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree, struct xfs_buftarg *btp, xfs_rgnumber_t rgno); +xfs_rgblock_t xfs_rtrmap_highest_rgbno(struct xfs_rtgroup *rtg); + #endif /* __XFS_RTRMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 3dc5f5dba162..e42bfd04a7c6 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -30,6 +30,7 @@ #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" #include "xfs_rtrefcount_btree.h" +#include "xfs_rtbitmap.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. 
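A short worked example (made-up geometry, not taken from the patch) of the xfs_rtb_to_daddr() translation above: assume 4k blocks (8 basic blocks per FSB), rtgroups of 50000 blocks, a 16-bit group shift, an internal RT section starting at FSB 1000 (sb_rtstart), and the rtgroups encoding of an rtbno as (rgno << blklog) | rgbno that the blkmask arithmetic above implies.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t start_fsb = 1000;		/* g->start_fsb (sb_rtstart) */
	const uint64_t blocks = 50000;			/* g->blocks */
	const unsigned int blklog = 16;			/* g->blklog */
	const uint64_t blkmask = (1ULL << blklog) - 1;	/* g->blkmask */
	const uint64_t bb_per_fsb = 8;			/* 4k blocks, 512b sectors */
	const uint64_t rgno = 2, rgbno = 100;
	const uint64_t rtbno = (rgno << blklog) | rgbno;
	uint64_t fsbno;

	/* !has_daddr_gaps: groups are packed back to back past sb_rtstart */
	fsbno = rgno * blocks + (rtbno & blkmask);
	printf("packed: daddr = %llu\n",		/* (1000 + 100100) * 8 = 808800 */
	       (unsigned long long)((start_fsb + fsbno) * bb_per_fsb));

	/* has_daddr_gaps: the shifted block number is also the on-disk layout */
	fsbno = rtbno;
	printf("gapped: daddr = %llu\n",		/* (1000 + 131172) * 8 = 1057376 */
	       (unsigned long long)((start_fsb + fsbno) * bb_per_fsb));
	return 0;
}

The gapped layout keeps the power-of-two spacing between groups on disk instead of packing them, which is what the new XFS_SB_FEAT_INCOMPAT_ZONE_GAPS feature bit advertises.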
@@ -185,6 +186,8 @@ xfs_sb_version_to_features( features |= XFS_FEAT_PARENT; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) features |= XFS_FEAT_METADIR; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) + features |= XFS_FEAT_ZONED; return features; } @@ -266,6 +269,9 @@ static uint64_t xfs_expected_rbmblocks( struct xfs_sb *sbp) { + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) + return 0; return howmany_64(xfs_extents_per_rbm(sbp), NBBY * xfs_rtbmblock_size(sbp)); } @@ -275,9 +281,15 @@ bool xfs_validate_rt_geometry( struct xfs_sb *sbp) { - if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || - sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) - return false; + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) { + if (sbp->sb_rextsize != 1) + return false; + } else { + if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || + sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) + return false; + } if (sbp->sb_rblocks == 0) { if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 || @@ -435,6 +447,34 @@ xfs_validate_sb_rtgroups( return 0; } +static int +xfs_validate_sb_zoned( + struct xfs_mount *mp, + struct xfs_sb *sbp) +{ + if (sbp->sb_frextents != 0) { + xfs_warn(mp, +"sb_frextents must be zero for zoned file systems."); + return -EINVAL; + } + + if (sbp->sb_rtstart && sbp->sb_rtstart < sbp->sb_dblocks) { + xfs_warn(mp, +"sb_rtstart (%lld) overlaps sb_dblocks (%lld).", + sbp->sb_rtstart, sbp->sb_dblocks); + return -EINVAL; + } + + if (sbp->sb_rtreserved && sbp->sb_rtreserved >= sbp->sb_rblocks) { + xfs_warn(mp, +"sb_rtreserved (%lld) larger than sb_rblocks (%lld).", + sbp->sb_rtreserved, sbp->sb_rblocks); + return -EINVAL; + } + + return 0; +} + /* Check the validity of the SB. 
*/ STATIC int xfs_validate_sb_common( @@ -523,6 +563,11 @@ xfs_validate_sb_common( if (error) return error; } + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { + error = xfs_validate_sb_zoned(mp, sbp); + if (error) + return error; + } } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { xfs_notice(mp, @@ -835,6 +880,14 @@ __xfs_sb_from_disk( to->sb_rgcount = 1; to->sb_rgextents = 0; } + + if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { + to->sb_rtstart = be64_to_cpu(from->sb_rtstart); + to->sb_rtreserved = be64_to_cpu(from->sb_rtreserved); + } else { + to->sb_rtstart = 0; + to->sb_rtreserved = 0; + } } void @@ -1001,6 +1054,11 @@ xfs_sb_to_disk( to->sb_rbmino = cpu_to_be64(0); to->sb_rsumino = cpu_to_be64(0); } + + if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { + to->sb_rtstart = cpu_to_be64(from->sb_rtstart); + to->sb_rtreserved = cpu_to_be64(from->sb_rtreserved); + } } /* @@ -1146,6 +1204,10 @@ xfs_sb_mount_rextsize( rgs->blocks = sbp->sb_rgextents * sbp->sb_rextsize; rgs->blklog = mp->m_sb.sb_rgblklog; rgs->blkmask = xfs_mask32lo(mp->m_sb.sb_rgblklog); + rgs->start_fsb = mp->m_sb.sb_rtstart; + if (xfs_sb_has_incompat_feature(sbp, + XFS_SB_FEAT_INCOMPAT_ZONE_GAPS)) + rgs->has_daddr_gaps = true; } else { rgs->blocks = 0; rgs->blklog = 0; @@ -1265,8 +1327,7 @@ xfs_log_sb( mp->m_sb.sb_ifree = min_t(uint64_t, percpu_counter_sum_positive(&mp->m_ifree), mp->m_sb.sb_icount); - mp->m_sb.sb_fdblocks = - percpu_counter_sum_positive(&mp->m_fdblocks); + mp->m_sb.sb_fdblocks = xfs_sum_freecounter(mp, XC_FREE_BLOCKS); } /* @@ -1275,9 +1336,10 @@ xfs_log_sb( * we handle nearly-lockless reservations, so we must use the _positive * variant here to avoid writing out nonsense frextents. */ - if (xfs_has_rtgroups(mp)) + if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) { mp->m_sb.sb_frextents = - percpu_counter_sum_positive(&mp->m_frextents); + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS); + } xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); @@ -1510,6 +1572,8 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE; if (xfs_has_metadir(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_METADIR; + if (xfs_has_zoned(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_ZONED; geo->rtsectsize = sbp->sb_blocksize; geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); @@ -1530,6 +1594,10 @@ xfs_fs_geometry( geo->rgcount = sbp->sb_rgcount; geo->rgextents = sbp->sb_rgextents; } + if (xfs_has_zoned(mp)) { + geo->rtstart = sbp->sb_rtstart; + geo->rtreserved = sbp->sb_rtreserved; + } } /* Read a secondary superblock. */ diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index ca2401c1facd..f6f4f2d4b5db 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -233,6 +233,34 @@ enum xfs_group_type { { XG_TYPE_AG, "ag" }, \ { XG_TYPE_RTG, "rtg" } +enum xfs_free_counter { + /* + * Number of free blocks on the data device. + */ + XC_FREE_BLOCKS, + + /* + * Number of free RT extents on the RT device. + */ + XC_FREE_RTEXTENTS, + + /* + * Number of available for use RT extents. + * + * This counter only exists for zoned RT device and indicates the number + * of RT extents that can be directly used by writes. XC_FREE_RTEXTENTS + * also includes blocks that have been written previously and freed, but + * sit in a rtgroup that still needs a zone reset. 
+ */ + XC_FREE_RTAVAILABLE, + XC_FREE_NR, +}; + +#define XFS_FREECOUNTER_STR \ + { XC_FREE_BLOCKS, "blocks" }, \ + { XC_FREE_RTEXTENTS, "rtextents" }, \ + { XC_FREE_RTAVAILABLE, "rtavailable" } + /* * Type verifier functions */ diff --git a/fs/xfs/libxfs/xfs_zones.c b/fs/xfs/libxfs/xfs_zones.c new file mode 100644 index 000000000000..b0791a71931c --- /dev/null +++ b/fs/xfs/libxfs/xfs_zones.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_rtgroup.h" +#include "xfs_zones.h" + +static bool +xfs_zone_validate_empty( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (rtg_rmap(rtg)->i_used_blocks > 0) { + xfs_warn(mp, "empty zone %u has non-zero used counter (0x%x).", + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); + return false; + } + + *write_pointer = 0; + return true; +} + +static bool +xfs_zone_validate_wp( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + xfs_rtblock_t wp_fsb = xfs_daddr_to_rtb(mp, zone->wp); + + if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) { + xfs_warn(mp, "zone %u has too large used counter (0x%x).", + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); + return false; + } + + if (xfs_rtb_to_rgno(mp, wp_fsb) != rtg_rgno(rtg)) { + xfs_warn(mp, "zone %u write pointer (0x%llx) outside of zone.", + rtg_rgno(rtg), wp_fsb); + return false; + } + + *write_pointer = xfs_rtb_to_rgbno(mp, wp_fsb); + if (*write_pointer >= rtg->rtg_extents) { + xfs_warn(mp, "zone %u has invalid write pointer (0x%x).", + rtg_rgno(rtg), *write_pointer); + return false; + } + + return true; +} + +static bool +xfs_zone_validate_full( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) { + xfs_warn(mp, "zone %u has too large used counter (0x%x).", + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); + return false; + } + + *write_pointer = rtg->rtg_extents; + return true; +} + +static bool +xfs_zone_validate_seq( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + switch (zone->cond) { + case BLK_ZONE_COND_EMPTY: + return xfs_zone_validate_empty(zone, rtg, write_pointer); + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + return xfs_zone_validate_wp(zone, rtg, write_pointer); + case BLK_ZONE_COND_FULL: + return xfs_zone_validate_full(zone, rtg, write_pointer); + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + xfs_warn(mp, "zone %u has unsupported zone condition 0x%x.", + rtg_rgno(rtg), zone->cond); + return false; + default: + xfs_warn(mp, "zone %u has unknown zone condition 0x%x.", + rtg_rgno(rtg), zone->cond); + return false; + } +} + +static bool +xfs_zone_validate_conv( + struct blk_zone *zone, + struct xfs_rtgroup *rtg) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + switch (zone->cond) { + case BLK_ZONE_COND_NOT_WP: + return true; + default: + xfs_warn(mp, +"conventional zone %u has unsupported zone 
condition 0x%x.", + rtg_rgno(rtg), zone->cond); + return false; + } +} + +bool +xfs_zone_validate( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; + uint32_t expected_size; + + /* + * Check that the zone capacity matches the rtgroup size stored in the + * superblock. Note that all zones including the last one must have a + * uniform capacity. + */ + if (XFS_BB_TO_FSB(mp, zone->capacity) != g->blocks) { + xfs_warn(mp, +"zone %u capacity (0x%llx) does not match RT group size (0x%x).", + rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->capacity), + g->blocks); + return false; + } + + if (g->has_daddr_gaps) { + expected_size = 1 << g->blklog; + } else { + if (zone->len != zone->capacity) { + xfs_warn(mp, +"zone %u has capacity != size ((0x%llx vs 0x%llx)", + rtg_rgno(rtg), + XFS_BB_TO_FSB(mp, zone->len), + XFS_BB_TO_FSB(mp, zone->capacity)); + return false; + } + expected_size = g->blocks; + } + + if (XFS_BB_TO_FSB(mp, zone->len) != expected_size) { + xfs_warn(mp, +"zone %u length (0x%llx) does match geometry (0x%x).", + rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->len), + expected_size); + } + + switch (zone->type) { + case BLK_ZONE_TYPE_CONVENTIONAL: + return xfs_zone_validate_conv(zone, rtg); + case BLK_ZONE_TYPE_SEQWRITE_REQ: + return xfs_zone_validate_seq(zone, rtg, write_pointer); + default: + xfs_warn(mp, "zoned %u has unsupported type 0x%x.", + rtg_rgno(rtg), zone->type); + return false; + } +} diff --git a/fs/xfs/libxfs/xfs_zones.h b/fs/xfs/libxfs/xfs_zones.h new file mode 100644 index 000000000000..c4f1367b2cca --- /dev/null +++ b/fs/xfs/libxfs/xfs_zones.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LIBXFS_ZONES_H +#define _LIBXFS_ZONES_H + +struct xfs_rtgroup; + +/* + * In order to guarantee forward progress for GC we need to reserve at least + * two zones: one that will be used for moving data into and one spare zone + * making sure that we have enough space to relocate a nearly-full zone. + * To allow for slightly sloppy accounting for when we need to reserve the + * second zone, we actually reserve three as that is easier than doing fully + * accurate bookkeeping. + */ +#define XFS_GC_ZONES 3U + +/* + * In addition we need two zones for user writes, one open zone for writing + * and one to still have available blocks without resetting the open zone + * when data in the open zone has been freed. + */ +#define XFS_RESERVED_ZONES (XFS_GC_ZONES + 1) +#define XFS_MIN_ZONES (XFS_RESERVED_ZONES + 1) + +/* + * Always keep one zone out of the general open zone pool to allow for GC to + * happen while other writers are waiting for free space. 
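As a usage illustration only: xfs_zone_validate() is designed to be driven from a zone report taken at mount time. The sketch below feeds it from the block layer's blkdev_report_zones() callback; the example_* iterator and helpers are hypothetical, and the xfs_rtgroup_get()/xfs_rtgroup_put() lookups are assumed from the existing rtgroups code rather than being part of this diff. It also assumes the usual fs/xfs headers plus <linux/blkdev.h>.

struct example_zone_iter {
	struct xfs_mount	*mp;
};

static int
example_validate_zone_cb(
	struct blk_zone		*zone,
	unsigned int		idx,
	void			*data)
{
	struct example_zone_iter *iz = data;
	struct xfs_mount	*mp = iz->mp;
	struct xfs_rtgroup	*rtg;
	xfs_rgblock_t		write_pointer;
	int			error = 0;

	rtg = xfs_rtgroup_get(mp,
			xfs_rtb_to_rgno(mp, xfs_daddr_to_rtb(mp, zone->start)));
	if (!rtg)
		return 0;	/* zone past the last rtgroup */

	if (!xfs_zone_validate(zone, rtg, &write_pointer))
		error = -EFSCORRUPTED;

	/* a real implementation would record write_pointer for the allocator */

	xfs_rtgroup_put(rtg);
	return error;
}

/* driver: walk all zones of the RT device and validate each one */
static int
example_validate_zones(
	struct xfs_mount	*mp,
	struct block_device	*bdev)
{
	struct example_zone_iter iz = { .mp = mp };
	int			ret;

	ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
			example_validate_zone_cb, &iz);
	return ret < 0 ? ret : 0;
}

Mapping a failed validation to -EFSCORRUPTED lets the mount be refused instead of risking writes at a bogus write pointer.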
+ */ +#define XFS_OPEN_GC_ZONES 1U +#define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U) + +bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer); + +#endif /* _LIBXFS_ZONES_H */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 9f8c312dfd3c..303374df44bd 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -69,6 +69,8 @@ STATIC size_t xchk_superblock_ondisk_size( struct xfs_mount *mp) { + if (xfs_has_zoned(mp)) + return offsetofend(struct xfs_dsb, sb_rtreserved); if (xfs_has_metadir(mp)) return offsetofend(struct xfs_dsb, sb_pad); if (xfs_has_metauuid(mp)) diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 66da7d4d56ba..4f1e2574660d 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -1038,8 +1038,8 @@ xchk_bmap( switch (whichfork) { case XFS_COW_FORK: - /* No CoW forks on non-reflink filesystems. */ - if (!xfs_has_reflink(mp)) { + /* No CoW forks filesystem doesn't support out of place writes */ + if (!xfs_has_reflink(mp) && !xfs_has_zoned(mp)) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); return 0; } diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index ca23cf4db6c5..e629663e460a 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -350,7 +350,7 @@ retry: * The global incore space reservation is taken from the incore * counters, so leave that out of the computation. */ - fsc->fdblocks -= mp->m_resblks_avail; + fsc->fdblocks -= mp->m_free[XC_FREE_BLOCKS].res_avail; /* * Delayed allocation reservations are taken out of the incore counters @@ -413,7 +413,13 @@ xchk_fscount_count_frextents( fsc->frextents = 0; fsc->frextents_delayed = 0; - if (!xfs_has_realtime(mp)) + + /* + * Don't bother verifying and repairing the fs counters for zoned file + * systems as they don't track an on-disk frextents count, and the + * in-memory percpu counter also includes reservations. + */ + if (!xfs_has_realtime(mp) || xfs_has_zoned(mp)) return 0; while ((rtg = xfs_rtgroup_next(mp, rtg))) { @@ -513,8 +519,8 @@ xchk_fscounters( /* Snapshot the percpu counters. */ icount = percpu_counter_sum(&mp->m_icount); ifree = percpu_counter_sum(&mp->m_ifree); - fdblocks = percpu_counter_sum(&mp->m_fdblocks); - frextents = percpu_counter_sum(&mp->m_frextents); + fdblocks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS); + frextents = xfs_sum_freecounter_raw(mp, XC_FREE_RTEXTENTS); /* No negative values, please! 
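Several hunks in this series switch from dedicated percpu counters to an indexed array of free-space counters, each carrying its own reserve pool; the scrub code here sums them through that index. The following standalone sketch uses hypothetical names, not the kernel structures, just to show the shape being indexed into.

#include <stdint.h>
#include <stdio.h>

/* Illustrative model of indexed free counters with per-counter reserves. */
enum demo_free_counter { FC_BLOCKS, FC_RTEXTENTS, FC_RTAVAILABLE, FC_NR };

struct demo_free_pool {
	int64_t count;      /* free units of this resource */
	int64_t res_total;  /* size of the reserve pool */
	int64_t res_avail;  /* unused part of the reserve pool */
};

static struct demo_free_pool demo_free[FC_NR];

static int64_t demo_sum_freecounter(enum demo_free_counter c)
{
	return demo_free[c].count;   /* the kernel sums percpu state here */
}

int main(void)
{
	demo_free[FC_BLOCKS]    = (struct demo_free_pool){ 1000, 64, 64 };
	demo_free[FC_RTEXTENTS] = (struct demo_free_pool){ 500, 0, 0 };
	printf("free blocks: %lld, free rtextents: %lld\n",
	       (long long)demo_sum_freecounter(FC_BLOCKS),
	       (long long)demo_sum_freecounter(FC_RTEXTENTS));
	return 0;
}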
*/ if (icount < 0 || ifree < 0) @@ -589,15 +595,17 @@ xchk_fscounters( try_again = true; } - if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, - fsc->fdblocks)) { + if (!xchk_fscount_within_range(sc, fdblocks, + &mp->m_free[XC_FREE_BLOCKS].count, fsc->fdblocks)) { if (fsc->frozen) xchk_set_corrupt(sc); else try_again = true; } - if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, + if (!xfs_has_zoned(mp) && + !xchk_fscount_within_range(sc, frextents, + &mp->m_free[XC_FREE_RTEXTENTS].count, fsc->frextents - fsc->frextents_delayed)) { if (fsc->frozen) xchk_set_corrupt(sc); diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c index cda13447a373..f0d2b04644e4 100644 --- a/fs/xfs/scrub/fscounters_repair.c +++ b/fs/xfs/scrub/fscounters_repair.c @@ -64,7 +64,7 @@ xrep_fscounters( percpu_counter_set(&mp->m_icount, fsc->icount); percpu_counter_set(&mp->m_ifree, fsc->ifree); - percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks); + xfs_set_freecounter(mp, XC_FREE_BLOCKS, fsc->fdblocks); /* * Online repair is only supported on v5 file systems, which require @@ -74,10 +74,12 @@ xrep_fscounters( * track of the delalloc reservations separately, as they are are * subtracted from m_frextents, but not included in sb_frextents. */ - percpu_counter_set(&mp->m_frextents, - fsc->frextents - fsc->frextents_delayed); - if (!xfs_has_rtgroups(mp)) - mp->m_sb.sb_frextents = fsc->frextents; + if (!xfs_has_zoned(mp)) { + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, + fsc->frextents - fsc->frextents_delayed); + if (!xfs_has_rtgroups(mp)) + mp->m_sb.sb_frextents = fsc->frextents; + } return 0; } diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index db6edd5a5fe5..bb3f475b6353 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -273,6 +273,13 @@ xchk_inode_cowextsize( xfs_failaddr_t fa; uint32_t value = be32_to_cpu(dip->di_cowextsize); + /* + * The used block counter for rtrmap is checked and repaired elsewhere. 
+ */ + if (xfs_has_zoned(sc->mp) && + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) + return; + fa = xfs_inode_validate_cowextsize(sc->mp, value, mode, flags, flags2); if (fa) xchk_ino_set_corrupt(sc, ino); diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index 13ff1c933cb8..4299063ffe87 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -710,7 +710,9 @@ xrep_dinode_extsize_hints( XFS_DIFLAG_EXTSZINHERIT); } - if (dip->di_version < 3) + if (dip->di_version < 3 || + (xfs_has_zoned(sc->mp) && + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP))) return; fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index ac38f5843090..1588ce971cb8 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -62,7 +62,7 @@ xrep_newbt_estimate_slack( free = sc->sa.pag->pagf_freeblks; sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag)); } else { - free = percpu_counter_sum(&sc->mp->m_fdblocks); + free = xfs_sum_freecounter_raw(sc->mp, XC_FREE_BLOCKS); sz = sc->mp->m_sb.sb_dblocks; } diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index b32fb233cf84..8703897c0a9c 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -935,10 +935,13 @@ xrep_reap_metadir_fsblocks( if (error) return error; - if (xreap_dirty(&rs)) - return xrep_defer_finish(sc); + if (xreap_dirty(&rs)) { + error = xrep_defer_finish(sc); + if (error) + return error; + } - return 0; + return xrep_reset_metafile_resv(sc); } /* diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 3b5288d3ef4e..f8f9ed30f56b 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -43,6 +43,7 @@ #include "xfs_rtalloc.h" #include "xfs_metafile.h" #include "xfs_rtrefcount_btree.h" +#include "xfs_zone_alloc.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -1050,7 +1051,13 @@ xrep_require_rtext_inuse( xfs_rtxnum_t startrtx; xfs_rtxnum_t endrtx; bool is_free = false; - int error; + int error = 0; + + if (xfs_has_zoned(mp)) { + if (!xfs_zone_rgbno_is_valid(sc->sr.rtg, rgbno + len - 1)) + return -EFSCORRUPTED; + return 0; + } startrtx = xfs_rgbno_to_rtx(mp, rgbno); endrtx = xfs_rgbno_to_rtx(mp, rgbno + len - 1); @@ -1386,11 +1393,12 @@ int xrep_reset_metafile_resv( struct xfs_scrub *sc) { - struct xfs_inode *ip = sc->ip; + struct xfs_mount *mp = sc->mp; int64_t delta; int error; - delta = ip->i_nblocks + ip->i_delayed_blks - ip->i_meta_resv_asked; + delta = mp->m_metafile_resv_used + mp->m_metafile_resv_avail - + mp->m_metafile_resv_target; if (delta == 0) return 0; @@ -1401,11 +1409,11 @@ xrep_reset_metafile_resv( if (delta > 0) { int64_t give_back; - give_back = min_t(uint64_t, delta, ip->i_delayed_blks); + give_back = min_t(uint64_t, delta, mp->m_metafile_resv_avail); if (give_back > 0) { - xfs_mod_delalloc(ip, 0, -give_back); - xfs_add_fdblocks(ip->i_mount, give_back); - ip->i_delayed_blks -= give_back; + xfs_mod_sb_delalloc(mp, -give_back); + xfs_add_fdblocks(mp, give_back); + mp->m_metafile_resv_avail -= give_back; } return 0; @@ -1413,24 +1421,23 @@ xrep_reset_metafile_resv( /* * Not enough reservation; try to take some blocks from the filesystem - * to the metadata inode. @delta is negative here, so invert the sign. + * to the metabtree reservation. */ - delta = -delta; - error = xfs_dec_fdblocks(sc->mp, delta, true); + delta = -delta; /* delta is negative here, so invert the sign. 
*/ + error = xfs_dec_fdblocks(mp, delta, true); while (error == -ENOSPC) { delta--; if (delta == 0) { xfs_warn(sc->mp, -"Insufficient free space to reset space reservation for inode 0x%llx after repair.", - ip->i_ino); +"Insufficient free space to reset metabtree reservation after repair."); return 0; } - error = xfs_dec_fdblocks(sc->mp, delta, true); + error = xfs_dec_fdblocks(mp, delta, true); } if (error) return error; - xfs_mod_delalloc(ip, 0, delta); - ip->i_delayed_blks += delta; + xfs_mod_sb_delalloc(mp, delta); + mp->m_metafile_resv_avail += delta; return 0; } diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index e8c776a34c1d..d5ff8609dbfb 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -21,6 +21,7 @@ #include "xfs_rmap.h" #include "xfs_rtrmap_btree.h" #include "xfs_exchmaps.h" +#include "xfs_zone_alloc.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/repair.h" @@ -272,7 +273,6 @@ xchk_xref_is_used_rt_space( xfs_extlen_t len) { struct xfs_rtgroup *rtg = sc->sr.rtg; - struct xfs_inode *rbmip = rtg_bitmap(rtg); xfs_rtxnum_t startext; xfs_rtxnum_t endext; bool is_free; @@ -281,6 +281,13 @@ xchk_xref_is_used_rt_space( if (xchk_skip_xref(sc->sm)) return; + if (xfs_has_zoned(sc->mp)) { + if (!xfs_zone_rgbno_is_valid(rtg, + xfs_rtb_to_rgbno(sc->mp, rtbno) + len - 1)) + xchk_ino_xref_set_corrupt(sc, rtg_rmap(rtg)->i_ino); + return; + } + startext = xfs_rtb_to_rtx(sc->mp, rtbno); endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1); error = xfs_rtalloc_extent_is_free(rtg, sc->tp, startext, @@ -288,5 +295,5 @@ xchk_xref_is_used_rt_space( if (!xchk_should_check_xref(sc, &error, NULL)) return; if (is_free) - xchk_ino_xref_set_corrupt(sc, rbmip->i_ino); + xchk_ino_xref_set_corrupt(sc, rtg_bitmap(rtg)->i_ino); } diff --git a/fs/xfs/scrub/rtrefcount_repair.c b/fs/xfs/scrub/rtrefcount_repair.c index 257cfb24beb4..983362447826 100644 --- a/fs/xfs/scrub/rtrefcount_repair.c +++ b/fs/xfs/scrub/rtrefcount_repair.c @@ -697,32 +697,6 @@ err_cur: return error; } -/* - * Now that we've logged the roots of the new btrees, invalidate all of the - * old blocks and free them. - */ -STATIC int -xrep_rtrefc_remove_old_tree( - struct xrep_rtrefc *rr) -{ - int error; - - /* - * Free all the extents that were allocated to the former rtrefcountbt - * and aren't cross-linked with something else. - */ - error = xrep_reap_metadir_fsblocks(rr->sc, - &rr->old_rtrefcountbt_blocks); - if (error) - return error; - - /* - * Ensure the proper reservation for the rtrefcount inode so that we - * don't fail to expand the btree. - */ - return xrep_reset_metafile_resv(rr->sc); -} - /* Rebuild the rt refcount btree. */ int xrep_rtrefcountbt( @@ -769,8 +743,12 @@ xrep_rtrefcountbt( if (error) goto out_bitmap; - /* Kill the old tree. */ - error = xrep_rtrefc_remove_old_tree(rr); + /* + * Free all the extents that were allocated to the former rtrefcountbt + * and aren't cross-linked with something else. + */ + error = xrep_reap_metadir_fsblocks(rr->sc, + &rr->old_rtrefcountbt_blocks); if (error) goto out_bitmap; diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c index f2fdd7a9fc24..fc2592c53af5 100644 --- a/fs/xfs/scrub/rtrmap_repair.c +++ b/fs/xfs/scrub/rtrmap_repair.c @@ -810,28 +810,6 @@ err_cur: /* Reaping the old btree. */ -/* Reap the old rtrmapbt blocks. 
*/ -STATIC int -xrep_rtrmap_remove_old_tree( - struct xrep_rtrmap *rr) -{ - int error; - - /* - * Free all the extents that were allocated to the former rtrmapbt and - * aren't cross-linked with something else. - */ - error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks); - if (error) - return error; - - /* - * Ensure the proper reservation for the rtrmap inode so that we don't - * fail to expand the new btree. - */ - return xrep_reset_metafile_resv(rr->sc); -} - static inline bool xrep_rtrmapbt_want_live_update( struct xchk_iscan *iscan, @@ -995,8 +973,11 @@ xrep_rtrmapbt( if (error) goto out_records; - /* Kill the old tree. */ - error = xrep_rtrmap_remove_old_tree(rr); + /* + * Free all the extents that were allocated to the former rtrmapbt and + * aren't cross-linked with something else. + */ + error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks); if (error) goto out_records; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 6fa9e3e5bab7..9908850bf76f 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -399,12 +399,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { }, [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */ .type = ST_RTGROUP, + .has = xfs_has_nonzoned, .setup = xchk_setup_rtbitmap, .scrub = xchk_rtbitmap, .repair = xrep_rtbitmap, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ .type = ST_RTGROUP, + .has = xfs_has_nonzoned, .setup = xchk_setup_rtsummary, .scrub = xchk_rtsummary, .repair = xrep_rtsummary, diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 6d9965b546cb..26a04a783489 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * Copyright (c) 2016-2018 Christoph Hellwig. + * Copyright (c) 2016-2025 Christoph Hellwig. * All Rights Reserved. */ #include "xfs.h" @@ -20,6 +20,8 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_icache.h" +#include "xfs_zone_alloc.h" +#include "xfs_rtgroup.h" struct xfs_writepage_ctx { struct iomap_writepage_ctx ctx; @@ -77,6 +79,26 @@ xfs_setfilesize( return xfs_trans_commit(tp); } +static void +xfs_ioend_put_open_zones( + struct iomap_ioend *ioend) +{ + struct iomap_ioend *tmp; + + /* + * Put the open zone for all ioends merged into this one (if any). + */ + list_for_each_entry(tmp, &ioend->io_list, io_list) + xfs_open_zone_put(tmp->io_private); + + /* + * The main ioend might not have an open zone if the submission failed + * before xfs_zone_alloc_and_submit got called. + */ + if (ioend->io_private) + xfs_open_zone_put(ioend->io_private); +} + /* * IO write completion. */ @@ -86,6 +108,7 @@ xfs_end_ioend( { struct xfs_inode *ip = XFS_I(ioend->io_inode); struct xfs_mount *mp = ip->i_mount; + bool is_zoned = xfs_is_zoned_inode(ip); xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; unsigned int nofs_flag; @@ -115,10 +138,11 @@ xfs_end_ioend( */ error = blk_status_to_errno(ioend->io_bio.bi_status); if (unlikely(error)) { - if (ioend->io_flags & IOMAP_F_SHARED) { + if (ioend->io_flags & IOMAP_IOEND_SHARED) { + ASSERT(!is_zoned); xfs_reflink_cancel_cow_range(ip, offset, size, true); xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset, - offset + size); + offset + size, NULL); } goto done; } @@ -126,14 +150,21 @@ xfs_end_ioend( /* * Success: commit the COW or unwritten blocks if needed. 
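The xfs_ioend_put_open_zones helper added earlier in this file walks the ioends that were merged into the one being completed and drops one open-zone reference for each, plus the lead ioend's own reference when submission got far enough to take it. A toy standalone version of that walk, using plain pointers instead of the kernel list machinery:

#include <stdio.h>

struct demo_ioend {
	struct demo_ioend *next;	/* stand-in for the merged io_list */
	void *open_zone;		/* NULL if submission failed early */
};

static void demo_open_zone_put(void *zone)
{
	printf("dropping open zone reference %p\n", zone);
}

static void demo_put_open_zones(struct demo_ioend *lead)
{
	struct demo_ioend *t;

	for (t = lead->next; t; t = t->next)
		demo_open_zone_put(t->open_zone);
	if (lead->open_zone)
		demo_open_zone_put(lead->open_zone);
}

int main(void)
{
	int z1, z2;
	struct demo_ioend merged = { NULL, &z2 };
	struct demo_ioend lead = { &merged, &z1 };

	demo_put_open_zones(&lead);
	return 0;
}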
*/ - if (ioend->io_flags & IOMAP_F_SHARED) + if (is_zoned) + error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector, + ioend->io_private, NULLFSBLOCK); + else if (ioend->io_flags & IOMAP_IOEND_SHARED) error = xfs_reflink_end_cow(ip, offset, size); - else if (ioend->io_type == IOMAP_UNWRITTEN) + else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN) error = xfs_iomap_write_unwritten(ip, offset, size, false); - if (!error && xfs_ioend_is_append(ioend)) + if (!error && + !(ioend->io_flags & IOMAP_IOEND_DIRECT) && + xfs_ioend_is_append(ioend)) error = xfs_setfilesize(ip, offset, size); done: + if (is_zoned) + xfs_ioend_put_open_zones(ioend); iomap_finish_ioends(ioend, error); memalloc_nofs_restore(nofs_flag); } @@ -176,17 +207,27 @@ xfs_end_io( } } -STATIC void +void xfs_end_bio( struct bio *bio) { struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_mount *mp = ip->i_mount; unsigned long flags; + /* + * For Appends record the actually written block number and set the + * boundary flag if needed. + */ + if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) { + ioend->io_sector = bio->bi_iter.bi_sector; + xfs_mark_rtg_boundary(ioend); + } + spin_lock_irqsave(&ip->i_ioend_lock, flags); if (list_empty(&ip->i_ioend_list)) - WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue, + WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue, &ip->i_ioend_work)); list_add_tail(&ioend->io_list, &ip->i_ioend_list); spin_unlock_irqrestore(&ip->i_ioend_lock, flags); @@ -396,10 +437,11 @@ allocate_blocks: } static int -xfs_prepare_ioend( - struct iomap_ioend *ioend, +xfs_submit_ioend( + struct iomap_writepage_ctx *wpc, int status) { + struct iomap_ioend *ioend = wpc->ioend; unsigned int nofs_flag; /* @@ -410,7 +452,7 @@ xfs_prepare_ioend( nofs_flag = memalloc_nofs_save(); /* Convert CoW extents to regular */ - if (!status && (ioend->io_flags & IOMAP_F_SHARED)) { + if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) { status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), ioend->io_offset, ioend->io_size); } @@ -418,10 +460,14 @@ xfs_prepare_ioend( memalloc_nofs_restore(nofs_flag); /* send ioends that might require a transaction to the completion wq */ - if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN || - (ioend->io_flags & IOMAP_F_SHARED)) + if (xfs_ioend_is_append(ioend) || + (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))) ioend->io_bio.bi_end_io = xfs_end_bio; - return status; + + if (status) + return status; + submit_bio(&ioend->io_bio); + return 0; } /* @@ -458,12 +504,107 @@ xfs_discard_folio( * folio itself and not the start offset that is passed in. 
*/ xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos, - folio_pos(folio) + folio_size(folio)); + folio_pos(folio) + folio_size(folio), NULL); } static const struct iomap_writeback_ops xfs_writeback_ops = { .map_blocks = xfs_map_blocks, - .prepare_ioend = xfs_prepare_ioend, + .submit_ioend = xfs_submit_ioend, + .discard_folio = xfs_discard_folio, +}; + +struct xfs_zoned_writepage_ctx { + struct iomap_writepage_ctx ctx; + struct xfs_open_zone *open_zone; +}; + +static inline struct xfs_zoned_writepage_ctx * +XFS_ZWPC(struct iomap_writepage_ctx *ctx) +{ + return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx); +} + +static int +xfs_zoned_map_blocks( + struct iomap_writepage_ctx *wpc, + struct inode *inode, + loff_t offset, + unsigned int len) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + len); + xfs_filblks_t count_fsb; + struct xfs_bmbt_irec imap, del; + struct xfs_iext_cursor icur; + + if (xfs_is_shutdown(mp)) + return -EIO; + + XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS); + + /* + * All dirty data must be covered by delalloc extents. But truncate can + * remove delalloc extents underneath us or reduce their size. + * Returning a hole tells iomap to not write back any data from this + * range, which is the right thing to do in that case. + * + * Otherwise just tell iomap to treat ranges previously covered by a + * delalloc extent as mapped. The actual block allocation will be done + * just before submitting the bio. + * + * This implies we never map outside folios that are locked or marked + * as under writeback, and thus there is no need check the fork sequence + * count here. + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap)) + imap.br_startoff = end_fsb; /* fake a hole past EOF */ + if (imap.br_startoff > offset_fsb) { + imap.br_blockcount = imap.br_startoff - offset_fsb; + imap.br_startoff = offset_fsb; + imap.br_startblock = HOLESTARTBLOCK; + imap.br_state = XFS_EXT_NORM; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0); + return 0; + } + end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount); + count_fsb = end_fsb - offset_fsb; + + del = imap; + xfs_trim_extent(&del, offset_fsb, count_fsb); + xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del, + XFS_BMAPI_REMAP); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + wpc->iomap.type = IOMAP_MAPPED; + wpc->iomap.flags = IOMAP_F_DIRTY; + wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev; + wpc->iomap.offset = offset; + wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb); + wpc->iomap.flags = IOMAP_F_ANON_WRITE; + + trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length); + return 0; +} + +static int +xfs_zoned_submit_ioend( + struct iomap_writepage_ctx *wpc, + int status) +{ + wpc->ioend->io_bio.bi_end_io = xfs_end_bio; + if (status) + return status; + xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->open_zone); + return 0; +} + +static const struct iomap_writeback_ops xfs_zoned_writeback_ops = { + .map_blocks = xfs_zoned_map_blocks, + .submit_ioend = xfs_zoned_submit_ioend, .discard_folio = xfs_discard_folio, }; @@ -472,10 +613,25 @@ xfs_vm_writepages( struct address_space *mapping, struct writeback_control *wbc) { - struct xfs_writepage_ctx wpc = { }; + struct xfs_inode *ip = XFS_I(mapping->host); + + xfs_iflags_clear(ip, XFS_ITRUNCATED); - xfs_iflags_clear(XFS_I(mapping->host), 
XFS_ITRUNCATED); - return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops); + if (xfs_is_zoned_inode(ip)) { + struct xfs_zoned_writepage_ctx xc = { }; + int error; + + error = iomap_writepages(mapping, wbc, &xc.ctx, + &xfs_zoned_writeback_ops); + if (xc.open_zone) + xfs_open_zone_put(xc.open_zone); + return error; + } else { + struct xfs_writepage_ctx wpc = { }; + + return iomap_writepages(mapping, wbc, &wpc.ctx, + &xfs_writeback_ops); + } } STATIC int diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index e0bd68419764..5a7a0f1a0b49 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -9,6 +9,7 @@ extern const struct address_space_operations xfs_address_space_operations; extern const struct address_space_operations xfs_dax_aops; -int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); +int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); +void xfs_end_bio(struct bio *bio); #endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 0836fea2d6d8..06ca11731e43 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -30,6 +30,7 @@ #include "xfs_reflink.h" #include "xfs_rtbitmap.h" #include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" /* Kernel only BMAP related definitions and functions */ @@ -436,7 +437,8 @@ xfs_bmap_punch_delalloc_range( struct xfs_inode *ip, int whichfork, xfs_off_t start_byte, - xfs_off_t end_byte) + xfs_off_t end_byte, + struct xfs_zone_alloc_ctx *ac) { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); @@ -467,7 +469,21 @@ xfs_bmap_punch_delalloc_range( continue; } - xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del); + if (xfs_is_zoned_inode(ip) && ac) { + /* + * In a zoned buffered write context we need to return + * the punched delalloc allocations to the allocation + * context. This allows reusing them in the following + * iomap iterations. 
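As a tiny illustration of the bookkeeping described here (all names invented): punching a delalloc range during a zoned buffered write hands the blocks back to the per-write allocation context rather than to the global pool, so later iterations of the same write can consume them without reserving again.

#include <stdint.h>
#include <stdio.h>

struct demo_alloc_ctx {
	uint64_t reserved_blocks;
};

static void demo_punch_delalloc(struct demo_alloc_ctx *ac, uint64_t blocks)
{
	if (ac)
		ac->reserved_blocks += blocks;	/* keep them for this write */
	/* otherwise the blocks would go back to the global free pool */
}

int main(void)
{
	struct demo_alloc_ctx ac = { 4 };

	demo_punch_delalloc(&ac, 2);
	printf("reserved blocks now: %llu\n",
	       (unsigned long long)ac.reserved_blocks);
	return 0;
}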
+ */ + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, + &del, XFS_BMAPI_REMAP); + ac->reserved_blocks += del.br_blockcount; + } else { + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, + &del, 0); + } + if (!xfs_iext_get_extent(ifp, &icur, &got)) break; } @@ -582,7 +598,7 @@ xfs_free_eofblocks( if (ip->i_delayed_blks) { xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize), - LLONG_MAX); + LLONG_MAX, NULL); } xfs_inode_clear_eofblocks_tag(ip); return 0; @@ -825,7 +841,8 @@ int xfs_free_file_space( struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len) + xfs_off_t len, + struct xfs_zone_alloc_ctx *ac) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t startoffset_fsb; @@ -880,7 +897,7 @@ xfs_free_file_space( return 0; if (offset + len > XFS_ISIZE(ip)) len = XFS_ISIZE(ip) - offset; - error = xfs_zero_range(ip, offset, len, NULL); + error = xfs_zero_range(ip, offset, len, ac, NULL); if (error) return error; @@ -968,7 +985,8 @@ int xfs_collapse_file_space( struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len) + xfs_off_t len, + struct xfs_zone_alloc_ctx *ac) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; @@ -981,7 +999,7 @@ xfs_collapse_file_space( trace_xfs_collapse_file_space(ip); - error = xfs_free_file_space(ip, offset, len); + error = xfs_free_file_space(ip, offset, len, ac); if (error) return error; diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index b29760d36e1a..c477b3361630 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -15,6 +15,7 @@ struct xfs_inode; struct xfs_mount; struct xfs_trans; struct xfs_bmalloca; +struct xfs_zone_alloc_ctx; #ifdef CONFIG_XFS_RT int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); @@ -31,7 +32,8 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap) #endif /* CONFIG_XFS_RT */ void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork, - xfs_off_t start_byte, xfs_off_t end_byte); + xfs_off_t start_byte, xfs_off_t end_byte, + struct xfs_zone_alloc_ctx *ac); struct kgetbmap { __s64 bmv_offset; /* file offset of segment in blocks */ @@ -54,13 +56,13 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip, /* preallocation and hole punch interface */ int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len); + xfs_off_t len); int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len); + xfs_off_t len, struct xfs_zone_alloc_ctx *ac); int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, - xfs_off_t len); + xfs_off_t len, struct xfs_zone_alloc_ctx *ac); int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset, - xfs_off_t len); + xfs_off_t len); /* EOF block manipulation functions */ bool xfs_can_free_eofblocks(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 3f2403a7b49c..c1a306268ae4 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -844,7 +844,8 @@ xfs_ioc_trim( if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (mp->m_rtdev_targp && + + if (mp->m_rtdev_targp && !xfs_has_zoned(mp) && bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev)) rt_bdev = mp->m_rtdev_targp->bt_bdev; if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev) diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index ea43c9a6e54c..da3161572735 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -671,7 +671,7 @@ xfs_extent_busy_wait_all( while ((pag = xfs_perag_next(mp, pag))) 
xfs_extent_busy_wait_group(pag_group(pag)); - if (xfs_has_rtgroups(mp)) + if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) while ((rtg = xfs_rtgroup_next(mp, rtg))) xfs_extent_busy_wait_group(rtg_group(rtg)); } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index a25c713ff888..777438b853da 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -29,6 +29,7 @@ #include "xfs_inode.h" #include "xfs_rtbitmap.h" #include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" struct kmem_cache *xfs_efi_cache; struct kmem_cache *xfs_efd_cache; @@ -767,21 +768,35 @@ xfs_rtextent_free_finish_item( trace_xfs_extent_free_deferred(mp, xefi); - if (!(xefi->xefi_flags & XFS_EFI_CANCELLED)) { - if (*rtgp != to_rtg(xefi->xefi_group)) { - *rtgp = to_rtg(xefi->xefi_group); - xfs_rtgroup_lock(*rtgp, XFS_RTGLOCK_BITMAP); - xfs_rtgroup_trans_join(tp, *rtgp, - XFS_RTGLOCK_BITMAP); - } - error = xfs_rtfree_blocks(tp, *rtgp, - xefi->xefi_startblock, xefi->xefi_blockcount); + if (xefi->xefi_flags & XFS_EFI_CANCELLED) + goto done; + + if (*rtgp != to_rtg(xefi->xefi_group)) { + unsigned int lock_flags; + + if (xfs_has_zoned(mp)) + lock_flags = XFS_RTGLOCK_RMAP; + else + lock_flags = XFS_RTGLOCK_BITMAP; + + *rtgp = to_rtg(xefi->xefi_group); + xfs_rtgroup_lock(*rtgp, lock_flags); + xfs_rtgroup_trans_join(tp, *rtgp, lock_flags); } + + if (xfs_has_zoned(mp)) { + error = xfs_zone_free_blocks(tp, *rtgp, xefi->xefi_startblock, + xefi->xefi_blockcount); + } else { + error = xfs_rtfree_blocks(tp, *rtgp, xefi->xefi_startblock, + xefi->xefi_blockcount); + } + if (error == -EAGAIN) { xfs_efd_from_efi(efdp); return error; } - +done: xfs_efd_add_extent(efdp, xefi); xfs_extent_free_cancel_item(item); return error; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f7a7d89c345e..fe8cf9d96eb0 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -25,6 +25,8 @@ #include "xfs_iomap.h" #include "xfs_reflink.h" #include "xfs_file.h" +#include "xfs_aops.h" +#include "xfs_zone_alloc.h" #include <linux/dax.h> #include <linux/falloc.h> @@ -150,7 +152,7 @@ xfs_file_fsync( * ensure newly written file data make it to disk before logging the new * inode size in case of an extending write. 
*/ - if (XFS_IS_REALTIME_INODE(ip)) + if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp) error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev); else if (mp->m_logdev_targp != mp->m_ddev_targp) error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); @@ -360,7 +362,8 @@ xfs_file_write_zero_eof( struct iov_iter *from, unsigned int *iolock, size_t count, - bool *drained_dio) + bool *drained_dio, + struct xfs_zone_alloc_ctx *ac) { struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); loff_t isize; @@ -414,7 +417,7 @@ xfs_file_write_zero_eof( trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); + error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL); xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); return error; @@ -431,7 +434,8 @@ STATIC ssize_t xfs_file_write_checks( struct kiocb *iocb, struct iov_iter *from, - unsigned int *iolock) + unsigned int *iolock, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = iocb->ki_filp->f_mapping->host; size_t count = iov_iter_count(from); @@ -481,7 +485,7 @@ restart: */ if (iocb->ki_pos > i_size_read(inode)) { error = xfs_file_write_zero_eof(iocb, from, iolock, count, - &drained_dio); + &drained_dio, ac); if (error == 1) goto restart; if (error) @@ -491,6 +495,48 @@ restart: return kiocb_modified(iocb); } +static ssize_t +xfs_zoned_write_space_reserve( + struct xfs_inode *ip, + struct kiocb *iocb, + struct iov_iter *from, + unsigned int flags, + struct xfs_zone_alloc_ctx *ac) +{ + loff_t count = iov_iter_count(from); + int error; + + if (iocb->ki_flags & IOCB_NOWAIT) + flags |= XFS_ZR_NOWAIT; + + /* + * Check the rlimit and LFS boundary first so that we don't over-reserve + * by possibly a lot. + * + * The generic write path will redo this check later, and it might have + * changed by then. If it got expanded we'll stick to our earlier + * smaller limit, and if it is decreased the new smaller limit will be + * used and our extra space reservation will be returned after finishing + * the write. + */ + error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count); + if (error) + return error; + + /* + * Sloppily round up count to file system blocks. + * + * This will often reserve an extra block, but that avoids having to look + * at the start offset, which isn't stable for O_APPEND until taking the + * iolock. Also we need to reserve a block each for zeroing the old + * EOF block and the new start block if they are unaligned. + * + * Any remaining block will be returned after the write. 
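A quick worked example of the sloppy sizing used just below (plain arithmetic, nothing XFS-specific): round the byte count up to filesystem blocks, add one block because the start offset is not inspected, and two more for zeroing the old EOF block and the new start block.

#include <stdint.h>
#include <stdio.h>

static uint64_t demo_b_to_fsb(uint64_t bytes, uint64_t blocksize)
{
	return (bytes + blocksize - 1) / blocksize;	/* round up */
}

int main(void)
{
	uint64_t blocksize = 4096;
	uint64_t count = 10000;		/* bytes covered by the write */
	uint64_t rblocks = demo_b_to_fsb(count, blocksize) + 1 + 2;

	/* 10000 bytes -> 3 blocks, plus 1 + 2 slack -> reserve 6 blocks */
	printf("reserve %llu blocks for a %llu byte write\n",
	       (unsigned long long)rblocks, (unsigned long long)count);
	return 0;
}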
+ */ + return xfs_zoned_space_reserve(ip, + XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2, flags, ac); +} + static int xfs_dio_write_end_io( struct kiocb *iocb, @@ -503,6 +549,9 @@ xfs_dio_write_end_io( loff_t offset = iocb->ki_pos; unsigned int nofs_flag; + ASSERT(!xfs_is_zoned_inode(ip) || + !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW))); + trace_xfs_end_io_direct_write(ip, offset, size); if (xfs_is_shutdown(ip->i_mount)) @@ -582,14 +631,51 @@ static const struct iomap_dio_ops xfs_dio_write_ops = { .end_io = xfs_dio_write_end_io, }; +static void +xfs_dio_zoned_submit_io( + const struct iomap_iter *iter, + struct bio *bio, + loff_t file_offset) +{ + struct xfs_mount *mp = XFS_I(iter->inode)->i_mount; + struct xfs_zone_alloc_ctx *ac = iter->private; + xfs_filblks_t count_fsb; + struct iomap_ioend *ioend; + + count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size); + if (count_fsb > ac->reserved_blocks) { + xfs_err(mp, +"allocation (%lld) larger than reservation (%lld).", + count_fsb, ac->reserved_blocks); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + bio_io_error(bio); + return; + } + ac->reserved_blocks -= count_fsb; + + bio->bi_end_io = xfs_end_bio; + ioend = iomap_init_ioend(iter->inode, bio, file_offset, + IOMAP_IOEND_DIRECT); + xfs_zone_alloc_and_submit(ioend, &ac->open_zone); +} + +static const struct iomap_dio_ops xfs_dio_zoned_write_ops = { + .bio_set = &iomap_ioend_bioset, + .submit_io = xfs_dio_zoned_submit_io, + .end_io = xfs_dio_write_end_io, +}; + /* - * Handle block aligned direct I/O writes + * Handle block aligned direct I/O writes. */ static noinline ssize_t xfs_file_dio_write_aligned( struct xfs_inode *ip, struct kiocb *iocb, - struct iov_iter *from) + struct iov_iter *from, + const struct iomap_ops *ops, + const struct iomap_dio_ops *dops, + struct xfs_zone_alloc_ctx *ac) { unsigned int iolock = XFS_IOLOCK_SHARED; ssize_t ret; @@ -597,7 +683,7 @@ xfs_file_dio_write_aligned( ret = xfs_ilock_iocb_for_write(iocb, &iolock); if (ret) return ret; - ret = xfs_file_write_checks(iocb, from, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, ac); if (ret) goto out_unlock; @@ -611,11 +697,31 @@ xfs_file_dio_write_aligned( iolock = XFS_IOLOCK_SHARED; } trace_xfs_file_direct_write(iocb, from); - ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, - &xfs_dio_write_ops, 0, NULL, 0); + ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0); out_unlock: - if (iolock) - xfs_iunlock(ip, iolock); + xfs_iunlock(ip, iolock); + return ret; +} + +/* + * Handle block aligned direct I/O writes to zoned devices. 
+ */ +static noinline ssize_t +xfs_file_dio_write_zoned( + struct xfs_inode *ip, + struct kiocb *iocb, + struct iov_iter *from) +{ + struct xfs_zone_alloc_ctx ac = { }; + ssize_t ret; + + ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac); + if (ret < 0) + return ret; + ret = xfs_file_dio_write_aligned(ip, iocb, from, + &xfs_zoned_direct_write_iomap_ops, + &xfs_dio_zoned_write_ops, &ac); + xfs_zoned_space_unreserve(ip, &ac); return ret; } @@ -675,7 +781,7 @@ retry_exclusive: goto out_unlock; } - ret = xfs_file_write_checks(iocb, from, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); if (ret) goto out_unlock; @@ -721,9 +827,21 @@ xfs_file_dio_write( /* direct I/O must be aligned to device logical sector size */ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; - if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask) + + /* + * For always COW inodes we also must check the alignment of each + * individual iovec segment, as they could end up with different + * I/Os due to the way bio_iov_iter_get_pages works, and we'd + * then overwrite an already written block. + */ + if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) || + (xfs_is_always_cow_inode(ip) && + (iov_iter_alignment(from) & ip->i_mount->m_blockmask))) return xfs_file_dio_write_unaligned(ip, iocb, from); - return xfs_file_dio_write_aligned(ip, iocb, from); + if (xfs_is_zoned_inode(ip)) + return xfs_file_dio_write_zoned(ip, iocb, from); + return xfs_file_dio_write_aligned(ip, iocb, from, + &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL); } static noinline ssize_t @@ -740,7 +858,7 @@ xfs_file_dax_write( ret = xfs_ilock_iocb(iocb, iolock); if (ret) return ret; - ret = xfs_file_write_checks(iocb, from, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); if (ret) goto out; @@ -784,7 +902,7 @@ write_retry: if (ret) return ret; - ret = xfs_file_write_checks(iocb, from, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); if (ret) goto out; @@ -832,6 +950,67 @@ out: } STATIC ssize_t +xfs_file_buffered_write_zoned( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); + struct xfs_mount *mp = ip->i_mount; + unsigned int iolock = XFS_IOLOCK_EXCL; + bool cleared_space = false; + struct xfs_zone_alloc_ctx ac = { }; + ssize_t ret; + + ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac); + if (ret < 0) + return ret; + + ret = xfs_ilock_iocb(iocb, iolock); + if (ret) + goto out_unreserve; + + ret = xfs_file_write_checks(iocb, from, &iolock, &ac); + if (ret) + goto out_unlock; + + /* + * Truncate the iter to the length that we were actually able to + * allocate blocks for. This needs to happen after + * xfs_file_write_checks, because that assigns ki_pos for O_APPEND + * writes. + */ + iov_iter_truncate(from, + XFS_FSB_TO_B(mp, ac.reserved_blocks) - + (iocb->ki_pos & mp->m_blockmask)); + if (!iov_iter_count(from)) + goto out_unlock; + +retry: + trace_xfs_file_buffered_write(iocb, from); + ret = iomap_file_buffered_write(iocb, from, + &xfs_buffered_write_iomap_ops, &ac); + if (ret == -ENOSPC && !cleared_space) { + /* + * Kick off writeback to convert delalloc space and release the + * usually too pessimistic indirect block reservations. 
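For the truncation in xfs_file_buffered_write_zoned above, the write is clamped to the bytes actually covered by the reservation, measured from the start of the block containing the write position. The same arithmetic as a standalone example with made-up numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t blocksize = 4096, blockmask = blocksize - 1;
	uint64_t reserved_blocks = 6;
	uint64_t ki_pos = 12345;	/* byte offset of the write */
	uint64_t max_bytes = reserved_blocks * blocksize -
			     (ki_pos & blockmask);

	/* 6 blocks cover 24576 bytes; 4153 of them sit before ki_pos
	 * inside its block, so at most 20423 bytes can be written. */
	printf("can write at most %llu bytes at offset %llu\n",
	       (unsigned long long)max_bytes, (unsigned long long)ki_pos);
	return 0;
}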
+ */ + xfs_flush_inodes(mp); + cleared_space = true; + goto retry; + } + +out_unlock: + xfs_iunlock(ip, iolock); +out_unreserve: + xfs_zoned_space_unreserve(ip, &ac); + if (ret > 0) { + XFS_STATS_ADD(mp, xs_write_bytes, ret); + ret = generic_write_sync(iocb, ret); + } + return ret; +} + +STATIC ssize_t xfs_file_write_iter( struct kiocb *iocb, struct iov_iter *from) @@ -878,6 +1057,8 @@ xfs_file_write_iter( return ret; } + if (xfs_is_zoned_inode(ip)) + return xfs_file_buffered_write_zoned(iocb, from); return xfs_file_buffered_write(iocb, from); } @@ -932,7 +1113,8 @@ static int xfs_falloc_collapse_range( struct file *file, loff_t offset, - loff_t len) + loff_t len, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(file); loff_t new_size = i_size_read(inode) - len; @@ -948,7 +1130,7 @@ xfs_falloc_collapse_range( if (offset + len >= i_size_read(inode)) return -EINVAL; - error = xfs_collapse_file_space(XFS_I(inode), offset, len); + error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac); if (error) return error; return xfs_falloc_setsize(file, new_size); @@ -1004,7 +1186,8 @@ xfs_falloc_zero_range( struct file *file, int mode, loff_t offset, - loff_t len) + loff_t len, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(file); unsigned int blksize = i_blocksize(inode); @@ -1017,7 +1200,7 @@ xfs_falloc_zero_range( if (error) return error; - error = xfs_free_file_space(XFS_I(inode), offset, len); + error = xfs_free_file_space(XFS_I(inode), offset, len, ac); if (error) return error; @@ -1088,22 +1271,18 @@ xfs_falloc_allocate_range( FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE) STATIC long -xfs_file_fallocate( +__xfs_file_fallocate( struct file *file, int mode, loff_t offset, - loff_t len) + loff_t len, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(file); struct xfs_inode *ip = XFS_I(inode); long error; uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - if (mode & ~XFS_FALLOC_FL_SUPPORTED) - return -EOPNOTSUPP; - xfs_ilock(ip, iolock); error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); if (error) @@ -1124,16 +1303,16 @@ xfs_file_fallocate( switch (mode & FALLOC_FL_MODE_MASK) { case FALLOC_FL_PUNCH_HOLE: - error = xfs_free_file_space(ip, offset, len); + error = xfs_free_file_space(ip, offset, len, ac); break; case FALLOC_FL_COLLAPSE_RANGE: - error = xfs_falloc_collapse_range(file, offset, len); + error = xfs_falloc_collapse_range(file, offset, len, ac); break; case FALLOC_FL_INSERT_RANGE: error = xfs_falloc_insert_range(file, offset, len); break; case FALLOC_FL_ZERO_RANGE: - error = xfs_falloc_zero_range(file, mode, offset, len); + error = xfs_falloc_zero_range(file, mode, offset, len, ac); break; case FALLOC_FL_UNSHARE_RANGE: error = xfs_falloc_unshare_range(file, mode, offset, len); @@ -1154,6 +1333,54 @@ out_unlock: return error; } +static long +xfs_file_zoned_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct xfs_zone_alloc_ctx ac = { }; + struct xfs_inode *ip = XFS_I(file_inode(file)); + int error; + + error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac); + if (error) + return error; + error = __xfs_file_fallocate(file, mode, offset, len, &ac); + xfs_zoned_space_unreserve(ip, &ac); + return error; +} + +static long +xfs_file_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + if (mode & 
~XFS_FALLOC_FL_SUPPORTED) + return -EOPNOTSUPP; + + /* + * For zoned file systems, zeroing the first and last block of a hole + * punch requires allocating a new block to rewrite the remaining data + * and new zeroes out of place. Get a reservations for those before + * taking the iolock. Dip into the reserved pool because we are + * expected to be able to punch a hole even on a completely full + * file system. + */ + if (xfs_is_zoned_inode(XFS_I(inode)) && + (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE | + FALLOC_FL_COLLAPSE_RANGE))) + return xfs_file_zoned_fallocate(file, mode, offset, len); + return __xfs_file_fallocate(file, mode, offset, len, NULL); +} + STATIC int xfs_file_fadvise( struct file *file, @@ -1347,15 +1574,22 @@ xfs_file_release( * blocks. This avoids open/read/close workloads from removing EOF * blocks that other writers depend upon to reduce fragmentation. * + * Inodes on the zoned RT device never have preallocations, so skip + * taking the locks below. + */ + if (!inode->i_nlink || + !(file->f_mode & FMODE_WRITE) || + (ip->i_diflags & XFS_DIFLAG_APPEND) || + xfs_is_zoned_inode(ip)) + return 0; + + /* * If we can't get the iolock just skip truncating the blocks past EOF * because we could deadlock with the mmap_lock otherwise. We'll get * another chance to drop them once the last reference to the inode is * dropped, so we'll never leak blocks permanently. */ - if (inode->i_nlink && - (file->f_mode & FMODE_WRITE) && - !(ip->i_diflags & XFS_DIFLAG_APPEND) && - !xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) && + if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) && xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { if (xfs_can_free_eofblocks(ip) && !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED)) @@ -1472,9 +1706,10 @@ xfs_dax_read_fault( * i_lock (XFS - extent map serialisation) */ static vm_fault_t -xfs_write_fault( +__xfs_write_fault( struct vm_fault *vmf, - unsigned int order) + unsigned int order, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); @@ -1511,13 +1746,50 @@ xfs_write_fault( if (IS_DAX(inode)) ret = xfs_dax_fault_locked(vmf, order, true); else - ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops); + ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops, + ac); xfs_iunlock(ip, lock_mode); sb_end_pagefault(inode->i_sb); return ret; } +static vm_fault_t +xfs_write_fault_zoned( + struct vm_fault *vmf, + unsigned int order) +{ + struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file)); + unsigned int len = folio_size(page_folio(vmf->page)); + struct xfs_zone_alloc_ctx ac = { }; + int error; + vm_fault_t ret; + + /* + * This could over-allocate as it doesn't check for truncation. + * + * But as the overallocation is limited to less than a folio and will be + * release instantly that's just fine. 
+ */ + error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0, + &ac); + if (error < 0) + return vmf_fs_error(error); + ret = __xfs_write_fault(vmf, order, &ac); + xfs_zoned_space_unreserve(ip, &ac); + return ret; +} + +static vm_fault_t +xfs_write_fault( + struct vm_fault *vmf, + unsigned int order) +{ + if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file)))) + return xfs_write_fault_zoned(vmf, order); + return __xfs_write_fault(vmf, order, NULL); +} + static inline bool xfs_is_write_fault( struct vm_fault *vmf) @@ -1626,7 +1898,8 @@ const struct file_operations xfs_file_operations = { .fadvise = xfs_file_fadvise, .remap_file_range = xfs_file_remap_range, .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | - FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE, + FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE | + FOP_DONTCACHE, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 1dbd2d75f7ae..a4bc1642fe56 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -879,17 +879,39 @@ xfs_getfsmap_rtdev_rmapbt( struct xfs_mount *mp = tp->t_mountp; struct xfs_rtgroup *rtg = NULL; struct xfs_btree_cur *bt_cur = NULL; + xfs_daddr_t rtstart_daddr; xfs_rtblock_t start_rtb; xfs_rtblock_t end_rtb; xfs_rgnumber_t start_rg, end_rg; uint64_t eofs; int error = 0; - eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + mp->m_sb.sb_rblocks); if (keys[0].fmr_physical >= eofs) return 0; - start_rtb = xfs_daddr_to_rtb(mp, keys[0].fmr_physical); - end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical)); + + rtstart_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart); + if (keys[0].fmr_physical < rtstart_daddr) { + struct xfs_fsmap_irec frec = { + .owner = XFS_RMAP_OWN_FS, + .len_daddr = rtstart_daddr, + }; + + /* Adjust the low key if we are continuing from where we left off. */ + if (keys[0].fmr_length > 0) { + info->low_daddr = keys[0].fmr_physical + keys[0].fmr_length; + return 0; + } + + /* Fabricate an rmap entry for space occupied by the data dev */ + error = xfs_getfsmap_helper(tp, info, &frec); + if (error) + return error; + } + + start_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + keys[0].fmr_physical); + end_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + + min(eofs - 1, keys[1].fmr_physical)); info->missing_owner = XFS_FMR_OWN_FREE; @@ -1004,22 +1026,40 @@ xfs_getfsmap_rtdev_rmapbt( } #endif /* CONFIG_XFS_RT */ +static uint32_t +xfs_getfsmap_device( + struct xfs_mount *mp, + enum xfs_device dev) +{ + if (mp->m_sb.sb_rtstart) + return dev; + + switch (dev) { + case XFS_DEV_DATA: + return new_encode_dev(mp->m_ddev_targp->bt_dev); + case XFS_DEV_LOG: + return new_encode_dev(mp->m_logdev_targp->bt_dev); + case XFS_DEV_RT: + if (!mp->m_rtdev_targp) + break; + return new_encode_dev(mp->m_rtdev_targp->bt_dev); + } + + return -1; +} + /* Do we recognize the device? 
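xfs_getfsmap_device above picks between real dev_t values and small synthetic identifiers depending on whether the filesystem carries an internal RT section. A compact userspace sketch of that rule, with invented device numbers and names:

#include <stdint.h>
#include <stdio.h>

enum demo_dev { DEMO_DEV_DATA, DEMO_DEV_LOG, DEMO_DEV_RT };

static uint32_t demo_getfsmap_device(uint64_t rtstart, enum demo_dev dev,
				     const uint32_t real_devt[3])
{
	if (rtstart)		/* internal RT section: one physical device */
		return dev;	/* report a synthetic id instead of a dev_t */
	return real_devt[dev];
}

int main(void)
{
	uint32_t devts[3] = { 0x800010, 0x800020, 0x800030 };

	printf("external rt dev: 0x%x, internal rt dev: %u\n",
	       (unsigned)demo_getfsmap_device(0, DEMO_DEV_RT, devts),
	       (unsigned)demo_getfsmap_device(4096, DEMO_DEV_RT, devts));
	return 0;
}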
*/ STATIC bool xfs_getfsmap_is_valid_device( struct xfs_mount *mp, struct xfs_fsmap *fm) { - if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || - fm->fmr_device == new_encode_dev(mp->m_ddev_targp->bt_dev)) - return true; - if (mp->m_logdev_targp && - fm->fmr_device == new_encode_dev(mp->m_logdev_targp->bt_dev)) - return true; - if (mp->m_rtdev_targp && - fm->fmr_device == new_encode_dev(mp->m_rtdev_targp->bt_dev)) - return true; - return false; + return fm->fmr_device == 0 || + fm->fmr_device == UINT_MAX || + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_DATA) || + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_LOG) || + (mp->m_rtdev_targp && + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_RT)); } /* Ensure that the low key is less than the high key. */ @@ -1126,7 +1166,7 @@ xfs_getfsmap( /* Set up our device handlers. */ memset(handlers, 0, sizeof(handlers)); handlers[0].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); - handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev); + handlers[0].dev = xfs_getfsmap_device(mp, XFS_DEV_DATA); if (use_rmap) handlers[0].fn = xfs_getfsmap_datadev_rmapbt; else @@ -1134,13 +1174,17 @@ xfs_getfsmap( if (mp->m_logdev_targp != mp->m_ddev_targp) { handlers[1].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); - handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev); + handlers[1].dev = xfs_getfsmap_device(mp, XFS_DEV_LOG); handlers[1].fn = xfs_getfsmap_logdev; } #ifdef CONFIG_XFS_RT - if (mp->m_rtdev_targp) { + /* + * For zoned file systems there is no rtbitmap, so only support fsmap + * if the callers is privileged enough to use the full rmap version. + */ + if (mp->m_rtdev_targp && (use_rmap || !xfs_has_zoned(mp))) { handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); - handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev); + handlers[2].dev = xfs_getfsmap_device(mp, XFS_DEV_RT); if (use_rmap) handlers[2].fn = xfs_getfsmap_rtdev_rmapbt; else @@ -1230,7 +1274,13 @@ xfs_getfsmap( if (tp) xfs_trans_cancel(tp); - head->fmh_oflags = FMH_OF_DEV_T; + + /* + * For internal RT device we need to report different synthetic devices + * for a single physical device, and thus can't report the actual dev_t. + */ + if (!mp->m_sb.sb_rtstart) + head->fmh_oflags = FMH_OF_DEV_T; return error; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 455298503d01..ee2cefbd5df8 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -24,6 +24,7 @@ #include "xfs_rtalloc.h" #include "xfs_rtrmap_btree.h" #include "xfs_rtrefcount_btree.h" +#include "xfs_metafile.h" /* * Write new AG headers to disk. Non-transactional, but need to be @@ -307,6 +308,10 @@ xfs_growfs_data( if (!mutex_trylock(&mp->m_growlock)) return -EWOULDBLOCK; + /* we can't grow the data section when an internal RT section exists */ + if (in->newblocks != mp->m_sb.sb_dblocks && mp->m_sb.sb_rtstart) + return -EINVAL; + /* update imaxpct separately to the physical grow of the filesystem */ if (in->imaxpct != mp->m_sb.sb_imax_pct) { error = xfs_growfs_imaxpct(mp, in->imaxpct); @@ -366,6 +371,7 @@ xfs_growfs_log( int xfs_reserve_blocks( struct xfs_mount *mp, + enum xfs_free_counter ctr, uint64_t request) { int64_t lcounter, delta; @@ -373,6 +379,8 @@ xfs_reserve_blocks( int64_t free; int error = 0; + ASSERT(ctr < XC_FREE_NR); + /* * With per-cpu counters, this becomes an interesting problem. 
we need * to work out if we are freeing or allocation blocks first, then we can @@ -391,16 +399,16 @@ xfs_reserve_blocks( * counters directly since we shouldn't have any problems unreserving * space. */ - if (mp->m_resblks > request) { - lcounter = mp->m_resblks_avail - request; + if (mp->m_free[ctr].res_total > request) { + lcounter = mp->m_free[ctr].res_avail - request; if (lcounter > 0) { /* release unused blocks */ fdblks_delta = lcounter; - mp->m_resblks_avail -= lcounter; + mp->m_free[ctr].res_avail -= lcounter; } - mp->m_resblks = request; + mp->m_free[ctr].res_total = request; if (fdblks_delta) { spin_unlock(&mp->m_sb_lock); - xfs_add_fdblocks(mp, fdblks_delta); + xfs_add_freecounter(mp, ctr, fdblks_delta); spin_lock(&mp->m_sb_lock); } @@ -409,7 +417,7 @@ xfs_reserve_blocks( /* * If the request is larger than the current reservation, reserve the - * blocks before we update the reserve counters. Sample m_fdblocks and + * blocks before we update the reserve counters. Sample m_free and * perform a partial reservation if the request exceeds free space. * * The code below estimates how many blocks it can request from @@ -419,10 +427,10 @@ xfs_reserve_blocks( * space to fill it because mod_fdblocks will refill an undersized * reserve when it can. */ - free = percpu_counter_sum(&mp->m_fdblocks) - - xfs_fdblocks_unavailable(mp); - delta = request - mp->m_resblks; - mp->m_resblks = request; + free = xfs_sum_freecounter_raw(mp, ctr) - + xfs_freecounter_unavailable(mp, ctr); + delta = request - mp->m_free[ctr].res_total; + mp->m_free[ctr].res_total = request; if (delta > 0 && free > 0) { /* * We'll either succeed in getting space from the free block @@ -436,9 +444,9 @@ xfs_reserve_blocks( */ fdblks_delta = min(free, delta); spin_unlock(&mp->m_sb_lock); - error = xfs_dec_fdblocks(mp, fdblks_delta, 0); + error = xfs_dec_freecounter(mp, ctr, fdblks_delta, 0); if (!error) - xfs_add_fdblocks(mp, fdblks_delta); + xfs_add_freecounter(mp, ctr, fdblks_delta); spin_lock(&mp->m_sb_lock); } out: @@ -558,15 +566,13 @@ xfs_fs_reserve_ag_blocks( return error; } - if (xfs_has_realtime(mp)) { - err2 = xfs_rt_resv_init(mp); - if (err2 && err2 != -ENOSPC) { - xfs_warn(mp, - "Error %d reserving realtime metadata reserve pool.", err2); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - } + err2 = xfs_metafile_resv_init(mp); + if (err2 && err2 != -ENOSPC) { + xfs_warn(mp, + "Error %d reserving realtime metadata reserve pool.", err2); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - if (err2 && !error) + if (!error) error = err2; } @@ -582,9 +588,7 @@ xfs_fs_unreserve_ag_blocks( { struct xfs_perag *pag = NULL; - if (xfs_has_realtime(mp)) - xfs_rt_resv_free(mp); - + xfs_metafile_resv_free(mp); while ((pag = xfs_perag_next(mp, pag))) xfs_ag_resv_free(pag); } diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 3e2f73bcf831..9d23c361ef56 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -8,7 +8,8 @@ int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in); int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in); -int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request); +int xfs_reserve_blocks(struct xfs_mount *mp, enum xfs_free_counter cnt, + uint64_t request); int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags); int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 7b6c026d01a1..2f53ca7e12d4 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -2073,10 +2073,10 @@ 
xfs_inodegc_want_queue_rt_file( { struct xfs_mount *mp = ip->i_mount; - if (!XFS_IS_REALTIME_INODE(ip)) + if (!XFS_IS_REALTIME_INODE(ip) || xfs_has_zoned(mp)) return false; - if (__percpu_counter_compare(&mp->m_frextents, + if (xfs_compare_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_low_rtexts[XFS_LOWSP_5_PCNT], XFS_FDBLOCKS_BATCH) < 0) return true; @@ -2104,7 +2104,7 @@ xfs_inodegc_want_queue_work( if (items > mp->m_ino_geo.inodes_per_cluster) return true; - if (__percpu_counter_compare(&mp->m_fdblocks, + if (xfs_compare_freecounter(mp, XC_FREE_BLOCKS, mp->m_low_space[XFS_LOWSP_5_PCNT], XFS_FDBLOCKS_BATCH) < 0) return true; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b1f9f156ec88..7ded570e0191 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3074,5 +3074,6 @@ bool xfs_is_always_cow_inode( const struct xfs_inode *ip) { - return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount); + return xfs_is_zoned_inode(ip) || + (ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount)); } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index c08093a65352..4bb7a99e0dc4 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -25,19 +25,9 @@ struct xfs_dquot; typedef struct xfs_inode { /* Inode linking and identification information. */ struct xfs_mount *i_mount; /* fs mount struct ptr */ - union { - struct { - struct xfs_dquot *i_udquot; /* user dquot */ - struct xfs_dquot *i_gdquot; /* group dquot */ - struct xfs_dquot *i_pdquot; /* project dquot */ - }; - - /* - * Space that has been set aside to accomodate expansions of a - * metadata btree rooted in this file. - */ - uint64_t i_meta_resv_asked; - }; + struct xfs_dquot *i_udquot; /* user dquot */ + struct xfs_dquot *i_gdquot; /* group dquot */ + struct xfs_dquot *i_pdquot; /* project dquot */ /* Inode location stuff */ xfs_ino_t i_ino; /* inode number (agno/agino)*/ @@ -69,8 +59,13 @@ typedef struct xfs_inode { xfs_rfsblock_t i_nblocks; /* # of direct & btree blocks */ prid_t i_projid; /* owner's project id */ xfs_extlen_t i_extsize; /* basic/minimum extent size */ - /* cowextsize is only used for v3 inodes, flushiter for v1/2 */ + /* + * i_used_blocks is used for zoned rtrmap inodes, + * i_cowextsize is used for other v3 inodes, + * i_flushiter for v1/2 inodes + */ union { + uint32_t i_used_blocks; /* used blocks in RTG */ xfs_extlen_t i_cowextsize; /* basic cow extent size */ uint16_t i_flushiter; /* incremented on flush */ }; @@ -309,6 +304,11 @@ static inline bool xfs_is_internal_inode(const struct xfs_inode *ip) xfs_is_quota_inode(&mp->m_sb, ip->i_ino); } +static inline bool xfs_is_zoned_inode(const struct xfs_inode *ip) +{ + return xfs_has_zoned(ip->i_mount) && XFS_IS_REALTIME_INODE(ip); +} + bool xfs_is_always_cow_inode(const struct xfs_inode *ip); static inline bool xfs_is_cow_inode(const struct xfs_inode *ip) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 35803fcf0beb..40fc1bf900af 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -596,6 +596,7 @@ xfs_inode_to_log_dinode( to->di_changecount = inode_peek_iversion(inode); to->di_crtime = xfs_inode_to_log_dinode_ts(ip, ip->i_crtime); to->di_flags2 = ip->i_diflags2; + /* also covers the di_used_blocks union arm: */ to->di_cowextsize = ip->i_cowextsize; to->di_ino = ip->i_ino; to->di_lsn = lsn; diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index f3bfb814378c..7205fd14f6b3 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -203,6 +203,7 @@ 
xfs_log_dinode_to_disk( to->di_crtime = xfs_log_dinode_to_disk_ts(from, from->di_crtime); to->di_flags2 = cpu_to_be64(from->di_flags2); + /* also covers the di_used_blocks union arm: */ to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(from->di_ino); to->di_lsn = cpu_to_be64(lsn); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index ed85322507dd..d250f7f74e3b 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1131,15 +1131,15 @@ xfs_ioctl_getset_resblocks( error = mnt_want_write_file(filp); if (error) return error; - error = xfs_reserve_blocks(mp, fsop.resblks); + error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, fsop.resblks); mnt_drop_write_file(filp); if (error) return error; } spin_lock(&mp->m_sb_lock); - fsop.resblks = mp->m_resblks; - fsop.resblks_avail = mp->m_resblks_avail; + fsop.resblks = mp->m_free[XC_FREE_BLOCKS].res_total; + fsop.resblks_avail = mp->m_free[XC_FREE_BLOCKS].res_avail; spin_unlock(&mp->m_sb_lock); if (copy_to_user(arg, &fsop, sizeof(fsop))) @@ -1155,9 +1155,9 @@ xfs_ioctl_fs_counts( struct xfs_fsop_counts out = { .allocino = percpu_counter_read_positive(&mp->m_icount), .freeino = percpu_counter_read_positive(&mp->m_ifree), - .freedata = percpu_counter_read_positive(&mp->m_fdblocks) - - xfs_fdblocks_unavailable(mp), - .freertx = percpu_counter_read_positive(&mp->m_frextents), + .freedata = xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) - + xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS), + .freertx = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS), }; if (copy_to_user(uarg, &out, sizeof(out))) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index d61460309a78..30e257f683bb 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -30,6 +30,8 @@ #include "xfs_reflink.h" #include "xfs_health.h" #include "xfs_rtbitmap.h" +#include "xfs_icache.h" +#include "xfs_zone_alloc.h" #define XFS_ALLOC_ALIGN(mp, off) \ (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) @@ -431,13 +433,14 @@ xfs_quota_calc_throttle( static int64_t xfs_iomap_freesp( - struct percpu_counter *counter, + struct xfs_mount *mp, + unsigned int idx, uint64_t low_space[XFS_LOWSP_MAX], int *shift) { int64_t freesp; - freesp = percpu_counter_read_positive(counter); + freesp = xfs_estimate_freecounter(mp, idx); if (freesp < low_space[XFS_LOWSP_5_PCNT]) { *shift = 2; if (freesp < low_space[XFS_LOWSP_4_PCNT]) @@ -536,10 +539,10 @@ xfs_iomap_prealloc_size( if (unlikely(XFS_IS_REALTIME_INODE(ip))) freesp = xfs_rtbxlen_to_blen(mp, - xfs_iomap_freesp(&mp->m_frextents, + xfs_iomap_freesp(mp, XC_FREE_RTEXTENTS, mp->m_low_rtexts, &shift)); else - freesp = xfs_iomap_freesp(&mp->m_fdblocks, mp->m_low_space, + freesp = xfs_iomap_freesp(mp, XC_FREE_BLOCKS, mp->m_low_space, &shift); /* @@ -962,6 +965,59 @@ const struct iomap_ops xfs_direct_write_iomap_ops = { .iomap_begin = xfs_direct_write_iomap_begin, }; +#ifdef CONFIG_XFS_RT +/* + * This is really simple. The space has already been reserved before taking the + * IOLOCK, the actual block allocation is done just before submitting the bio + * and only recorded in the extent map on I/O completion. + */ +static int +xfs_zoned_direct_write_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap, + struct iomap *srcmap) +{ + struct xfs_inode *ip = XFS_I(inode); + int error; + + ASSERT(!(flags & IOMAP_OVERWRITE_ONLY)); + + /* + * Needs to be pushed down into the allocator so that only writes into + * a single zone can be supported. 
+ */ + if (flags & IOMAP_NOWAIT) + return -EAGAIN; + + /* + * Ensure the extent list is in memory in so that we don't have to do + * read it from the I/O completion handler. + */ + if (xfs_need_iread_extents(&ip->i_df)) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + return error; + } + + iomap->type = IOMAP_MAPPED; + iomap->flags = IOMAP_F_DIRTY; + iomap->bdev = ip->i_mount->m_rtdev_targp->bt_bdev; + iomap->offset = offset; + iomap->length = length; + iomap->flags = IOMAP_F_ANON_WRITE; + return 0; +} + +const struct iomap_ops xfs_zoned_direct_write_iomap_ops = { + .iomap_begin = xfs_zoned_direct_write_iomap_begin, +}; +#endif /* CONFIG_XFS_RT */ + static int xfs_dax_write_iomap_end( struct inode *inode, @@ -987,6 +1043,455 @@ const struct iomap_ops xfs_dax_write_iomap_ops = { .iomap_end = xfs_dax_write_iomap_end, }; +/* + * Convert a hole to a delayed allocation. + */ +static void +xfs_bmap_add_extent_hole_delay( + struct xfs_inode *ip, /* incore inode pointer */ + int whichfork, + struct xfs_iext_cursor *icur, + struct xfs_bmbt_irec *new) /* new data to add to file extents */ +{ + struct xfs_ifork *ifp; /* inode fork pointer */ + xfs_bmbt_irec_t left; /* left neighbor extent entry */ + xfs_filblks_t newlen=0; /* new indirect size */ + xfs_filblks_t oldlen=0; /* old indirect size */ + xfs_bmbt_irec_t right; /* right neighbor extent entry */ + uint32_t state = xfs_bmap_fork_to_state(whichfork); + xfs_filblks_t temp; /* temp for indirect calculations */ + + ifp = xfs_ifork_ptr(ip, whichfork); + ASSERT(isnullstartblock(new->br_startblock)); + + /* + * Check and set flags if this segment has a left neighbor + */ + if (xfs_iext_peek_prev_extent(ifp, icur, &left)) { + state |= BMAP_LEFT_VALID; + if (isnullstartblock(left.br_startblock)) + state |= BMAP_LEFT_DELAY; + } + + /* + * Check and set flags if the current (right) segment exists. + * If it doesn't exist, we're converting the hole at end-of-file. + */ + if (xfs_iext_get_extent(ifp, icur, &right)) { + state |= BMAP_RIGHT_VALID; + if (isnullstartblock(right.br_startblock)) + state |= BMAP_RIGHT_DELAY; + } + + /* + * Set contiguity flags on the left and right neighbors. + * Don't let extents get too large, even if the pieces are contiguous. + */ + if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && + left.br_startoff + left.br_blockcount == new->br_startoff && + left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) + state |= BMAP_LEFT_CONTIG; + + if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && + new->br_startoff + new->br_blockcount == right.br_startoff && + new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && + (!(state & BMAP_LEFT_CONTIG) || + (left.br_blockcount + new->br_blockcount + + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))) + state |= BMAP_RIGHT_CONTIG; + + /* + * Switch out based on the contiguity flags. + */ + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with delayed allocations + * on the left and on the right. + * Merge all three into a single extent record. 
+ */ + temp = left.br_blockcount + new->br_blockcount + + right.br_blockcount; + + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock) + + startblockval(right.br_startblock); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); + left.br_startblock = nullstartblock(newlen); + left.br_blockcount = temp; + + xfs_iext_remove(ip, icur, state); + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &left); + break; + + case BMAP_LEFT_CONTIG: + /* + * New allocation is contiguous with a delayed allocation + * on the left. + * Merge the new allocation with the left neighbor. + */ + temp = left.br_blockcount + new->br_blockcount; + + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); + left.br_blockcount = temp; + left.br_startblock = nullstartblock(newlen); + + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &left); + break; + + case BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with a delayed allocation + * on the right. + * Merge the new allocation with the right neighbor. + */ + temp = new->br_blockcount + right.br_blockcount; + oldlen = startblockval(new->br_startblock) + + startblockval(right.br_startblock); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); + right.br_startoff = new->br_startoff; + right.br_startblock = nullstartblock(newlen); + right.br_blockcount = temp; + xfs_iext_update_extent(ip, state, icur, &right); + break; + + case 0: + /* + * New allocation is not contiguous with another + * delayed allocation. + * Insert a new entry. + */ + oldlen = newlen = 0; + xfs_iext_insert(ip, icur, new, state); + break; + } + if (oldlen != newlen) { + ASSERT(oldlen > newlen); + xfs_add_fdblocks(ip->i_mount, oldlen - newlen); + + /* + * Nothing to do for disk quota accounting here. + */ + xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen); + } +} + +/* + * Add a delayed allocation extent to an inode. Blocks are reserved from the + * global pool and the extent inserted into the inode in-core extent tree. + * + * On entry, got refers to the first extent beyond the offset of the extent to + * allocate or eof is specified if no such extent exists. On return, got refers + * to the extent record that was inserted to the inode fork. + * + * Note that the allocated extent may have been merged with contiguous extents + * during insertion into the inode fork. Thus, got does not reflect the current + * state of the inode fork on return. If necessary, the caller can use lastx to + * look up the updated record in the inode fork. + */ +static int +xfs_bmapi_reserve_delalloc( + struct xfs_inode *ip, + int whichfork, + xfs_fileoff_t off, + xfs_filblks_t len, + xfs_filblks_t prealloc, + struct xfs_bmbt_irec *got, + struct xfs_iext_cursor *icur, + int eof) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + xfs_extlen_t alen; + xfs_extlen_t indlen; + uint64_t fdblocks; + int error; + xfs_fileoff_t aoff; + bool use_cowextszhint = + whichfork == XFS_COW_FORK && !prealloc; + +retry: + /* + * Cap the alloc length. Keep track of prealloc so we know whether to + * tag the inode before we return. 
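The delayed-allocation merge above applies the usual left/right contiguity rules before inserting the new record. As a rough illustration of just those rules (standalone C, reduced to offsets and lengths; struct ext and MERGE_* are illustrative names, and the indirect-length bookkeeping is left out):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_EXTLEN 2097151ULL	/* cap on a single mapping, like XFS_MAX_BMBT_EXTLEN */

struct ext {
	uint64_t off;	/* file offset, in blocks */
	uint64_t len;	/* length, in blocks */
	bool valid;	/* neighbour exists and is delalloc */
};

/* Which neighbours the new delalloc range ends up merged with. */
enum merge_case { MERGE_BOTH, MERGE_LEFT, MERGE_RIGHT, MERGE_NONE };

static enum merge_case merge_delalloc(struct ext *left, const struct ext *new,
				      struct ext *right)
{
	bool lcontig = left->valid &&
		left->off + left->len == new->off &&
		left->len + new->len <= MAX_EXTLEN;
	bool rcontig = right->valid &&
		new->off + new->len == right->off &&
		new->len + right->len <= MAX_EXTLEN &&
		(!lcontig ||
		 left->len + new->len + right->len <= MAX_EXTLEN);

	if (lcontig && rcontig) {		/* collapse all three records */
		left->len += new->len + right->len;
		return MERGE_BOTH;
	}
	if (lcontig) {				/* extend the left neighbour */
		left->len += new->len;
		return MERGE_LEFT;
	}
	if (rcontig) {				/* grow the right neighbour downwards */
		right->off = new->off;
		right->len += new->len;
		return MERGE_RIGHT;
	}
	return MERGE_NONE;			/* insert a standalone record */
}

int main(void)
{
	struct ext l = { .off = 0, .len = 10, .valid = true };
	struct ext n = { .off = 10, .len = 5 };
	struct ext r = { .off = 15, .len = 20, .valid = true };

	printf("case %d, left is now %llu blocks long\n",
	       merge_delalloc(&l, &n, &r), (unsigned long long)l.len);
	return 0;
}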
+ */ + aoff = off; + alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); + if (!eof) + alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); + if (prealloc && alen >= len) + prealloc = alen - len; + + /* + * If we're targetting the COW fork but aren't creating a speculative + * posteof preallocation, try to expand the reservation to align with + * the COW extent size hint if there's sufficient free space. + * + * Unlike the data fork, the CoW cancellation functions will free all + * the reservations at inactivation, so we don't require that every + * delalloc reservation have a dirty pagecache. + */ + if (use_cowextszhint) { + struct xfs_bmbt_irec prev; + xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip); + + if (!xfs_iext_peek_prev_extent(ifp, icur, &prev)) + prev.br_startoff = NULLFILEOFF; + + error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof, + 1, 0, &aoff, &alen); + ASSERT(!error); + } + + /* + * Make a transaction-less quota reservation for delayed allocation + * blocks. This number gets adjusted later. We return if we haven't + * allocated blocks already inside this loop. + */ + error = xfs_quota_reserve_blkres(ip, alen); + if (error) + goto out; + + /* + * Split changing sb for alen and indlen since they could be coming + * from different places. + */ + indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); + ASSERT(indlen > 0); + + fdblocks = indlen; + if (XFS_IS_REALTIME_INODE(ip)) { + ASSERT(!xfs_is_zoned_inode(ip)); + error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); + if (error) + goto out_unreserve_quota; + } else { + fdblocks += alen; + } + + error = xfs_dec_fdblocks(mp, fdblocks, false); + if (error) + goto out_unreserve_frextents; + + ip->i_delayed_blks += alen; + xfs_mod_delalloc(ip, alen, indlen); + + got->br_startoff = aoff; + got->br_startblock = nullstartblock(indlen); + got->br_blockcount = alen; + got->br_state = XFS_EXT_NORM; + + xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got); + + /* + * Tag the inode if blocks were preallocated. Note that COW fork + * preallocation can occur at the start or end of the extent, even when + * prealloc == 0, so we must also check the aligned offset and length. 
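The reservation step above charges two different pools: the data blocks themselves (fdblocks, or realtime extents for RT files) and the worst-case indirect blocks that may later be needed to convert the delalloc extent into a bmap btree. A minimal sketch of that accounting, including the unwind order on failure (standalone C; the counters and the worst_indlen() estimate are simplified placeholders, not the kernel's functions):

#include <stdint.h>
#include <stdio.h>
#include <errno.h>

static int64_t fdblocks = 1000;		/* free data/metadata blocks */
static int64_t rtextents = 500;		/* free realtime extents */
static int64_t quota_left = 800;	/* per-user block quota */

/* crude stand-in for a worst-case indirect block estimate */
static uint64_t worst_indlen(uint64_t alen)
{
	return (alen + 127) / 128 + 1;
}

static int take(int64_t *ctr, uint64_t n)
{
	if ((uint64_t)*ctr < n)
		return -ENOSPC;
	*ctr -= n;
	return 0;
}

static int reserve_delalloc(uint64_t alen, int realtime)
{
	uint64_t indlen = worst_indlen(alen);
	int error;

	error = take(&quota_left, alen);		/* quota first */
	if (error)
		return error;

	if (realtime) {
		error = take(&rtextents, alen);		/* data from the rt pool */
		if (error)
			goto out_quota;
		error = take(&fdblocks, indlen);	/* indirects always from fdblocks */
		if (error)
			goto out_rt;
	} else {
		error = take(&fdblocks, alen + indlen);	/* one combined update */
		if (error)
			goto out_quota;
	}
	return 0;

out_rt:
	rtextents += alen;
out_quota:
	quota_left += alen;
	return error;
}

int main(void)
{
	printf("reserve 256 blocks: %d (fdblocks now %lld)\n",
	       reserve_delalloc(256, 0), (long long)fdblocks);
	return 0;
}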
+ */ + if (whichfork == XFS_DATA_FORK && prealloc) + xfs_inode_set_eofblocks_tag(ip); + if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) + xfs_inode_set_cowblocks_tag(ip); + + return 0; + +out_unreserve_frextents: + if (XFS_IS_REALTIME_INODE(ip)) + xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); +out_unreserve_quota: + if (XFS_IS_QUOTA_ON(mp)) + xfs_quota_unreserve_blkres(ip, alen); +out: + if (error == -ENOSPC || error == -EDQUOT) { + trace_xfs_delalloc_enospc(ip, off, len); + + if (prealloc || use_cowextszhint) { + /* retry without any preallocation */ + use_cowextszhint = false; + prealloc = 0; + goto retry; + } + } + return error; +} + +static int +xfs_zoned_buffered_write_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t count, + unsigned flags, + struct iomap *iomap, + struct iomap *srcmap) +{ + struct iomap_iter *iter = + container_of(iomap, struct iomap_iter, iomap); + struct xfs_zone_alloc_ctx *ac = iter->private; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count); + u16 iomap_flags = IOMAP_F_SHARED; + unsigned int lockmode = XFS_ILOCK_EXCL; + xfs_filblks_t count_fsb; + xfs_extlen_t indlen; + struct xfs_bmbt_irec got; + struct xfs_iext_cursor icur; + int error = 0; + + ASSERT(!xfs_get_extsz_hint(ip)); + ASSERT(!(flags & IOMAP_UNSHARE)); + ASSERT(ac); + + if (xfs_is_shutdown(mp)) + return -EIO; + + error = xfs_qm_dqattach(ip); + if (error) + return error; + + error = xfs_ilock_for_iomap(ip, flags, &lockmode); + if (error) + return error; + + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, XFS_DATA_FORK); + error = -EFSCORRUPTED; + goto out_unlock; + } + + XFS_STATS_INC(mp, xs_blk_mapw); + + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; + + /* + * For zeroing operations check if there is any data to zero first. + * + * For regular writes we always need to allocate new blocks, but need to + * provide the source mapping when the range is unaligned to support + * read-modify-write of the whole block in the page cache. + * + * In either case we need to limit the reported range to the boundaries + * of the source map in the data fork. + */ + if (!IS_ALIGNED(offset, mp->m_sb.sb_blocksize) || + !IS_ALIGNED(offset + count, mp->m_sb.sb_blocksize) || + (flags & IOMAP_ZERO)) { + struct xfs_bmbt_irec smap; + struct xfs_iext_cursor scur; + + if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &scur, + &smap)) + smap.br_startoff = end_fsb; /* fake hole until EOF */ + if (smap.br_startoff > offset_fsb) { + /* + * We never need to allocate blocks for zeroing a hole. 
+ */ + if (flags & IOMAP_ZERO) { + xfs_hole_to_iomap(ip, iomap, offset_fsb, + smap.br_startoff); + goto out_unlock; + } + end_fsb = min(end_fsb, smap.br_startoff); + } else { + end_fsb = min(end_fsb, + smap.br_startoff + smap.br_blockcount); + xfs_trim_extent(&smap, offset_fsb, + end_fsb - offset_fsb); + error = xfs_bmbt_to_iomap(ip, srcmap, &smap, flags, 0, + xfs_iomap_inode_sequence(ip, 0)); + if (error) + goto out_unlock; + } + } + + if (!ip->i_cowfp) + xfs_ifork_init_cow(ip); + + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) + got.br_startoff = end_fsb; + if (got.br_startoff <= offset_fsb) { + trace_xfs_reflink_cow_found(ip, &got); + goto done; + } + + /* + * Cap the maximum length to keep the chunks of work done here somewhat + * symmetric with the work writeback does. + */ + end_fsb = min(end_fsb, got.br_startoff); + count_fsb = min3(end_fsb - offset_fsb, XFS_MAX_BMBT_EXTLEN, + XFS_B_TO_FSB(mp, 1024 * PAGE_SIZE)); + + /* + * The block reservation is supposed to cover all blocks that the + * operation could possible write, but there is a nasty corner case + * where blocks could be stolen from underneath us: + * + * 1) while this thread iterates over a larger buffered write, + * 2) another thread is causing a write fault that calls into + * ->page_mkwrite in range this thread writes to, using up the + * delalloc reservation created by a previous call to this function. + * 3) another thread does direct I/O on the range that the write fault + * happened on, which causes writeback of the dirty data. + * 4) this then set the stale flag, which cuts the current iomap + * iteration short, causing the new call to ->iomap_begin that gets + * us here again, but now without a sufficient reservation. + * + * This is a very unusual I/O pattern, and nothing but generic/095 is + * known to hit it. There's not really much we can do here, so turn this + * into a short write. 
+ */ + if (count_fsb > ac->reserved_blocks) { + xfs_warn_ratelimited(mp, +"Short write on ino 0x%llx comm %.20s due to three-way race with write fault and direct I/O", + ip->i_ino, current->comm); + count_fsb = ac->reserved_blocks; + if (!count_fsb) { + error = -EIO; + goto out_unlock; + } + } + + error = xfs_quota_reserve_blkres(ip, count_fsb); + if (error) + goto out_unlock; + + indlen = xfs_bmap_worst_indlen(ip, count_fsb); + error = xfs_dec_fdblocks(mp, indlen, false); + if (error) + goto out_unlock; + ip->i_delayed_blks += count_fsb; + xfs_mod_delalloc(ip, count_fsb, indlen); + + got.br_startoff = offset_fsb; + got.br_startblock = nullstartblock(indlen); + got.br_blockcount = count_fsb; + got.br_state = XFS_EXT_NORM; + xfs_bmap_add_extent_hole_delay(ip, XFS_COW_FORK, &icur, &got); + ac->reserved_blocks -= count_fsb; + iomap_flags |= IOMAP_F_NEW; + + trace_xfs_iomap_alloc(ip, offset, XFS_FSB_TO_B(mp, count_fsb), + XFS_COW_FORK, &got); +done: + error = xfs_bmbt_to_iomap(ip, iomap, &got, flags, iomap_flags, + xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED)); +out_unlock: + xfs_iunlock(ip, lockmode); + return error; +} + static int xfs_buffered_write_iomap_begin( struct inode *inode, @@ -1013,6 +1518,10 @@ xfs_buffered_write_iomap_begin( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_is_zoned_inode(ip)) + return xfs_zoned_buffered_write_iomap_begin(inode, offset, + count, flags, iomap, srcmap); + /* we can't use delayed allocations when using extent size hints */ if (xfs_get_extsz_hint(ip)) return xfs_direct_write_iomap_begin(inode, offset, count, @@ -1245,10 +1754,13 @@ xfs_buffered_write_delalloc_punch( loff_t length, struct iomap *iomap) { + struct iomap_iter *iter = + container_of(iomap, struct iomap_iter, iomap); + xfs_bmap_punch_delalloc_range(XFS_I(inode), (iomap->flags & IOMAP_F_SHARED) ? 
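The clamp above is the key contract of the zoned buffered write path: each iteration may only consume blocks that were reserved in the xfs_zone_alloc_ctx before the IOLOCK was taken, and a depleted context turns the remainder into a short write instead of blocking. A toy model of that bookkeeping (standalone C; the context struct mirrors only the reserved_blocks field used here):

#include <stdint.h>
#include <stdio.h>
#include <errno.h>

struct zone_alloc_ctx {
	uint64_t reserved_blocks;	/* blocks set aside before taking the IOLOCK */
};

/*
 * Map one chunk of a buffered write.  Returns the (possibly clamped) number
 * of blocks mapped, or a negative error when the reservation is fully gone.
 */
static int64_t map_buffered_chunk(struct zone_alloc_ctx *ac, uint64_t want_blocks)
{
	if (want_blocks > ac->reserved_blocks) {
		/* reservation was raided, e.g. by a racing write fault */
		want_blocks = ac->reserved_blocks;
		if (!want_blocks)
			return -EIO;	/* nothing left at all */
	}
	ac->reserved_blocks -= want_blocks;
	return (int64_t)want_blocks;
}

int main(void)
{
	struct zone_alloc_ctx ac = { .reserved_blocks = 96 };

	/* a 128-block iteration gets cut down to the 96 remaining blocks */
	printf("mapped %lld, %llu left\n",
	       (long long)map_buffered_chunk(&ac, 128),
	       (unsigned long long)ac.reserved_blocks);
	return 0;
}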
XFS_COW_FORK : XFS_DATA_FORK, - offset, offset + length); + offset, offset + length, iter->private); } static int @@ -1485,6 +1997,7 @@ xfs_zero_range( struct xfs_inode *ip, loff_t pos, loff_t len, + struct xfs_zone_alloc_ctx *ac, bool *did_zero) { struct inode *inode = VFS_I(ip); @@ -1495,13 +2008,14 @@ xfs_zero_range( return dax_zero_range(inode, pos, len, did_zero, &xfs_dax_write_iomap_ops); return iomap_zero_range(inode, pos, len, did_zero, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, ac); } int xfs_truncate_page( struct xfs_inode *ip, loff_t pos, + struct xfs_zone_alloc_ctx *ac, bool *did_zero) { struct inode *inode = VFS_I(ip); @@ -1510,5 +2024,5 @@ xfs_truncate_page( return dax_truncate_page(inode, pos, did_zero, &xfs_dax_write_iomap_ops); return iomap_truncate_page(inode, pos, did_zero, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, ac); } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 8347268af727..d330c4a581b1 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -10,6 +10,7 @@ struct xfs_inode; struct xfs_bmbt_irec; +struct xfs_zone_alloc_ctx; int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, unsigned int flags, @@ -24,8 +25,9 @@ int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap, u16 iomap_flags, u64 sequence_cookie); int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len, - bool *did_zero); -int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero); + struct xfs_zone_alloc_ctx *ac, bool *did_zero); +int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, + struct xfs_zone_alloc_ctx *ac, bool *did_zero); static inline xfs_filblks_t xfs_aligned_fsb_count( @@ -49,6 +51,7 @@ xfs_aligned_fsb_count( extern const struct iomap_ops xfs_buffered_write_iomap_ops; extern const struct iomap_ops xfs_direct_write_iomap_ops; +extern const struct iomap_ops xfs_zoned_direct_write_iomap_ops; extern const struct iomap_ops xfs_read_iomap_ops; extern const struct iomap_ops xfs_seek_iomap_ops; extern const struct iomap_ops xfs_xattr_iomap_ops; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 40289fe6f5b2..444193f543ef 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -29,6 +29,7 @@ #include "xfs_xattr.h" #include "xfs_file.h" #include "xfs_bmap.h" +#include "xfs_zone_alloc.h" #include <linux/posix_acl.h> #include <linux/security.h> @@ -854,6 +855,7 @@ xfs_setattr_size( uint lock_flags = 0; uint resblks = 0; bool did_zeroing = false; + struct xfs_zone_alloc_ctx ac = { }; xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); ASSERT(S_ISREG(inode->i_mode)); @@ -890,6 +892,28 @@ xfs_setattr_size( inode_dio_wait(inode); /* + * Normally xfs_zoned_space_reserve is supposed to be called outside the + * IOLOCK. For truncate we can't do that since ->setattr is called with + * it already held by the VFS. So for now chicken out and try to + * allocate space under it. + * + * To avoid deadlocks this means we can't block waiting for space, which + * can lead to spurious -ENOSPC if there are no directly available + * blocks. We mitigate this a bit by allowing zeroing to dip into the + * reserved pool, but eventually the VFS calling convention needs to + * change. 
+ */ + if (xfs_is_zoned_inode(ip)) { + error = xfs_zoned_space_reserve(ip, 1, + XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac); + if (error) { + if (error == -EAGAIN) + return -ENOSPC; + return error; + } + } + + /* * File data changes must be complete before we start the transaction to * modify the inode. This needs to be done before joining the inode to * the transaction because the inode cannot be unlocked once it is a @@ -902,11 +926,14 @@ xfs_setattr_size( if (newsize > oldsize) { trace_xfs_zero_eof(ip, oldsize, newsize - oldsize); error = xfs_zero_range(ip, oldsize, newsize - oldsize, - &did_zeroing); + &ac, &did_zeroing); } else { - error = xfs_truncate_page(ip, newsize, &did_zeroing); + error = xfs_truncate_page(ip, newsize, &ac, &did_zeroing); } + if (xfs_is_zoned_inode(ip)) + xfs_zoned_space_unreserve(ip, &ac); + if (error) return error; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f8851ff835de..6493bdb57351 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -20,6 +20,7 @@ #include "xfs_sysfs.h" #include "xfs_sb.h" #include "xfs_health.h" +#include "xfs_zone_alloc.h" struct kmem_cache *xfs_log_ticket_cache; @@ -3540,6 +3541,9 @@ xlog_force_shutdown( spin_unlock(&log->l_icloglock); wake_up_var(&log->l_opstate); + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(log->l_mp)) + xfs_zoned_wake_all(log->l_mp); + return log_error; } diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 6ed485ff2756..15d410d16bb2 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -173,6 +173,10 @@ xfs_warn_experimental( .opstate = XFS_OPSTATE_WARNED_METADIR, .name = "metadata directory tree", }, + [XFS_EXPERIMENTAL_ZONED] = { + .opstate = XFS_OPSTATE_WARNED_ZONED, + .name = "zoned RT device", + }, }; ASSERT(feat >= 0 && feat < XFS_EXPERIMENTAL_MAX); BUILD_BUG_ON(ARRAY_SIZE(features) != XFS_EXPERIMENTAL_MAX); diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 7fb36ced9df7..a92a4d09c8e9 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -99,6 +99,7 @@ enum xfs_experimental_feat { XFS_EXPERIMENTAL_EXCHRANGE, XFS_EXPERIMENTAL_PPTR, XFS_EXPERIMENTAL_METADIR, + XFS_EXPERIMENTAL_ZONED, XFS_EXPERIMENTAL_MAX, }; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b69356582b86..e65a659901d5 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -40,6 +40,7 @@ #include "xfs_rtrmap_btree.h" #include "xfs_rtrefcount_btree.h" #include "scrub/stats.h" +#include "xfs_zone_alloc.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; @@ -461,22 +462,38 @@ xfs_mount_reset_sbqflags( return xfs_sync_sb(mp, false); } +static const char *const xfs_free_pool_name[] = { + [XC_FREE_BLOCKS] = "free blocks", + [XC_FREE_RTEXTENTS] = "free rt extents", + [XC_FREE_RTAVAILABLE] = "available rt extents", +}; + uint64_t -xfs_default_resblks(xfs_mount_t *mp) +xfs_default_resblks( + struct xfs_mount *mp, + enum xfs_free_counter ctr) { - uint64_t resblks; - - /* - * We default to 5% or 8192 fsbs of space reserved, whichever is - * smaller. This is intended to cover concurrent allocation - * transactions when we initially hit enospc. These each require a 4 - * block reservation. Hence by default we cover roughly 2000 concurrent - * allocation reservations. - */ - resblks = mp->m_sb.sb_dblocks; - do_div(resblks, 20); - resblks = min_t(uint64_t, resblks, 8192); - return resblks; + switch (ctr) { + case XC_FREE_BLOCKS: + /* + * Default to 5% or 8192 FSBs of space reserved, whichever is + * smaller. 
+ * + * This is intended to cover concurrent allocation transactions + * when we initially hit ENOSPC. These each require a 4 block + * reservation. Hence by default we cover roughly 2000 + * concurrent allocation reservations. + */ + return min(div_u64(mp->m_sb.sb_dblocks, 20), 8192ULL); + case XC_FREE_RTEXTENTS: + case XC_FREE_RTAVAILABLE: + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) + return xfs_zoned_default_resblks(mp, ctr); + return 0; + default: + ASSERT(0); + return 0; + } } /* Ensure the summary counts are correct. */ @@ -543,7 +560,7 @@ xfs_check_summary_counts( * If we're mounting the rt volume after recovering the log, recompute * frextents from the rtbitmap file to fix the inconsistency. */ - if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) { + if (xfs_has_realtime(mp) && !xfs_has_zoned(mp) && !xfs_is_clean(mp)) { error = xfs_rtalloc_reinit_frextents(mp); if (error) return error; @@ -678,6 +695,7 @@ xfs_mountfs( uint quotamount = 0; uint quotaflags = 0; int error = 0; + int i; xfs_sb_mount_common(mp, sbp); @@ -747,27 +765,15 @@ xfs_mountfs( /* enable fail_at_unmount as default */ mp->m_fail_unmount = true; - super_set_sysfs_name_id(mp->m_super); - - error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, - NULL, mp->m_super->s_id); - if (error) - goto out; - - error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype, - &mp->m_kobj, "stats"); + error = xfs_mount_sysfs_init(mp); if (error) - goto out_remove_sysfs; + goto out_remove_scrub_stats; xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs); - error = xfs_error_sysfs_init(mp); - if (error) - goto out_remove_scrub_stats; - error = xfs_errortag_init(mp); if (error) - goto out_remove_error_sysfs; + goto out_remove_sysfs; error = xfs_uuid_mount(mp); if (error) @@ -1031,6 +1037,12 @@ xfs_mountfs( if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp)) xfs_log_clean(mp); + if (xfs_has_zoned(mp)) { + error = xfs_mount_zones(mp); + if (error) + goto out_rtunmount; + } + /* * Complete the quota initialisation, post-log-replay component. */ @@ -1046,22 +1058,28 @@ xfs_mountfs( * privileged transactions. This is needed so that transaction * space required for critical operations can dip into this pool * when at ENOSPC. This is needed for operations like create with - * attr, unwritten extent conversion at ENOSPC, etc. Data allocations - * are not allowed to use this reserved space. + * attr, unwritten extent conversion at ENOSPC, garbage collection + * etc. Data allocations are not allowed to use this reserved space. * * This may drive us straight to ENOSPC on mount, but that implies * we were already there on the last unmount. Warn if this occurs. */ if (!xfs_is_readonly(mp)) { - error = xfs_reserve_blocks(mp, xfs_default_resblks(mp)); - if (error) - xfs_warn(mp, - "Unable to allocate reserve blocks. Continuing without reserve pool."); + for (i = 0; i < XC_FREE_NR; i++) { + error = xfs_reserve_blocks(mp, i, + xfs_default_resblks(mp, i)); + if (error) + xfs_warn(mp, +"Unable to allocate reserve blocks. Continuing without reserve pool for %s.", + xfs_free_pool_name[i]); + } /* Reserve AG blocks for future btree expansion. 
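With reservations now kept per free-space pool, the mount path sizes each pool independently: the block pool keeps the historical 5%-capped-at-8192 default, while the realtime pools are only reserved on zoned filesystems, where garbage collection needs guaranteed space. A small sketch of that sizing rule (standalone C; the zoned default is an illustrative constant, not what xfs_zoned_default_resblks() actually computes):

#include <stdint.h>
#include <stdio.h>

enum free_counter { FREE_BLOCKS, FREE_RTEXTENTS, FREE_RTAVAILABLE, FREE_NR };

struct fake_mount {
	uint64_t dblocks;	/* data device size in filesystem blocks */
	int zoned;		/* zoned RT device present */
};

static uint64_t default_resblks(const struct fake_mount *mp, enum free_counter ctr)
{
	switch (ctr) {
	case FREE_BLOCKS: {
		/* 5% of the data device, but never more than 8192 blocks */
		uint64_t five_pct = mp->dblocks / 20;

		return five_pct < 8192 ? five_pct : 8192;
	}
	case FREE_RTEXTENTS:
	case FREE_RTAVAILABLE:
		/* non-zoned filesystems keep no rt reserve at all */
		return mp->zoned ? 4096 : 0;	/* placeholder zoned default */
	default:
		return 0;
	}
}

int main(void)
{
	struct fake_mount mp = { .dblocks = 100000, .zoned = 1 };

	for (int i = 0; i < FREE_NR; i++)
		printf("pool %d reserve: %llu\n", i,
		       (unsigned long long)default_resblks(&mp, i));
	return 0;
}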
*/ error = xfs_fs_reserve_ag_blocks(mp); if (error && error != -ENOSPC) goto out_agresv; + + xfs_zone_gc_start(mp); } return 0; @@ -1069,6 +1087,8 @@ xfs_mountfs( out_agresv: xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); + if (xfs_has_zoned(mp)) + xfs_unmount_zones(mp); out_rtunmount: xfs_rtunmount_inodes(mp); out_rele_rip: @@ -1116,13 +1136,10 @@ xfs_mountfs( xfs_uuid_unmount(mp); out_remove_errortag: xfs_errortag_del(mp); - out_remove_error_sysfs: - xfs_error_sysfs_del(mp); + out_remove_sysfs: + xfs_mount_sysfs_del(mp); out_remove_scrub_stats: xchk_stats_unregister(mp->m_scrub_stats); - xfs_sysfs_del(&mp->m_stats.xs_kobj); - out_remove_sysfs: - xfs_sysfs_del(&mp->m_kobj); out: return error; } @@ -1148,8 +1165,12 @@ xfs_unmountfs( xfs_inodegc_flush(mp); xfs_blockgc_stop(mp); + if (!test_bit(XFS_OPSTATE_READONLY, &mp->m_opstate)) + xfs_zone_gc_stop(mp); xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); + if (xfs_has_zoned(mp)) + xfs_unmount_zones(mp); xfs_rtunmount_inodes(mp); xfs_irele(mp->m_rootip); if (mp->m_metadirip) @@ -1173,7 +1194,7 @@ xfs_unmountfs( * we only every apply deltas to the superblock and hence the incore * value does not matter.... */ - error = xfs_reserve_blocks(mp, 0); + error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, 0); if (error) xfs_warn(mp, "Unable to free reserved block pool. " "Freespace may not be correct on next mount."); @@ -1195,10 +1216,8 @@ xfs_unmountfs( xfs_free_rtgroups(mp, 0, mp->m_sb.sb_rgcount); xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount); xfs_errortag_del(mp); - xfs_error_sysfs_del(mp); xchk_stats_unregister(mp->m_scrub_stats); - xfs_sysfs_del(&mp->m_stats.xs_kobj); - xfs_sysfs_del(&mp->m_kobj); + xfs_mount_sysfs_del(mp); } /* @@ -1220,52 +1239,67 @@ xfs_fs_writable( return true; } +/* + * Estimate the amount of free space that is not available to userspace and is + * not explicitly reserved from the incore fdblocks. This includes: + * + * - The minimum number of blocks needed to support splitting a bmap btree + * - The blocks currently in use by the freespace btrees because they record + * the actual blocks that will fill per-AG metadata space reservations + */ +uint64_t +xfs_freecounter_unavailable( + struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + if (ctr != XC_FREE_BLOCKS) + return 0; + return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); +} + void xfs_add_freecounter( struct xfs_mount *mp, - struct percpu_counter *counter, + enum xfs_free_counter ctr, uint64_t delta) { - bool has_resv_pool = (counter == &mp->m_fdblocks); + struct xfs_freecounter *counter = &mp->m_free[ctr]; uint64_t res_used; /* * If the reserve pool is depleted, put blocks back into it first. * Most of the time the pool is full. */ - if (!has_resv_pool || mp->m_resblks == mp->m_resblks_avail) { - percpu_counter_add(counter, delta); + if (likely(counter->res_avail == counter->res_total)) { + percpu_counter_add(&counter->count, delta); return; } spin_lock(&mp->m_sb_lock); - res_used = mp->m_resblks - mp->m_resblks_avail; + res_used = counter->res_total - counter->res_avail; if (res_used > delta) { - mp->m_resblks_avail += delta; + counter->res_avail += delta; } else { delta -= res_used; - mp->m_resblks_avail = mp->m_resblks; - percpu_counter_add(counter, delta); + counter->res_avail = counter->res_total; + percpu_counter_add(&counter->count, delta); } spin_unlock(&mp->m_sb_lock); } + +/* Adjust in-core free blocks or RT extents. 
*/ int xfs_dec_freecounter( struct xfs_mount *mp, - struct percpu_counter *counter, + enum xfs_free_counter ctr, uint64_t delta, bool rsvd) { - int64_t lcounter; - uint64_t set_aside = 0; + struct xfs_freecounter *counter = &mp->m_free[ctr]; s32 batch; - bool has_resv_pool; - ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents); - has_resv_pool = (counter == &mp->m_fdblocks); - if (rsvd) - ASSERT(has_resv_pool); + ASSERT(ctr < XC_FREE_NR); /* * Taking blocks away, need to be more accurate the closer we @@ -1275,7 +1309,7 @@ xfs_dec_freecounter( * then make everything serialise as we are real close to * ENOSPC. */ - if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH, + if (__percpu_counter_compare(&counter->count, 2 * XFS_FDBLOCKS_BATCH, XFS_FDBLOCKS_BATCH) < 0) batch = 1; else @@ -1292,34 +1326,34 @@ xfs_dec_freecounter( * problems (i.e. transaction abort, pagecache discards, etc.) than * slightly premature -ENOSPC. */ - if (has_resv_pool) - set_aside = xfs_fdblocks_unavailable(mp); - percpu_counter_add_batch(counter, -((int64_t)delta), batch); - if (__percpu_counter_compare(counter, set_aside, - XFS_FDBLOCKS_BATCH) >= 0) { - /* we had space! */ - return 0; - } - - /* - * lock up the sb for dipping into reserves before releasing the space - * that took us to ENOSPC. - */ - spin_lock(&mp->m_sb_lock); - percpu_counter_add(counter, delta); - if (!has_resv_pool || !rsvd) - goto fdblocks_enospc; - - lcounter = (long long)mp->m_resblks_avail - delta; - if (lcounter >= 0) { - mp->m_resblks_avail = lcounter; + percpu_counter_add_batch(&counter->count, -((int64_t)delta), batch); + if (__percpu_counter_compare(&counter->count, + xfs_freecounter_unavailable(mp, ctr), + XFS_FDBLOCKS_BATCH) < 0) { + /* + * Lock up the sb for dipping into reserves before releasing the + * space that took us to ENOSPC. + */ + spin_lock(&mp->m_sb_lock); + percpu_counter_add(&counter->count, delta); + if (!rsvd) + goto fdblocks_enospc; + if (delta > counter->res_avail) { + if (ctr == XC_FREE_BLOCKS) + xfs_warn_once(mp, +"Reserve blocks depleted! Consider increasing reserve pool size."); + goto fdblocks_enospc; + } + counter->res_avail -= delta; + trace_xfs_freecounter_reserved(mp, ctr, delta, _RET_IP_); spin_unlock(&mp->m_sb_lock); - return 0; } - xfs_warn_once(mp, -"Reserve blocks depleted! Consider increasing reserve pool size."); + + /* we had space! */ + return 0; fdblocks_enospc: + trace_xfs_freecounter_enospc(mp, ctr, delta, _RET_IP_); spin_unlock(&mp->m_sb_lock); return -ENOSPC; } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index fbed172d6770..799b84220ebb 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -98,11 +98,41 @@ struct xfs_groups { uint8_t blklog; /* + * Zoned devices can have gaps beyond the usable capacity of a zone and + * the end in the LBA/daddr address space. In other words, the hardware + * equivalent to the RT groups already takes care of the power of 2 + * alignment for us. In this case the sparse FSB/RTB address space maps + * 1:1 to the device address space. + */ + bool has_daddr_gaps; + + /* * Mask to extract the group-relative block number from a FSB. * For a pre-rtgroups filesystem we pretend to have one very large * rtgroup, so this mask must be 64-bit. */ uint64_t blkmask; + + /* + * Start of the first group in the device. This is used to support a + * RT device following the data device on the same block device for + * SMR hard drives. 
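xfs_dec_freecounter()/xfs_add_freecounter() above generalise the old fdblocks-only reserve pool to every counter in m_free[]: frees top the reserve back up before touching the main counter, and allocations may dip into the reserve only when the caller passes rsvd. A simplified, non-percpu model of that behaviour (standalone C; the sb lock and the batched percpu compare are omitted):

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>
#include <errno.h>

struct freecounter {
	int64_t count;		/* blocks available for general use */
	uint64_t res_total;	/* size of the emergency reserve */
	uint64_t res_avail;	/* unused part of the reserve */
	uint64_t unavailable;	/* set-aside that may never be allocated */
};

static void add_free(struct freecounter *fc, uint64_t delta)
{
	uint64_t res_used = fc->res_total - fc->res_avail;

	/* refill the reserve first, the remainder goes to the main counter */
	if (res_used >= delta) {
		fc->res_avail += delta;
		return;
	}
	fc->res_avail = fc->res_total;
	fc->count += delta - res_used;
}

static int dec_free(struct freecounter *fc, uint64_t delta, bool rsvd)
{
	fc->count -= delta;
	if (fc->count >= (int64_t)fc->unavailable)
		return 0;			/* we had space */

	fc->count += delta;			/* undo, then try the reserve */
	if (!rsvd || delta > fc->res_avail)
		return -ENOSPC;
	fc->res_avail -= delta;
	return 0;
}

int main(void)
{
	struct freecounter fc = {
		.count = 10, .res_total = 64, .res_avail = 64, .unavailable = 8,
	};

	printf("normal alloc: %d\n", dec_free(&fc, 16, false));	/* fails, -ENOSPC */
	printf("privileged alloc: %d\n", dec_free(&fc, 16, true));	/* 0, from reserve */
	add_free(&fc, 100);
	printf("reserve back to %llu, count %lld\n",
	       (unsigned long long)fc.res_avail, (long long)fc.count);
	return 0;
}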
+ */ + xfs_fsblock_t start_fsb; +}; + +struct xfs_freecounter { + /* free blocks for general use: */ + struct percpu_counter count; + + /* total reserved blocks: */ + uint64_t res_total; + + /* available reserved blocks: */ + uint64_t res_avail; + + /* reserved blks @ remount,ro: */ + uint64_t res_saved; }; /* @@ -198,6 +228,7 @@ typedef struct xfs_mount { bool m_fail_unmount; bool m_finobt_nores; /* no per-AG finobt resv. */ bool m_update_sb; /* sb needs update in mount */ + unsigned int m_max_open_zones; /* * Bitsets of per-fs metadata that have been checked and/or are sick. @@ -222,8 +253,8 @@ typedef struct xfs_mount { spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */ struct percpu_counter m_icount; /* allocated inodes counter */ struct percpu_counter m_ifree; /* free inodes counter */ - struct percpu_counter m_fdblocks; /* free block counter */ - struct percpu_counter m_frextents; /* free rt extent counter */ + + struct xfs_freecounter m_free[XC_FREE_NR]; /* * Count of data device blocks reserved for delayed allocations, @@ -245,10 +276,8 @@ typedef struct xfs_mount { atomic64_t m_allocbt_blks; struct xfs_groups m_groups[XG_TYPE_MAX]; - uint64_t m_resblks; /* total reserved blocks */ - uint64_t m_resblks_avail;/* available reserved blocks */ - uint64_t m_resblks_save; /* reserved blks @ remount,ro */ struct delayed_work m_reclaim_work; /* background inode reclaim */ + struct xfs_zone_info *m_zone_info; /* zone allocator information */ struct dentry *m_debugfs; /* debugfs parent */ struct xfs_kobj m_kobj; struct xfs_kobj m_error_kobj; @@ -258,10 +287,16 @@ typedef struct xfs_mount { #ifdef CONFIG_XFS_ONLINE_SCRUB_STATS struct xchk_stats *m_scrub_stats; #endif + struct xfs_kobj m_zoned_kobj; xfs_agnumber_t m_agfrotor; /* last ag where space found */ atomic_t m_agirotor; /* last ag dir inode alloced */ atomic_t m_rtgrotor; /* last rtgroup rtpicked */ + struct mutex m_metafile_resv_lock; + uint64_t m_metafile_resv_target; + uint64_t m_metafile_resv_used; + uint64_t m_metafile_resv_avail; + /* Memory shrinker to throttle and reprioritize inodegc */ struct shrinker *m_inodegc_shrinker; /* @@ -336,8 +371,10 @@ typedef struct xfs_mount { #define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */ #define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */ #define XFS_FEAT_METADIR (1ULL << 28) /* metadata directory tree */ +#define XFS_FEAT_ZONED (1ULL << 29) /* zoned RT device */ /* Mount features */ +#define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */ #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ #define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */ #define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */ @@ -392,6 +429,8 @@ __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) __XFS_HAS_FEAT(large_extent_counts, NREXT64) __XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE) __XFS_HAS_FEAT(metadir, METADIR) +__XFS_HAS_FEAT(zoned, ZONED) +__XFS_HAS_FEAT(nolifetime, NOLIFETIME) static inline bool xfs_has_rtgroups(const struct xfs_mount *mp) { @@ -402,7 +441,9 @@ static inline bool xfs_has_rtgroups(const struct xfs_mount *mp) static inline bool xfs_has_rtsb(const struct xfs_mount *mp) { /* all rtgroups filesystems with an rt section have an rtsb */ - return xfs_has_rtgroups(mp) && xfs_has_realtime(mp); + return xfs_has_rtgroups(mp) && + xfs_has_realtime(mp) && + !xfs_has_zoned(mp); } static inline bool xfs_has_rtrmapbt(const struct xfs_mount *mp) @@ -417,6 +458,11 @@ static inline bool xfs_has_rtreflink(const struct 
xfs_mount *mp) xfs_has_reflink(mp); } +static inline bool xfs_has_nonzoned(const struct xfs_mount *mp) +{ + return !xfs_has_zoned(mp); +} + /* * Some features are always on for v5 file systems, allow the compiler to * eliminiate dead code when building without v4 support. @@ -520,6 +566,10 @@ __XFS_HAS_FEAT(nouuid, NOUUID) #define XFS_OPSTATE_WARNED_METADIR 17 /* Filesystem should use qflags to determine quotaon status */ #define XFS_OPSTATE_RESUMING_QUOTAON 18 +/* Kernel has logged a warning about zoned RT device being used on this fs. */ +#define XFS_OPSTATE_WARNED_ZONED 19 +/* (Zoned) GC is in progress */ +#define XFS_OPSTATE_ZONEGC_RUNNING 20 #define __XFS_IS_OPSTATE(name, NAME) \ static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ @@ -564,6 +614,7 @@ static inline bool xfs_clear_resuming_quotaon(struct xfs_mount *mp) #endif /* CONFIG_XFS_QUOTA */ __XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT) __XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP) +__XFS_IS_OPSTATE(zonegc_running, ZONEGC_RUNNING) static inline bool xfs_should_warn(struct xfs_mount *mp, long nr) @@ -633,7 +684,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) } extern void xfs_uuid_table_free(void); -extern uint64_t xfs_default_resblks(xfs_mount_t *mp); +uint64_t xfs_default_resblks(struct xfs_mount *mp, + enum xfs_free_counter ctr); extern int xfs_mountfs(xfs_mount_t *mp); extern void xfs_unmountfs(xfs_mount_t *); @@ -646,45 +698,74 @@ extern void xfs_unmountfs(xfs_mount_t *); */ #define XFS_FDBLOCKS_BATCH 1024 +uint64_t xfs_freecounter_unavailable(struct xfs_mount *mp, + enum xfs_free_counter ctr); + /* - * Estimate the amount of free space that is not available to userspace and is - * not explicitly reserved from the incore fdblocks. This includes: - * - * - The minimum number of blocks needed to support splitting a bmap btree - * - The blocks currently in use by the freespace btrees because they record - * the actual blocks that will fill per-AG metadata space reservations + * Sum up the freecount, but never return negative values. */ -static inline uint64_t -xfs_fdblocks_unavailable( - struct xfs_mount *mp) +static inline s64 xfs_sum_freecounter(struct xfs_mount *mp, + enum xfs_free_counter ctr) { - return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); + return percpu_counter_sum_positive(&mp->m_free[ctr].count); } -int xfs_dec_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, +/* + * Same as above, but does return negative values. Mostly useful for + * special cases like repair and tracing. + */ +static inline s64 xfs_sum_freecounter_raw(struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + return percpu_counter_sum(&mp->m_free[ctr].count); +} + +/* + * This just provides and estimate without the cpu-local updates, use + * xfs_sum_freecounter for the exact value. 
+ */ +static inline s64 xfs_estimate_freecounter(struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + return percpu_counter_read_positive(&mp->m_free[ctr].count); +} + +static inline int xfs_compare_freecounter(struct xfs_mount *mp, + enum xfs_free_counter ctr, s64 rhs, s32 batch) +{ + return __percpu_counter_compare(&mp->m_free[ctr].count, rhs, batch); +} + +static inline void xfs_set_freecounter(struct xfs_mount *mp, + enum xfs_free_counter ctr, uint64_t val) +{ + percpu_counter_set(&mp->m_free[ctr].count, val); +} + +int xfs_dec_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr, uint64_t delta, bool rsvd); -void xfs_add_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, +void xfs_add_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr, uint64_t delta); static inline int xfs_dec_fdblocks(struct xfs_mount *mp, uint64_t delta, bool reserved) { - return xfs_dec_freecounter(mp, &mp->m_fdblocks, delta, reserved); + return xfs_dec_freecounter(mp, XC_FREE_BLOCKS, delta, reserved); } static inline void xfs_add_fdblocks(struct xfs_mount *mp, uint64_t delta) { - xfs_add_freecounter(mp, &mp->m_fdblocks, delta); + xfs_add_freecounter(mp, XC_FREE_BLOCKS, delta); } static inline int xfs_dec_frextents(struct xfs_mount *mp, uint64_t delta) { - return xfs_dec_freecounter(mp, &mp->m_frextents, delta, false); + return xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, delta, false); } static inline void xfs_add_frextents(struct xfs_mount *mp, uint64_t delta) { - xfs_add_freecounter(mp, &mp->m_frextents, delta); + xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, delta); } extern int xfs_readsb(xfs_mount_t *, int); @@ -706,5 +787,9 @@ int xfs_add_incompat_log_feature(struct xfs_mount *mp, uint32_t feature); bool xfs_clear_incompat_log_features(struct xfs_mount *mp); void xfs_mod_delalloc(struct xfs_inode *ip, int64_t data_delta, int64_t ind_delta); +static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta) +{ + percpu_counter_add(&mp->m_delalloc_blks, delta); +} #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index e1ba5af6250f..417439b58785 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1711,7 +1711,8 @@ xfs_qm_mount_quotas( * immediately. We only support rtquota if rtgroups are enabled to * avoid problems with older kernels. 
*/ - if (mp->m_sb.sb_rextents && !xfs_has_rtgroups(mp)) { + if (mp->m_sb.sb_rextents && + (!xfs_has_rtgroups(mp) || xfs_has_zoned(mp))) { xfs_notice(mp, "Cannot turn on quotas for realtime filesystem"); mp->m_qflags = 0; goto write_changes; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 59f7fc16eb80..cc3b4df88110 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -235,7 +235,7 @@ xfs_reflink_trim_around_shared( int error = 0; /* Holes, unwritten, and delalloc extents cannot be shared */ - if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) { + if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_written_extent(irec)) { *shared = false; return 0; } @@ -651,7 +651,7 @@ xfs_reflink_cancel_cow_blocks( if (isnullstartblock(del.br_startblock)) { xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &got, - &del); + &del, 0); } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); @@ -1207,15 +1207,9 @@ xfs_reflink_ag_has_free_space( if (!xfs_has_rmapbt(mp)) return 0; if (XFS_IS_REALTIME_INODE(ip)) { - struct xfs_rtgroup *rtg; - xfs_rgnumber_t rgno; - - rgno = xfs_rtb_to_rgno(mp, fsb); - rtg = xfs_rtgroup_get(mp, rgno); - if (xfs_metafile_resv_critical(rtg_rmap(rtg))) - error = -ENOSPC; - xfs_rtgroup_put(rtg); - return error; + if (xfs_metafile_resv_critical(mp)) + return -ENOSPC; + return 0; } agno = XFS_FSB_TO_AGNO(mp, fsb); @@ -1538,7 +1532,7 @@ xfs_reflink_zero_posteof( return 0; trace_xfs_zero_eof(ip, isize, pos - isize); - return xfs_zero_range(ip, isize, pos - isize, NULL); + return xfs_zero_range(ip, isize, pos - isize, NULL, NULL); } /* diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 57bef567e011..9a99629d7de4 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -33,6 +33,7 @@ #include "xfs_trace.h" #include "xfs_rtrefcount_btree.h" #include "xfs_reflink.h" +#include "xfs_zone_alloc.h" /* * Return whether there are any free extents in the size range given @@ -663,7 +664,8 @@ xfs_rtunmount_rtg( for (i = 0; i < XFS_RTGI_MAX; i++) xfs_rtginode_irele(&rtg->rtg_inodes[i]); - kvfree(rtg->rtg_rsum_cache); + if (!xfs_has_zoned(rtg_mount(rtg))) + kvfree(rtg->rtg_rsum_cache); } static int @@ -858,6 +860,84 @@ xfs_growfs_rt_init_rtsb( return error; } +static void +xfs_growfs_rt_sb_fields( + struct xfs_trans *tp, + const struct xfs_mount *nmp) +{ + struct xfs_mount *mp = tp->t_mountp; + + if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE, + nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize); + if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS, + nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks); + if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS, + nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks); + if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS, + nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents); + if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG, + nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog); + if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RGCOUNT, + nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount); +} + +static int +xfs_growfs_rt_zoned( + struct xfs_rtgroup *rtg, + xfs_rfsblock_t nrblocks) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_mount *nmp; + struct xfs_trans *tp; + xfs_rtbxlen_t freed_rtx; + int error; + + /* + * 
Calculate new sb and mount fields for this round. Also ensure the + * rtg_extents value is uptodate as the rtbitmap code relies on it. + */ + nmp = xfs_growfs_rt_alloc_fake_mount(mp, nrblocks, + mp->m_sb.sb_rextsize); + if (!nmp) + return -ENOMEM; + freed_rtx = nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents; + + xfs_rtgroup_calc_geometry(nmp, rtg, rtg_rgno(rtg), + nmp->m_sb.sb_rgcount, nmp->m_sb.sb_rextents); + + error = xfs_trans_alloc(mp, &M_RES(nmp)->tr_growrtfree, 0, 0, 0, &tp); + if (error) + goto out_free; + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); + + xfs_growfs_rt_sb_fields(tp, nmp); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, freed_rtx); + + error = xfs_trans_commit(tp); + if (error) + goto out_free; + + /* + * Ensure the mount RT feature flag is now set, and compute new + * maxlevels for rt btrees. + */ + mp->m_features |= XFS_FEAT_REALTIME; + xfs_rtrmapbt_compute_maxlevels(mp); + xfs_rtrefcountbt_compute_maxlevels(mp); + xfs_zoned_add_available(mp, freed_rtx); +out_free: + kfree(nmp); + return error; +} + static int xfs_growfs_rt_bmblock( struct xfs_rtgroup *rtg, @@ -943,24 +1023,7 @@ xfs_growfs_rt_bmblock( /* * Update superblock fields. */ - if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSIZE, - nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize); - if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBMBLOCKS, - nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks); - if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBLOCKS, - nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks); - if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTENTS, - nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents); - if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSLOG, - nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog); - if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RGCOUNT, - nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount); + xfs_growfs_rt_sb_fields(args.tp, nmp); /* * Free the new extent. @@ -1127,6 +1190,11 @@ xfs_growfs_rtg( goto out_rele; } + if (xfs_has_zoned(mp)) { + error = xfs_growfs_rt_zoned(rtg, nrblocks); + goto out_rele; + } + error = xfs_growfs_rt_alloc_blocks(rtg, nrblocks, rextsize, &bmblocks); if (error) goto out_rele; @@ -1146,8 +1214,7 @@ xfs_growfs_rtg( if (old_rsum_cache) kvfree(old_rsum_cache); - xfs_rtgroup_rele(rtg); - return 0; + goto out_rele; out_error: /* @@ -1195,6 +1262,22 @@ xfs_growfs_check_rtgeom( if (min_logfsbs > mp->m_sb.sb_logblocks) return -EINVAL; + + if (xfs_has_zoned(mp)) { + uint32_t gblocks = mp->m_groups[XG_TYPE_RTG].blocks; + uint32_t rem; + + if (rextsize != 1) + return -EINVAL; + div_u64_rem(mp->m_sb.sb_rblocks, gblocks, &rem); + if (rem) { + xfs_warn(mp, +"new RT volume size (%lld) not aligned to RT group size (%d)", + mp->m_sb.sb_rblocks, gblocks); + return -EINVAL; + } + } + return 0; } @@ -1249,6 +1332,35 @@ xfs_grow_last_rtg( } /* + * Read in the last block of the RT device to make sure it is accessible. 
+ */ +static int +xfs_rt_check_size( + struct xfs_mount *mp, + xfs_rfsblock_t last_block) +{ + xfs_daddr_t daddr = XFS_FSB_TO_BB(mp, last_block); + struct xfs_buf *bp; + int error; + + if (XFS_BB_TO_FSB(mp, daddr) != last_block) { + xfs_warn(mp, "RT device size overflow: %llu != %llu", + XFS_BB_TO_FSB(mp, daddr), last_block); + return -EFBIG; + } + + error = xfs_buf_read_uncached(mp->m_rtdev_targp, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart) + daddr, + XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + if (error) + xfs_warn(mp, "cannot read last RT device sector (%lld)", + last_block); + else + xfs_buf_relse(bp); + return error; +} + +/* * Grow the realtime area of the filesystem. */ int @@ -1259,7 +1371,6 @@ xfs_growfs_rt( xfs_rgnumber_t old_rgcount = mp->m_sb.sb_rgcount; xfs_rgnumber_t new_rgcount = 1; xfs_rgnumber_t rgno; - struct xfs_buf *bp; xfs_agblock_t old_rextsize = mp->m_sb.sb_rextsize; int error; @@ -1302,15 +1413,10 @@ xfs_growfs_rt( error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks); if (error) goto out_unlock; - /* - * Read in the last block of the device, make sure it exists. - */ - error = xfs_buf_read_uncached(mp->m_rtdev_targp, - XFS_FSB_TO_BB(mp, in->newblocks - 1), - XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + + error = xfs_rt_check_size(mp, in->newblocks - 1); if (error) goto out_unlock; - xfs_buf_relse(bp); /* * Calculate new parameters. These are the final values to be reached. @@ -1376,8 +1482,7 @@ xfs_growfs_rt( error = error2; /* Reset the rt metadata btree space reservations. */ - xfs_rt_resv_free(mp); - error2 = xfs_rt_resv_init(mp); + error2 = xfs_metafile_resv_init(mp); if (error2 && error2 != -ENOSPC) error = error2; } @@ -1444,10 +1549,6 @@ int /* error */ xfs_rtmount_init( struct xfs_mount *mp) /* file system mount structure */ { - struct xfs_buf *bp; /* buffer for last block of subvolume */ - xfs_daddr_t d; /* address of last block of subvolume */ - int error; - if (mp->m_sb.sb_rblocks == 0) return 0; if (mp->m_rtdev_targp == NULL) { @@ -1458,25 +1559,7 @@ xfs_rtmount_init( mp->m_rsumblocks = xfs_rtsummary_blockcount(mp, &mp->m_rsumlevels); - /* - * Check that the realtime section is an ok size. - */ - d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); - if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) { - xfs_warn(mp, "realtime mount -- %llu != %llu", - (unsigned long long) XFS_BB_TO_FSB(mp, d), - (unsigned long long) mp->m_sb.sb_rblocks); - return -EFBIG; - } - error = xfs_buf_read_uncached(mp->m_rtdev_targp, - d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); - if (error) { - xfs_warn(mp, "realtime device size check failed"); - return error; - } - xfs_buf_relse(bp); - return 0; + return xfs_rt_check_size(mp, mp->m_sb.sb_rblocks - 1); } static int @@ -1519,50 +1602,10 @@ xfs_rtalloc_reinit_frextents( spin_lock(&mp->m_sb_lock); mp->m_sb.sb_frextents = val; spin_unlock(&mp->m_sb_lock); - percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_sb.sb_frextents); return 0; } -/* Free space reservations for rt metadata inodes. */ -void -xfs_rt_resv_free( - struct xfs_mount *mp) -{ - struct xfs_rtgroup *rtg = NULL; - unsigned int i; - - while ((rtg = xfs_rtgroup_next(mp, rtg))) { - for (i = 0; i < XFS_RTGI_MAX; i++) - xfs_metafile_resv_free(rtg->rtg_inodes[i]); - } -} - -/* Reserve space for rt metadata inodes' space expansion. 
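xfs_rt_check_size() above replaces the two open-coded "read the last block" checks. The interesting part is the overflow test: it converts the last filesystem block to a 512-byte sector address and back, and insists the round trip is lossless before issuing the uncached read at sb_rtstart + daddr. A sketch of just that round-trip check (standalone C; 4k blocks assumed and the rounding of the real conversion macros ignored):

#include <stdint.h>
#include <stdio.h>

#define SECTOR_SHIFT 9	/* 512-byte basic blocks */

/*
 * Return 0 if the given filesystem block is addressable as a sector count
 * without truncation, -1 otherwise.
 */
static int check_last_block(uint64_t last_fsb, unsigned int blocklog)
{
	uint64_t daddr = last_fsb << (blocklog - SECTOR_SHIFT);
	uint64_t roundtrip = daddr >> (blocklog - SECTOR_SHIFT);

	if (roundtrip != last_fsb) {
		fprintf(stderr, "RT device size overflow: %llu != %llu\n",
			(unsigned long long)roundtrip,
			(unsigned long long)last_fsb);
		return -1;
	}
	/* the kernel would now read one block at sb_rtstart + daddr */
	return 0;
}

int main(void)
{
	/* 4k blocks: blocklog 12, so three extra address bits per block */
	printf("sane size: %d\n", check_last_block(1ULL << 40, 12));
	printf("overflow:  %d\n", check_last_block(1ULL << 62, 12));
	return 0;
}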
*/ -int -xfs_rt_resv_init( - struct xfs_mount *mp) -{ - struct xfs_rtgroup *rtg = NULL; - xfs_filblks_t ask; - int error = 0; - - while ((rtg = xfs_rtgroup_next(mp, rtg))) { - int err2; - - ask = xfs_rtrmapbt_calc_reserves(mp); - err2 = xfs_metafile_resv_init(rtg_rmap(rtg), ask); - if (err2 && !error) - error = err2; - - ask = xfs_rtrefcountbt_calc_reserves(mp); - err2 = xfs_metafile_resv_init(rtg_refcount(rtg), ask); - if (err2 && !error) - error = err2; - } - - return error; -} - /* * Read in the bmbt of an rt metadata inode so that we never have to load them * at runtime. This enables the use of shared ILOCKs for rtbitmap scans. Use @@ -1613,6 +1656,8 @@ xfs_rtmount_rtg( } } + if (xfs_has_zoned(mp)) + return 0; return xfs_alloc_rsum_cache(rtg, mp->m_sb.sb_rbmblocks); } @@ -2097,6 +2142,8 @@ xfs_bmap_rtalloc( ap->datatype & XFS_ALLOC_INITIAL_USER_DATA; int error; + ASSERT(!xfs_has_zoned(ap->tp->t_mountp)); + retry: error = xfs_rtallocate_align(ap, &ralen, &raminlen, &prod, &noalign); if (error) diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 0d95b29092c9..78a690b489ed 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -34,9 +34,6 @@ int /* error */ xfs_rtmount_inodes( struct xfs_mount *mp); /* file system mount structure */ -void xfs_rt_resv_free(struct xfs_mount *mp); -int xfs_rt_resv_init(struct xfs_mount *mp); - /* * Grow the realtime area of the filesystem. */ @@ -65,8 +62,6 @@ xfs_rtmount_init( } # define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (-ENOSYS)) # define xfs_rtunmount_inodes(m) -# define xfs_rt_resv_free(mp) ((void)0) -# define xfs_rt_resv_init(mp) (0) static inline int xfs_growfs_check_rtgeom(const struct xfs_mount *mp, diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 0055066fb1d9..af5e63cb6a99 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -46,6 +46,7 @@ #include "xfs_exchmaps_item.h" #include "xfs_parent.h" #include "xfs_rtalloc.h" +#include "xfs_zone_alloc.h" #include "scrub/stats.h" #include "scrub/rcbag_btree.h" @@ -109,7 +110,8 @@ enum { Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, - Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, + Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones, + Opt_lifetime, Opt_nolifetime, }; static const struct fs_parameter_spec xfs_fs_parameters[] = { @@ -154,6 +156,9 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = { fsparam_flag("nodiscard", Opt_nodiscard), fsparam_flag("dax", Opt_dax), fsparam_enum("dax", Opt_dax_enum, dax_param_enums), + fsparam_u32("max_open_zones", Opt_max_open_zones), + fsparam_flag("lifetime", Opt_lifetime), + fsparam_flag("nolifetime", Opt_nolifetime), {} }; @@ -182,6 +187,7 @@ xfs_fs_show_options( { XFS_FEAT_LARGE_IOSIZE, ",largeio" }, { XFS_FEAT_DAX_ALWAYS, ",dax=always" }, { XFS_FEAT_DAX_NEVER, ",dax=never" }, + { XFS_FEAT_NOLIFETIME, ",nolifetime" }, { 0, NULL } }; struct xfs_mount *mp = XFS_M(root->d_sb); @@ -233,6 +239,9 @@ xfs_fs_show_options( if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) seq_puts(m, ",noquota"); + if (mp->m_max_open_zones) + seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones); + return 0; } @@ -533,7 +542,15 @@ xfs_setup_devices( if (error) return error; } - if (mp->m_rtdev_targp) { + + if (mp->m_sb.sb_rtstart) { + if (mp->m_rtdev_targp) { + xfs_warn(mp, + "can't use internal and external rtdev at the same time"); + return -EINVAL; + } + 
mp->m_rtdev_targp = mp->m_ddev_targp; + } else if (mp->m_rtname) { error = xfs_setsize_buftarg(mp->m_rtdev_targp, mp->m_sb.sb_sectsize); if (error) @@ -757,7 +774,7 @@ xfs_mount_free( { if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_free_buftarg(mp->m_logdev_targp); - if (mp->m_rtdev_targp) + if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp) xfs_free_buftarg(mp->m_rtdev_targp); if (mp->m_ddev_targp) xfs_free_buftarg(mp->m_ddev_targp); @@ -814,6 +831,7 @@ xfs_fs_sync_fs( if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) { xfs_inodegc_stop(mp); xfs_blockgc_stop(mp); + xfs_zone_gc_stop(mp); } return 0; @@ -834,10 +852,12 @@ xfs_statfs_data( struct kstatfs *st) { int64_t fdblocks = - percpu_counter_sum(&mp->m_fdblocks); + xfs_sum_freecounter(mp, XC_FREE_BLOCKS); /* make sure st->f_bfree does not underflow */ - st->f_bfree = max(0LL, fdblocks - xfs_fdblocks_unavailable(mp)); + st->f_bfree = max(0LL, + fdblocks - xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS)); + /* * sb_dblocks can change during growfs, but nothing cares about reporting * the old or new value during growfs. @@ -856,8 +876,9 @@ xfs_statfs_rt( struct kstatfs *st) { st->f_bfree = xfs_rtbxlen_to_blen(mp, - percpu_counter_sum_positive(&mp->m_frextents)); - st->f_blocks = mp->m_sb.sb_rblocks; + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS)); + st->f_blocks = mp->m_sb.sb_rblocks - xfs_rtbxlen_to_blen(mp, + mp->m_free[XC_FREE_RTEXTENTS].res_total); } static void @@ -922,24 +943,32 @@ xfs_fs_statfs( } STATIC void -xfs_save_resvblks(struct xfs_mount *mp) +xfs_save_resvblks( + struct xfs_mount *mp) { - mp->m_resblks_save = mp->m_resblks; - xfs_reserve_blocks(mp, 0); + enum xfs_free_counter i; + + for (i = 0; i < XC_FREE_NR; i++) { + mp->m_free[i].res_saved = mp->m_free[i].res_total; + xfs_reserve_blocks(mp, i, 0); + } } STATIC void -xfs_restore_resvblks(struct xfs_mount *mp) +xfs_restore_resvblks( + struct xfs_mount *mp) { - uint64_t resblks; + uint64_t resblks; + enum xfs_free_counter i; - if (mp->m_resblks_save) { - resblks = mp->m_resblks_save; - mp->m_resblks_save = 0; - } else - resblks = xfs_default_resblks(mp); - - xfs_reserve_blocks(mp, resblks); + for (i = 0; i < XC_FREE_NR; i++) { + if (mp->m_free[i].res_saved) { + resblks = mp->m_free[i].res_saved; + mp->m_free[i].res_saved = 0; + } else + resblks = xfs_default_resblks(mp, i); + xfs_reserve_blocks(mp, i, resblks); + } } /* @@ -976,6 +1005,7 @@ xfs_fs_freeze( if (ret && !xfs_is_readonly(mp)) { xfs_blockgc_start(mp); xfs_inodegc_start(mp); + xfs_zone_gc_start(mp); } return ret; @@ -997,6 +1027,7 @@ xfs_fs_unfreeze( * filesystem. 
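The statfs changes above have two visible effects: f_bfree for the data device subtracts the per-AG set-asides from the summed counter and is clamped at zero, and the realtime reserve (nonzero only on zoned filesystems) is hidden from f_blocks so df does not report space the garbage collector owns. Roughly (standalone C; the block/extent conversion is reduced to a fixed rextsize):

#include <stdint.h>
#include <stdio.h>

struct rt_stat {
	uint64_t f_blocks;	/* total blocks reported to statfs */
	uint64_t f_bfree;	/* free blocks reported to statfs */
};

static struct rt_stat statfs_rt(uint64_t rblocks, uint64_t free_rtx,
				uint64_t reserved_rtx, uint64_t rextsize)
{
	struct rt_stat st = {
		/* the reserved pool is carved out of the advertised size */
		.f_blocks = rblocks - reserved_rtx * rextsize,
		.f_bfree = free_rtx * rextsize,
	};
	return st;
}

static uint64_t statfs_data_bfree(int64_t fdblocks, uint64_t unavailable)
{
	int64_t bfree = fdblocks - (int64_t)unavailable;

	return bfree > 0 ? (uint64_t)bfree : 0;	/* never let f_bfree underflow */
}

int main(void)
{
	struct rt_stat rt = statfs_rt(1 << 20, 1000, 256, 1);

	printf("data f_bfree: %llu\n",
	       (unsigned long long)statfs_data_bfree(500, 800));
	printf("rt f_blocks: %llu, f_bfree: %llu\n",
	       (unsigned long long)rt.f_blocks, (unsigned long long)rt.f_bfree);
	return 0;
}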
*/ if (!xfs_is_readonly(mp)) { + xfs_zone_gc_start(mp); xfs_blockgc_start(mp); xfs_inodegc_start(mp); } @@ -1058,6 +1089,19 @@ xfs_finish_flags( return -EINVAL; } + if (!xfs_has_zoned(mp)) { + if (mp->m_max_open_zones) { + xfs_warn(mp, +"max_open_zones mount option only supported on zoned file systems."); + return -EINVAL; + } + if (mp->m_features & XFS_FEAT_NOLIFETIME) { + xfs_warn(mp, +"nolifetime mount option only supported on zoned file systems."); + return -EINVAL; + } + } + return 0; } @@ -1065,7 +1109,8 @@ static int xfs_init_percpu_counters( struct xfs_mount *mp) { - int error; + int error; + int i; error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL); if (error) @@ -1075,30 +1120,29 @@ xfs_init_percpu_counters( if (error) goto free_icount; - error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL); - if (error) - goto free_ifree; - error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL); if (error) - goto free_fdblocks; + goto free_ifree; error = percpu_counter_init(&mp->m_delalloc_rtextents, 0, GFP_KERNEL); if (error) goto free_delalloc; - error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL); - if (error) - goto free_delalloc_rt; + for (i = 0; i < XC_FREE_NR; i++) { + error = percpu_counter_init(&mp->m_free[i].count, 0, + GFP_KERNEL); + if (error) + goto free_freecounters; + } return 0; -free_delalloc_rt: +free_freecounters: + while (--i > 0) + percpu_counter_destroy(&mp->m_free[i].count); percpu_counter_destroy(&mp->m_delalloc_rtextents); free_delalloc: percpu_counter_destroy(&mp->m_delalloc_blks); -free_fdblocks: - percpu_counter_destroy(&mp->m_fdblocks); free_ifree: percpu_counter_destroy(&mp->m_ifree); free_icount: @@ -1112,24 +1156,28 @@ xfs_reinit_percpu_counters( { percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount); percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree); - percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks); - percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); + xfs_set_freecounter(mp, XC_FREE_BLOCKS, mp->m_sb.sb_fdblocks); + if (!xfs_has_zoned(mp)) + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, + mp->m_sb.sb_frextents); } static void xfs_destroy_percpu_counters( struct xfs_mount *mp) { + enum xfs_free_counter i; + + for (i = 0; i < XC_FREE_NR; i++) + percpu_counter_destroy(&mp->m_free[i].count); percpu_counter_destroy(&mp->m_icount); percpu_counter_destroy(&mp->m_ifree); - percpu_counter_destroy(&mp->m_fdblocks); ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_rtextents) == 0); percpu_counter_destroy(&mp->m_delalloc_rtextents); ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_blks) == 0); percpu_counter_destroy(&mp->m_delalloc_blks); - percpu_counter_destroy(&mp->m_frextents); } static int @@ -1210,6 +1258,18 @@ xfs_fs_shutdown( xfs_force_shutdown(XFS_M(sb), SHUTDOWN_DEVICE_REMOVED); } +static int +xfs_fs_show_stats( + struct seq_file *m, + struct dentry *root) +{ + struct xfs_mount *mp = XFS_M(root->d_sb); + + if (xfs_has_zoned(mp) && IS_ENABLED(CONFIG_XFS_RT)) + xfs_zoned_show_stats(m, mp); + return 0; +} + static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, @@ -1224,6 +1284,7 @@ static const struct super_operations xfs_super_operations = { .nr_cached_objects = xfs_fs_nr_cached_objects, .free_cached_objects = xfs_fs_free_cached_objects, .shutdown = xfs_fs_shutdown, + .show_stats = xfs_fs_show_stats, }; static int @@ -1436,6 +1497,15 @@ xfs_fs_parse_param( xfs_fs_warn_deprecated(fc, param, 
XFS_FEAT_NOATTR2, true); parsing_mp->m_features |= XFS_FEAT_NOATTR2; return 0; + case Opt_max_open_zones: + parsing_mp->m_max_open_zones = result.uint_32; + return 0; + case Opt_lifetime: + parsing_mp->m_features &= ~XFS_FEAT_NOLIFETIME; + return 0; + case Opt_nolifetime: + parsing_mp->m_features |= XFS_FEAT_NOLIFETIME; + return 0; default: xfs_warn(parsing_mp, "unknown mount option [%s].", param->key); return -EINVAL; @@ -1780,8 +1850,17 @@ xfs_fs_fill_super( mp->m_features &= ~XFS_FEAT_DISCARD; } - if (xfs_has_metadir(mp)) + if (xfs_has_zoned(mp)) { + if (!xfs_has_metadir(mp)) { + xfs_alert(mp, + "metadir feature required for zoned realtime devices."); + error = -EINVAL; + goto out_filestream_unmount; + } + xfs_warn_experimental(mp, XFS_EXPERIMENTAL_ZONED); + } else if (xfs_has_metadir(mp)) { xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR); + } if (xfs_has_reflink(mp)) { if (xfs_has_realtime(mp) && @@ -1793,6 +1872,13 @@ xfs_fs_fill_super( goto out_filestream_unmount; } + if (xfs_has_zoned(mp)) { + xfs_alert(mp, + "reflink not compatible with zoned RT device!"); + error = -EINVAL; + goto out_filestream_unmount; + } + if (xfs_globals.always_cow) { xfs_info(mp, "using DEBUG-only always_cow mode."); mp->m_always_cow = true; @@ -1917,6 +2003,9 @@ xfs_remount_rw( /* Re-enable the background inode inactivation worker. */ xfs_inodegc_start(mp); + /* Restart zone reclaim */ + xfs_zone_gc_start(mp); + return 0; } @@ -1961,6 +2050,9 @@ xfs_remount_ro( */ xfs_inodegc_stop(mp); + /* Stop zone reclaim */ + xfs_zone_gc_stop(mp); + /* Free the per-AG metadata reservation pool. */ xfs_fs_unreserve_ag_blocks(mp); @@ -2082,6 +2174,7 @@ xfs_init_fs_context( for (i = 0; i < XG_TYPE_MAX; i++) xa_init(&mp->m_groups[i].xa); mutex_init(&mp->m_growlock); + mutex_init(&mp->m_metafile_resv_lock); INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); mp->m_kobj.kobject.kset = xfs_kset; diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 60cb5318fdae..b0857e3c1270 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -13,6 +13,7 @@ #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_mount.h" +#include "xfs_zones.h" struct xfs_sysfs_attr { struct attribute attr; @@ -69,7 +70,7 @@ static struct attribute *xfs_mp_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_mp); -const struct kobj_type xfs_mp_ktype = { +static const struct kobj_type xfs_mp_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_mp_groups, @@ -701,45 +702,103 @@ out_error: return error; } +static inline struct xfs_mount *zoned_to_mp(struct kobject *kobj) +{ + return container_of(to_kobj(kobj), struct xfs_mount, m_zoned_kobj); +} + +static ssize_t +max_open_zones_show( + struct kobject *kobj, + char *buf) +{ + /* only report the open zones available for user data */ + return sysfs_emit(buf, "%u\n", + zoned_to_mp(kobj)->m_max_open_zones - XFS_OPEN_GC_ZONES); +} +XFS_SYSFS_ATTR_RO(max_open_zones); + +static struct attribute *xfs_zoned_attrs[] = { + ATTR_LIST(max_open_zones), + NULL, +}; +ATTRIBUTE_GROUPS(xfs_zoned); + +static const struct kobj_type xfs_zoned_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_groups = xfs_zoned_groups, +}; + int -xfs_error_sysfs_init( +xfs_mount_sysfs_init( struct xfs_mount *mp) { int error; + super_set_sysfs_name_id(mp->m_super); + + /* .../xfs/<dev>/ */ + error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, + NULL, mp->m_super->s_id); + if (error) + return error; + + /* 
.../xfs/<dev>/stats/ */ + error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype, + &mp->m_kobj, "stats"); + if (error) + goto out_remove_fsdir; + /* .../xfs/<dev>/error/ */ error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype, &mp->m_kobj, "error"); if (error) - return error; + goto out_remove_stats_dir; + /* .../xfs/<dev>/error/fail_at_unmount */ error = sysfs_create_file(&mp->m_error_kobj.kobject, ATTR_LIST(fail_at_unmount)); if (error) - goto out_error; + goto out_remove_error_dir; /* .../xfs/<dev>/error/metadata/ */ error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA, "metadata", &mp->m_error_meta_kobj, xfs_error_meta_init); if (error) - goto out_error; + goto out_remove_error_dir; + + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) { + /* .../xfs/<dev>/zoned/ */ + error = xfs_sysfs_init(&mp->m_zoned_kobj, &xfs_zoned_ktype, + &mp->m_kobj, "zoned"); + if (error) + goto out_remove_error_dir; + } return 0; -out_error: +out_remove_error_dir: xfs_sysfs_del(&mp->m_error_kobj); +out_remove_stats_dir: + xfs_sysfs_del(&mp->m_stats.xs_kobj); +out_remove_fsdir: + xfs_sysfs_del(&mp->m_kobj); return error; } void -xfs_error_sysfs_del( +xfs_mount_sysfs_del( struct xfs_mount *mp) { struct xfs_error_cfg *cfg; int i, j; + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) + xfs_sysfs_del(&mp->m_zoned_kobj); + for (i = 0; i < XFS_ERR_CLASS_MAX; i++) { for (j = 0; j < XFS_ERR_ERRNO_MAX; j++) { cfg = &mp->m_error_cfg[i][j]; @@ -749,6 +808,8 @@ xfs_error_sysfs_del( } xfs_sysfs_del(&mp->m_error_meta_kobj); xfs_sysfs_del(&mp->m_error_kobj); + xfs_sysfs_del(&mp->m_stats.xs_kobj); + xfs_sysfs_del(&mp->m_kobj); } struct xfs_error_cfg * diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index 148893ebfdef..1622fe80ad3e 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -7,7 +7,6 @@ #ifndef __XFS_SYSFS_H__ #define __XFS_SYSFS_H__ -extern const struct kobj_type xfs_mp_ktype; /* xfs_mount */ extern const struct kobj_type xfs_dbg_ktype; /* debug */ extern const struct kobj_type xfs_log_ktype; /* xlog */ extern const struct kobj_type xfs_stats_ktype; /* stats */ @@ -53,7 +52,7 @@ xfs_sysfs_del( wait_for_completion(&kobj->complete); } -int xfs_error_sysfs_init(struct xfs_mount *mp); -void xfs_error_sysfs_del(struct xfs_mount *mp); +int xfs_mount_sysfs_init(struct xfs_mount *mp); +void xfs_mount_sysfs_del(struct xfs_mount *mp); #endif /* __XFS_SYSFS_H__ */ diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 8f530e69c18a..a60556dbd172 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -49,6 +49,8 @@ #include "xfs_metafile.h" #include "xfs_metadir.h" #include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index bfc2f1249022..83f894c07866 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -102,6 +102,7 @@ struct xfs_rmap_intent; struct xfs_refcount_intent; struct xfs_metadir_update; struct xfs_rtgroup; +struct xfs_open_zone; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -265,6 +266,152 @@ DEFINE_GROUP_REF_EVENT(xfs_group_grab); DEFINE_GROUP_REF_EVENT(xfs_group_grab_next_tag); DEFINE_GROUP_REF_EVENT(xfs_group_rele); +#ifdef CONFIG_XFS_RT +DECLARE_EVENT_CLASS(xfs_zone_class, + TP_PROTO(struct xfs_rtgroup *rtg), + TP_ARGS(rtg), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(unsigned int, nr_open) + ), + TP_fast_assign( + struct 
xfs_mount *mp = rtg_mount(rtg); + + __entry->dev = mp->m_super->s_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->used = rtg_rmap(rtg)->i_used_blocks; + __entry->nr_open = mp->m_zone_info->zi_nr_open_zones; + ), + TP_printk("dev %d:%d rgno 0x%x used 0x%x nr_open %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->nr_open) +); + +#define DEFINE_ZONE_EVENT(name) \ +DEFINE_EVENT(xfs_zone_class, name, \ + TP_PROTO(struct xfs_rtgroup *rtg), \ + TP_ARGS(rtg)) +DEFINE_ZONE_EVENT(xfs_zone_emptied); +DEFINE_ZONE_EVENT(xfs_zone_full); +DEFINE_ZONE_EVENT(xfs_zone_opened); +DEFINE_ZONE_EVENT(xfs_zone_reset); +DEFINE_ZONE_EVENT(xfs_zone_gc_target_opened); + +TRACE_EVENT(xfs_zone_free_blocks, + TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno, + xfs_extlen_t len), + TP_ARGS(rtg, rgbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(xfs_rgblock_t, rgbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = rtg_mount(rtg)->m_super->s_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->used = rtg_rmap(rtg)->i_used_blocks; + __entry->rgbno = rgbno; + __entry->len = len; + ), + TP_printk("dev %d:%d rgno 0x%x used 0x%x rgbno 0x%x len 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->rgbno, + __entry->len) +); + +DECLARE_EVENT_CLASS(xfs_zone_alloc_class, + TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno, + xfs_extlen_t len), + TP_ARGS(oz, rgbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(xfs_rgblock_t, written) + __field(xfs_rgblock_t, write_pointer) + __field(xfs_rgblock_t, rgbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = rtg_mount(oz->oz_rtg)->m_super->s_dev; + __entry->rgno = rtg_rgno(oz->oz_rtg); + __entry->used = rtg_rmap(oz->oz_rtg)->i_used_blocks; + __entry->written = oz->oz_written; + __entry->write_pointer = oz->oz_write_pointer; + __entry->rgbno = rgbno; + __entry->len = len; + ), + TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x rgbno 0x%x len 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->written, + __entry->write_pointer, + __entry->rgbno, + __entry->len) +); + +#define DEFINE_ZONE_ALLOC_EVENT(name) \ +DEFINE_EVENT(xfs_zone_alloc_class, name, \ + TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno, \ + xfs_extlen_t len), \ + TP_ARGS(oz, rgbno, len)) +DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks); +DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks); + +TRACE_EVENT(xfs_zone_gc_select_victim, + TP_PROTO(struct xfs_rtgroup *rtg, unsigned int bucket), + TP_ARGS(rtg, bucket), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(unsigned int, bucket) + ), + TP_fast_assign( + __entry->dev = rtg_mount(rtg)->m_super->s_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->used = rtg_rmap(rtg)->i_used_blocks; + __entry->bucket = bucket; + ), + TP_printk("dev %d:%d rgno 0x%x used 0x%x bucket %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->bucket) +); + +TRACE_EVENT(xfs_zones_mount, + TP_PROTO(struct xfs_mount *mp), + TP_ARGS(mp), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgcount) + __field(uint32_t, blocks) + __field(unsigned int, max_open_zones) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rgcount = 
mp->m_sb.sb_rgcount; + __entry->blocks = mp->m_groups[XG_TYPE_RTG].blocks; + __entry->max_open_zones = mp->m_max_open_zones; + ), + TP_printk("dev %d:%d zoned %u blocks_per_zone %u, max_open %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgcount, + __entry->blocks, + __entry->max_open_zones) +); +#endif /* CONFIG_XFS_RT */ + TRACE_EVENT(xfs_inodegc_worker, TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits), TP_ARGS(mp, shrinker_hits), @@ -1596,6 +1743,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write); DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten); DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append); DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read); +DEFINE_SIMPLE_IO_EVENT(xfs_zoned_map_blocks); DECLARE_EVENT_CLASS(xfs_itrunc_class, TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), @@ -3983,6 +4131,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_skip); DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); @@ -5606,11 +5755,10 @@ DEFINE_METADIR_EVENT(xfs_metadir_lookup); /* metadata inode space reservations */ DECLARE_EVENT_CLASS(xfs_metafile_resv_class, - TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), - TP_ARGS(ip, len), + TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len), + TP_ARGS(mp, len), TP_STRUCT__entry( __field(dev_t, dev) - __field(xfs_ino_t, ino) __field(unsigned long long, freeblks) __field(unsigned long long, reserved) __field(unsigned long long, asked) @@ -5618,19 +5766,15 @@ DECLARE_EVENT_CLASS(xfs_metafile_resv_class, __field(unsigned long long, len) ), TP_fast_assign( - struct xfs_mount *mp = ip->i_mount; - __entry->dev = mp->m_super->s_dev; - __entry->ino = ip->i_ino; - __entry->freeblks = percpu_counter_sum(&mp->m_fdblocks); - __entry->reserved = ip->i_delayed_blks; - __entry->asked = ip->i_meta_resv_asked; - __entry->used = ip->i_nblocks; + __entry->freeblks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS); + __entry->reserved = mp->m_metafile_resv_avail; + __entry->asked = mp->m_metafile_resv_target; + __entry->used = mp->m_metafile_resv_used; __entry->len = len; ), - TP_printk("dev %d:%d ino 0x%llx freeblks %llu resv %llu ask %llu used %llu len %llu", + TP_printk("dev %d:%d freeblks %llu resv %llu ask %llu used %llu len %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, __entry->freeblks, __entry->reserved, __entry->asked, @@ -5639,14 +5783,14 @@ DECLARE_EVENT_CLASS(xfs_metafile_resv_class, ) #define DEFINE_METAFILE_RESV_EVENT(name) \ DEFINE_EVENT(xfs_metafile_resv_class, name, \ - TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), \ - TP_ARGS(ip, len)) + TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len), \ + TP_ARGS(mp, len)) DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init); DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free); DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_alloc_space); DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free_space); DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_critical); -DEFINE_INODE_ERROR_EVENT(xfs_metafile_resv_init_error); +DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init_error); #ifdef CONFIG_XFS_RT TRACE_EVENT(xfs_growfs_check_rtgeom, @@ -5669,6 +5813,46 @@ TRACE_EVENT(xfs_growfs_check_rtgeom, ); #endif /* CONFIG_XFS_RT */ +TRACE_DEFINE_ENUM(XC_FREE_BLOCKS); +TRACE_DEFINE_ENUM(XC_FREE_RTEXTENTS); 
+TRACE_DEFINE_ENUM(XC_FREE_RTAVAILABLE); + +DECLARE_EVENT_CLASS(xfs_freeblocks_resv_class, + TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr, + uint64_t delta, unsigned long caller_ip), + TP_ARGS(mp, ctr, delta, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(enum xfs_free_counter, ctr) + __field(uint64_t, delta) + __field(uint64_t, avail) + __field(uint64_t, total) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ctr = ctr; + __entry->delta = delta; + __entry->avail = mp->m_free[ctr].res_avail; + __entry->total = mp->m_free[ctr].res_total; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ctr %s delta %llu avail %llu total %llu caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->ctr, XFS_FREECOUNTER_STR), + __entry->delta, + __entry->avail, + __entry->total, + (char *)__entry->caller_ip) +) +#define DEFINE_FREEBLOCKS_RESV_EVENT(name) \ +DEFINE_EVENT(xfs_freeblocks_resv_class, name, \ + TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr, \ + uint64_t delta, unsigned long caller_ip), \ + TP_ARGS(mp, ctr, delta, caller_ip)) +DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_reserved); +DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_enospc); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c new file mode 100644 index 000000000000..fd4c60a050e6 --- /dev/null +++ b/fs/xfs/xfs_zone_alloc.c @@ -0,0 +1,1211 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_error.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_iomap.h" +#include "xfs_trans.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_refcount.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" +#include "xfs_zones.h" +#include "xfs_trace.h" + +void +xfs_open_zone_put( + struct xfs_open_zone *oz) +{ + if (atomic_dec_and_test(&oz->oz_ref)) { + xfs_rtgroup_rele(oz->oz_rtg); + kfree(oz); + } +} + +static inline uint32_t +xfs_zone_bucket( + struct xfs_mount *mp, + uint32_t used_blocks) +{ + return XFS_ZONE_USED_BUCKETS * used_blocks / + mp->m_groups[XG_TYPE_RTG].blocks; +} + +static inline void +xfs_zone_add_to_bucket( + struct xfs_zone_info *zi, + xfs_rgnumber_t rgno, + uint32_t to_bucket) +{ + __set_bit(rgno, zi->zi_used_bucket_bitmap[to_bucket]); + zi->zi_used_bucket_entries[to_bucket]++; +} + +static inline void +xfs_zone_remove_from_bucket( + struct xfs_zone_info *zi, + xfs_rgnumber_t rgno, + uint32_t from_bucket) +{ + __clear_bit(rgno, zi->zi_used_bucket_bitmap[from_bucket]); + zi->zi_used_bucket_entries[from_bucket]--; +} + +static void +xfs_zone_account_reclaimable( + struct xfs_rtgroup *rtg, + uint32_t freed) +{ + struct xfs_group *xg = &rtg->rtg_group; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + uint32_t used = rtg_rmap(rtg)->i_used_blocks; + xfs_rgnumber_t rgno = rtg_rgno(rtg); + uint32_t from_bucket = xfs_zone_bucket(mp, used + freed); + uint32_t to_bucket = xfs_zone_bucket(mp, used); + bool was_full = (used + freed == rtg_blocks(rtg)); + + /* + * This can be called from log recovery, 
where the zone_info structure + * hasn't been allocated yet. Skip all work as xfs_mount_zones will + * add the zones to the right buckets before the file systems becomes + * active. + */ + if (!zi) + return; + + if (!used) { + /* + * The zone is now empty, remove it from the bottom bucket and + * trigger a reset. + */ + trace_xfs_zone_emptied(rtg); + + if (!was_full) + xfs_group_clear_mark(xg, XFS_RTG_RECLAIMABLE); + + spin_lock(&zi->zi_used_buckets_lock); + if (!was_full) + xfs_zone_remove_from_bucket(zi, rgno, from_bucket); + spin_unlock(&zi->zi_used_buckets_lock); + + spin_lock(&zi->zi_reset_list_lock); + xg->xg_next_reset = zi->zi_reset_list; + zi->zi_reset_list = xg; + spin_unlock(&zi->zi_reset_list_lock); + + if (zi->zi_gc_thread) + wake_up_process(zi->zi_gc_thread); + } else if (was_full) { + /* + * The zone transitioned from full, mark it up as reclaimable + * and wake up GC which might be waiting for zones to reclaim. + */ + spin_lock(&zi->zi_used_buckets_lock); + xfs_zone_add_to_bucket(zi, rgno, to_bucket); + spin_unlock(&zi->zi_used_buckets_lock); + + xfs_group_set_mark(xg, XFS_RTG_RECLAIMABLE); + if (zi->zi_gc_thread && xfs_zoned_need_gc(mp)) + wake_up_process(zi->zi_gc_thread); + } else if (to_bucket != from_bucket) { + /* + * Move the zone to a new bucket if it dropped below the + * threshold. + */ + spin_lock(&zi->zi_used_buckets_lock); + xfs_zone_add_to_bucket(zi, rgno, to_bucket); + xfs_zone_remove_from_bucket(zi, rgno, from_bucket); + spin_unlock(&zi->zi_used_buckets_lock); + } +} + +static void +xfs_open_zone_mark_full( + struct xfs_open_zone *oz) +{ + struct xfs_rtgroup *rtg = oz->oz_rtg; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + uint32_t used = rtg_rmap(rtg)->i_used_blocks; + + trace_xfs_zone_full(rtg); + + WRITE_ONCE(rtg->rtg_open_zone, NULL); + + spin_lock(&zi->zi_open_zones_lock); + if (oz->oz_is_gc) { + ASSERT(current == zi->zi_gc_thread); + zi->zi_open_gc_zone = NULL; + } else { + zi->zi_nr_open_zones--; + list_del_init(&oz->oz_entry); + } + spin_unlock(&zi->zi_open_zones_lock); + xfs_open_zone_put(oz); + + wake_up_all(&zi->zi_zone_wait); + if (used < rtg_blocks(rtg)) + xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used); +} + +static void +xfs_zone_record_blocks( + struct xfs_trans *tp, + xfs_fsblock_t fsbno, + xfs_filblks_t len, + struct xfs_open_zone *oz, + bool used) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rtgroup *rtg = oz->oz_rtg; + struct xfs_inode *rmapip = rtg_rmap(rtg); + + trace_xfs_zone_record_blocks(oz, xfs_rtb_to_rgbno(mp, fsbno), len); + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); + if (used) { + rmapip->i_used_blocks += len; + ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg)); + } else { + xfs_add_frextents(mp, len); + } + oz->oz_written += len; + if (oz->oz_written == rtg_blocks(rtg)) + xfs_open_zone_mark_full(oz); + xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); +} + +static int +xfs_zoned_map_extent( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_bmbt_irec *new, + struct xfs_open_zone *oz, + xfs_fsblock_t old_startblock) +{ + struct xfs_bmbt_irec data; + int nmaps = 1; + int error; + + /* Grab the corresponding mapping in the data fork. */ + error = xfs_bmapi_read(ip, new->br_startoff, new->br_blockcount, &data, + &nmaps, 0); + if (error) + return error; + + /* + * Cap the update to the existing extent in the data fork because we can + * only overwrite one extent at a time. 
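+	 * If the new extent spans more than one data fork extent, the
+	 * remainder is picked up by the next loop iteration in
+	 * xfs_zoned_end_io.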
+ */ + ASSERT(new->br_blockcount >= data.br_blockcount); + new->br_blockcount = data.br_blockcount; + + /* + * If a data write raced with this GC write, keep the existing data in + * the data fork, mark our newly written GC extent as reclaimable, then + * move on to the next extent. + */ + if (old_startblock != NULLFSBLOCK && + old_startblock != data.br_startblock) + goto skip; + + trace_xfs_reflink_cow_remap_from(ip, new); + trace_xfs_reflink_cow_remap_to(ip, &data); + + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, + XFS_IEXT_REFLINK_END_COW_CNT); + if (error) + return error; + + if (data.br_startblock != HOLESTARTBLOCK) { + ASSERT(data.br_startblock != DELAYSTARTBLOCK); + ASSERT(!isnullstartblock(data.br_startblock)); + + xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data); + if (xfs_is_reflink_inode(ip)) { + xfs_refcount_decrease_extent(tp, true, &data); + } else { + error = xfs_free_extent_later(tp, data.br_startblock, + data.br_blockcount, NULL, + XFS_AG_RESV_NONE, + XFS_FREE_EXTENT_REALTIME); + if (error) + return error; + } + } + + xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz, + true); + + /* Map the new blocks into the data fork. */ + xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new); + return 0; + +skip: + trace_xfs_reflink_cow_remap_skip(ip, new); + xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz, + false); + return 0; +} + +int +xfs_zoned_end_io( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count, + xfs_daddr_t daddr, + struct xfs_open_zone *oz, + xfs_fsblock_t old_startblock) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); + struct xfs_bmbt_irec new = { + .br_startoff = XFS_B_TO_FSBT(mp, offset), + .br_startblock = xfs_daddr_to_rtb(mp, daddr), + .br_state = XFS_EXT_NORM, + }; + unsigned int resblks = + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + struct xfs_trans *tp; + int error; + + if (xfs_is_shutdown(mp)) + return -EIO; + + while (new.br_startoff < end_fsb) { + new.br_blockcount = end_fsb - new.br_startoff; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, + XFS_TRANS_RESERVE | XFS_TRANS_RES_FDBLKS, &tp); + if (error) + return error; + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + error = xfs_zoned_map_extent(tp, ip, &new, oz, old_startblock); + if (error) + xfs_trans_cancel(tp); + else + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + return error; + + new.br_startoff += new.br_blockcount; + new.br_startblock += new.br_blockcount; + if (old_startblock != NULLFSBLOCK) + old_startblock += new.br_blockcount; + } + + return 0; +} + +/* + * "Free" blocks allocated in a zone. + * + * Just decrement the used blocks counter and report the space as freed. + */ +int +xfs_zone_free_blocks( + struct xfs_trans *tp, + struct xfs_rtgroup *rtg, + xfs_fsblock_t fsbno, + xfs_filblks_t len) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *rmapip = rtg_rmap(rtg); + + xfs_assert_ilocked(rmapip, XFS_ILOCK_EXCL); + + if (len > rmapip->i_used_blocks) { + xfs_err(mp, +"trying to free more blocks (%lld) than used counter (%u).", + len, rmapip->i_used_blocks); + ASSERT(len <= rmapip->i_used_blocks); + xfs_rtginode_mark_sick(rtg, XFS_RTGI_RMAP); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return -EFSCORRUPTED; + } + + trace_xfs_zone_free_blocks(rtg, xfs_rtb_to_rgbno(mp, fsbno), len); + + rmapip->i_used_blocks -= len; + /* + * Don't add open zones to the reclaimable buckets. 
The I/O completion + * for writing the last block will take care of accounting for already + * unused blocks instead. + */ + if (!READ_ONCE(rtg->rtg_open_zone)) + xfs_zone_account_reclaimable(rtg, len); + xfs_add_frextents(mp, len); + xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); + return 0; +} + +/* + * Check if the zone containing the data just before the offset we are + * writing to is still open and has space. + */ +static struct xfs_open_zone * +xfs_last_used_zone( + struct iomap_ioend *ioend) +{ + struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset); + struct xfs_rtgroup *rtg = NULL; + struct xfs_open_zone *oz = NULL; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb, + &icur, &got)) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return NULL; + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock)); + if (!rtg) + return NULL; + + xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_SHARED); + oz = READ_ONCE(rtg->rtg_open_zone); + if (oz && (oz->oz_is_gc || !atomic_inc_not_zero(&oz->oz_ref))) + oz = NULL; + xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_SHARED); + + xfs_rtgroup_rele(rtg); + return oz; +} + +static struct xfs_group * +xfs_find_free_zone( + struct xfs_mount *mp, + unsigned long start, + unsigned long end) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + XA_STATE (xas, &mp->m_groups[XG_TYPE_RTG].xa, start); + struct xfs_group *xg; + + xas_lock(&xas); + xas_for_each_marked(&xas, xg, end, XFS_RTG_FREE) + if (atomic_inc_not_zero(&xg->xg_active_ref)) + goto found; + xas_unlock(&xas); + return NULL; + +found: + xas_clear_mark(&xas, XFS_RTG_FREE); + atomic_dec(&zi->zi_nr_free_zones); + zi->zi_free_zone_cursor = xg->xg_gno; + xas_unlock(&xas); + return xg; +} + +static struct xfs_open_zone * +xfs_init_open_zone( + struct xfs_rtgroup *rtg, + xfs_rgblock_t write_pointer, + enum rw_hint write_hint, + bool is_gc) +{ + struct xfs_open_zone *oz; + + oz = kzalloc(sizeof(*oz), GFP_NOFS | __GFP_NOFAIL); + spin_lock_init(&oz->oz_alloc_lock); + atomic_set(&oz->oz_ref, 1); + oz->oz_rtg = rtg; + oz->oz_write_pointer = write_pointer; + oz->oz_written = write_pointer; + oz->oz_write_hint = write_hint; + oz->oz_is_gc = is_gc; + + /* + * All dereferences of rtg->rtg_open_zone hold the ILOCK for the rmap + * inode, but we don't really want to take that here because we are + * under the zone_list_lock. Ensure the pointer is only set for a fully + * initialized open zone structure so that a racy lookup finding it is + * fine. + */ + WRITE_ONCE(rtg->rtg_open_zone, oz); + return oz; +} + +/* + * Find a completely free zone, open it, and return a reference. 
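+ * The lookup starts at zi_free_zone_cursor and wraps around to zone
+ * zero, so reuse rotates through the free zones instead of always
+ * handing out the lowest-numbered one.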
+ */ +struct xfs_open_zone * +xfs_open_zone( + struct xfs_mount *mp, + enum rw_hint write_hint, + bool is_gc) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_group *xg; + + xg = xfs_find_free_zone(mp, zi->zi_free_zone_cursor, ULONG_MAX); + if (!xg) + xg = xfs_find_free_zone(mp, 0, zi->zi_free_zone_cursor); + if (!xg) + return NULL; + + set_current_state(TASK_RUNNING); + return xfs_init_open_zone(to_rtg(xg), 0, write_hint, is_gc); +} + +static struct xfs_open_zone * +xfs_try_open_zone( + struct xfs_mount *mp, + enum rw_hint write_hint) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_open_zone *oz; + + if (zi->zi_nr_open_zones >= mp->m_max_open_zones - XFS_OPEN_GC_ZONES) + return NULL; + if (atomic_read(&zi->zi_nr_free_zones) < + XFS_GC_ZONES - XFS_OPEN_GC_ZONES) + return NULL; + + /* + * Increment the open zone count to reserve our slot before dropping + * zi_open_zones_lock. + */ + zi->zi_nr_open_zones++; + spin_unlock(&zi->zi_open_zones_lock); + oz = xfs_open_zone(mp, write_hint, false); + spin_lock(&zi->zi_open_zones_lock); + if (!oz) { + zi->zi_nr_open_zones--; + return NULL; + } + + atomic_inc(&oz->oz_ref); + list_add_tail(&oz->oz_entry, &zi->zi_open_zones); + + /* + * If this was the last free zone, other waiters might be waiting + * on us to write to it as well. + */ + wake_up_all(&zi->zi_zone_wait); + + if (xfs_zoned_need_gc(mp)) + wake_up_process(zi->zi_gc_thread); + + trace_xfs_zone_opened(oz->oz_rtg); + return oz; +} + +/* + * For data with short or medium lifetime, try to colocated it into an + * already open zone with a matching temperature. + */ +static bool +xfs_colocate_eagerly( + enum rw_hint file_hint) +{ + switch (file_hint) { + case WRITE_LIFE_MEDIUM: + case WRITE_LIFE_SHORT: + case WRITE_LIFE_NONE: + return true; + default: + return false; + } +} + +static bool +xfs_good_hint_match( + struct xfs_open_zone *oz, + enum rw_hint file_hint) +{ + switch (oz->oz_write_hint) { + case WRITE_LIFE_LONG: + case WRITE_LIFE_EXTREME: + /* colocate long and extreme */ + if (file_hint == WRITE_LIFE_LONG || + file_hint == WRITE_LIFE_EXTREME) + return true; + break; + case WRITE_LIFE_MEDIUM: + /* colocate medium with medium */ + if (file_hint == WRITE_LIFE_MEDIUM) + return true; + break; + case WRITE_LIFE_SHORT: + case WRITE_LIFE_NONE: + case WRITE_LIFE_NOT_SET: + /* colocate short and none */ + if (file_hint <= WRITE_LIFE_SHORT) + return true; + break; + } + return false; +} + +static bool +xfs_try_use_zone( + struct xfs_zone_info *zi, + enum rw_hint file_hint, + struct xfs_open_zone *oz, + bool lowspace) +{ + if (oz->oz_write_pointer == rtg_blocks(oz->oz_rtg)) + return false; + if (!lowspace && !xfs_good_hint_match(oz, file_hint)) + return false; + if (!atomic_inc_not_zero(&oz->oz_ref)) + return false; + + /* + * If we have a hint set for the data, use that for the zone even if + * some data was written already without any hint set, but don't change + * the temperature after that as that would make little sense without + * tracking per-temperature class written block counts, which is + * probably overkill anyway. + */ + if (file_hint != WRITE_LIFE_NOT_SET && + oz->oz_write_hint == WRITE_LIFE_NOT_SET) + oz->oz_write_hint = file_hint; + + /* + * If we couldn't match by inode or life time we just pick the first + * zone with enough space above. For that we want the least busy zone + * for some definition of "least" busy. For now this simple LRU + * algorithm that rotates every zone to the end of the list will do it, + * even if it isn't exactly cache friendly. 
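+	 * (The head of zi_open_zones is therefore the least recently used
+	 * zone, which xfs_select_open_zone_lru tries first.)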
+ */ + if (!list_is_last(&oz->oz_entry, &zi->zi_open_zones)) + list_move_tail(&oz->oz_entry, &zi->zi_open_zones); + return true; +} + +static struct xfs_open_zone * +xfs_select_open_zone_lru( + struct xfs_zone_info *zi, + enum rw_hint file_hint, + bool lowspace) +{ + struct xfs_open_zone *oz; + + lockdep_assert_held(&zi->zi_open_zones_lock); + + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) + if (xfs_try_use_zone(zi, file_hint, oz, lowspace)) + return oz; + + cond_resched_lock(&zi->zi_open_zones_lock); + return NULL; +} + +static struct xfs_open_zone * +xfs_select_open_zone_mru( + struct xfs_zone_info *zi, + enum rw_hint file_hint) +{ + struct xfs_open_zone *oz; + + lockdep_assert_held(&zi->zi_open_zones_lock); + + list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry) + if (xfs_try_use_zone(zi, file_hint, oz, false)) + return oz; + + cond_resched_lock(&zi->zi_open_zones_lock); + return NULL; +} + +static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip) +{ + if (xfs_has_nolifetime(ip->i_mount)) + return WRITE_LIFE_NOT_SET; + return VFS_I(ip)->i_write_hint; +} + +/* + * Try to pack inodes that are written back after they were closed tight instead + * of trying to open new zones for them or spread them to the least recently + * used zone. This optimizes the data layout for workloads that untar or copy + * a lot of small files. Right now this does not separate multiple such + * streams. + */ +static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip) +{ + return !inode_is_open_for_write(VFS_I(ip)) && + !(ip->i_diflags & XFS_DIFLAG_APPEND); +} + +/* + * Pick a new zone for writes. + * + * If we aren't using up our budget of open zones just open a new one from the + * freelist. Else try to find one that matches the expected data lifetime. If + * we don't find one that is good pick any zone that is available. + */ +static struct xfs_open_zone * +xfs_select_zone_nowait( + struct xfs_mount *mp, + enum rw_hint write_hint, + bool pack_tight) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_open_zone *oz = NULL; + + if (xfs_is_shutdown(mp)) + return NULL; + + /* + * Try to fill up open zones with matching temperature if available. It + * is better to try to co-locate data when this is favorable, so we can + * activate empty zones when it is statistically better to separate + * data. + */ + spin_lock(&zi->zi_open_zones_lock); + if (xfs_colocate_eagerly(write_hint)) + oz = xfs_select_open_zone_lru(zi, write_hint, false); + else if (pack_tight) + oz = xfs_select_open_zone_mru(zi, write_hint); + if (oz) + goto out_unlock; + + /* + * See if we can open a new zone and use that. + */ + oz = xfs_try_open_zone(mp, write_hint); + if (oz) + goto out_unlock; + + /* + * Try to colocate cold data with other cold data if we failed to open a + * new zone for it. 
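+	 * The final fallback sets the lowspace argument, which accepts any
+	 * open zone that still has space left, regardless of its temperature.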
+ */ + if (write_hint != WRITE_LIFE_NOT_SET && + !xfs_colocate_eagerly(write_hint)) + oz = xfs_select_open_zone_lru(zi, write_hint, false); + if (!oz) + oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false); + if (!oz) + oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true); +out_unlock: + spin_unlock(&zi->zi_open_zones_lock); + return oz; +} + +static struct xfs_open_zone * +xfs_select_zone( + struct xfs_mount *mp, + enum rw_hint write_hint, + bool pack_tight) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + DEFINE_WAIT (wait); + struct xfs_open_zone *oz; + + oz = xfs_select_zone_nowait(mp, write_hint, pack_tight); + if (oz) + return oz; + + for (;;) { + prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE); + oz = xfs_select_zone_nowait(mp, write_hint, pack_tight); + if (oz) + break; + schedule(); + } + finish_wait(&zi->zi_zone_wait, &wait); + return oz; +} + +static unsigned int +xfs_zone_alloc_blocks( + struct xfs_open_zone *oz, + xfs_filblks_t count_fsb, + sector_t *sector, + bool *is_seq) +{ + struct xfs_rtgroup *rtg = oz->oz_rtg; + struct xfs_mount *mp = rtg_mount(rtg); + xfs_rgblock_t rgbno; + + spin_lock(&oz->oz_alloc_lock); + count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN, + (xfs_filblks_t)rtg_blocks(rtg) - oz->oz_write_pointer); + if (!count_fsb) { + spin_unlock(&oz->oz_alloc_lock); + return 0; + } + rgbno = oz->oz_write_pointer; + oz->oz_write_pointer += count_fsb; + spin_unlock(&oz->oz_alloc_lock); + + trace_xfs_zone_alloc_blocks(oz, rgbno, count_fsb); + + *sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0); + *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *sector); + if (!*is_seq) + *sector += XFS_FSB_TO_BB(mp, rgbno); + return XFS_FSB_TO_B(mp, count_fsb); +} + +void +xfs_mark_rtg_boundary( + struct iomap_ioend *ioend) +{ + struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; + sector_t sector = ioend->io_bio.bi_iter.bi_sector; + + if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0) + ioend->io_flags |= IOMAP_IOEND_BOUNDARY; +} + +static void +xfs_submit_zoned_bio( + struct iomap_ioend *ioend, + struct xfs_open_zone *oz, + bool is_seq) +{ + ioend->io_bio.bi_iter.bi_sector = ioend->io_sector; + ioend->io_private = oz; + atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */ + + if (is_seq) { + ioend->io_bio.bi_opf &= ~REQ_OP_WRITE; + ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND; + } else { + xfs_mark_rtg_boundary(ioend); + } + + submit_bio(&ioend->io_bio); +} + +void +xfs_zone_alloc_and_submit( + struct iomap_ioend *ioend, + struct xfs_open_zone **oz) +{ + struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_mount *mp = ip->i_mount; + enum rw_hint write_hint = xfs_inode_write_hint(ip); + bool pack_tight = xfs_zoned_pack_tight(ip); + unsigned int alloc_len; + struct iomap_ioend *split; + bool is_seq; + + if (xfs_is_shutdown(mp)) + goto out_error; + + /* + * If we don't have a cached zone in this write context, see if the + * last extent before the one we are writing to points to an active + * zone. If so, just continue writing to it. 
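+	 * xfs_last_used_zone only returns the zone if it is still open and
+	 * not one of the zones reserved for garbage collection.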
+ */ + if (!*oz && ioend->io_offset) + *oz = xfs_last_used_zone(ioend); + if (!*oz) { +select_zone: + *oz = xfs_select_zone(mp, write_hint, pack_tight); + if (!*oz) + goto out_error; + } + + alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size), + &ioend->io_sector, &is_seq); + if (!alloc_len) { + xfs_open_zone_put(*oz); + goto select_zone; + } + + while ((split = iomap_split_ioend(ioend, alloc_len, is_seq))) { + if (IS_ERR(split)) + goto out_split_error; + alloc_len -= split->io_bio.bi_iter.bi_size; + xfs_submit_zoned_bio(split, *oz, is_seq); + if (!alloc_len) { + xfs_open_zone_put(*oz); + goto select_zone; + } + } + + xfs_submit_zoned_bio(ioend, *oz, is_seq); + return; + +out_split_error: + ioend->io_bio.bi_status = errno_to_blk_status(PTR_ERR(split)); +out_error: + bio_io_error(&ioend->io_bio); +} + +void +xfs_zoned_wake_all( + struct xfs_mount *mp) +{ + if (!(mp->m_super->s_flags & SB_ACTIVE)) + return; /* can happen during log recovery */ + wake_up_all(&mp->m_zone_info->zi_zone_wait); +} + +/* + * Check if @rgbno in @rgb is a potentially valid block. It might still be + * unused, but that information is only found in the rmap. + */ +bool +xfs_zone_rgbno_is_valid( + struct xfs_rtgroup *rtg, + xfs_rgnumber_t rgbno) +{ + lockdep_assert_held(&rtg_rmap(rtg)->i_lock); + + if (rtg->rtg_open_zone) + return rgbno < rtg->rtg_open_zone->oz_write_pointer; + return !xa_get_mark(&rtg_mount(rtg)->m_groups[XG_TYPE_RTG].xa, + rtg_rgno(rtg), XFS_RTG_FREE); +} + +static void +xfs_free_open_zones( + struct xfs_zone_info *zi) +{ + struct xfs_open_zone *oz; + + spin_lock(&zi->zi_open_zones_lock); + while ((oz = list_first_entry_or_null(&zi->zi_open_zones, + struct xfs_open_zone, oz_entry))) { + list_del(&oz->oz_entry); + xfs_open_zone_put(oz); + } + spin_unlock(&zi->zi_open_zones_lock); +} + +struct xfs_init_zones { + struct xfs_mount *mp; + uint64_t available; + uint64_t reclaimable; +}; + +static int +xfs_init_zone( + struct xfs_init_zones *iz, + struct xfs_rtgroup *rtg, + struct blk_zone *zone) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + uint64_t used = rtg_rmap(rtg)->i_used_blocks; + xfs_rgblock_t write_pointer, highest_rgbno; + int error; + + if (zone && !xfs_zone_validate(zone, rtg, &write_pointer)) + return -EFSCORRUPTED; + + /* + * For sequential write required zones we retrieved the hardware write + * pointer above. + * + * For conventional zones or conventional devices we don't have that + * luxury. Instead query the rmap to find the highest recorded block + * and set the write pointer to the block after that. In case of a + * power loss this misses blocks where the data I/O has completed but + * not recorded in the rmap yet, and it also rewrites blocks if the most + * recently written ones got deleted again before unmount, but this is + * the best we can do without hardware support. + */ + if (!zone || zone->cond == BLK_ZONE_COND_NOT_WP) { + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + highest_rgbno = xfs_rtrmap_highest_rgbno(rtg); + if (highest_rgbno == NULLRGBLOCK) + write_pointer = 0; + else + write_pointer = highest_rgbno + 1; + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + } + + /* + * If there are no used blocks, but the zone is not in empty state yet + * we lost power before the zoned reset. In that case finish the work + * here. 
+ */ + if (write_pointer == rtg_blocks(rtg) && used == 0) { + error = xfs_zone_gc_reset_sync(rtg); + if (error) + return error; + write_pointer = 0; + } + + if (write_pointer == 0) { + /* zone is empty */ + atomic_inc(&zi->zi_nr_free_zones); + xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE); + iz->available += rtg_blocks(rtg); + } else if (write_pointer < rtg_blocks(rtg)) { + /* zone is open */ + struct xfs_open_zone *oz; + + atomic_inc(&rtg_group(rtg)->xg_active_ref); + oz = xfs_init_open_zone(rtg, write_pointer, WRITE_LIFE_NOT_SET, + false); + list_add_tail(&oz->oz_entry, &zi->zi_open_zones); + zi->zi_nr_open_zones++; + + iz->available += (rtg_blocks(rtg) - write_pointer); + iz->reclaimable += write_pointer - used; + } else if (used < rtg_blocks(rtg)) { + /* zone fully written, but has freed blocks */ + xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used); + iz->reclaimable += (rtg_blocks(rtg) - used); + } + + return 0; +} + +static int +xfs_get_zone_info_cb( + struct blk_zone *zone, + unsigned int idx, + void *data) +{ + struct xfs_init_zones *iz = data; + struct xfs_mount *mp = iz->mp; + xfs_fsblock_t zsbno = xfs_daddr_to_rtb(mp, zone->start); + xfs_rgnumber_t rgno; + struct xfs_rtgroup *rtg; + int error; + + if (xfs_rtb_to_rgbno(mp, zsbno) != 0) { + xfs_warn(mp, "mismatched zone start 0x%llx.", zsbno); + return -EFSCORRUPTED; + } + + rgno = xfs_rtb_to_rgno(mp, zsbno); + rtg = xfs_rtgroup_grab(mp, rgno); + if (!rtg) { + xfs_warn(mp, "realtime group not found for zone %u.", rgno); + return -EFSCORRUPTED; + } + error = xfs_init_zone(iz, rtg, zone); + xfs_rtgroup_rele(rtg); + return error; +} + +/* + * Calculate the max open zone limit based on the of number of + * backing zones available + */ +static inline uint32_t +xfs_max_open_zones( + struct xfs_mount *mp) +{ + unsigned int max_open, max_open_data_zones; + /* + * We need two zones for every open data zone, + * one in reserve as we don't reclaim open zones. One data zone + * and its spare is included in XFS_MIN_ZONES. + */ + max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1; + max_open = max_open_data_zones + XFS_OPEN_GC_ZONES; + + /* + * Cap the max open limit to 1/4 of available space + */ + max_open = min(max_open, mp->m_sb.sb_rgcount / 4); + + return max(XFS_MIN_OPEN_ZONES, max_open); +} + +/* + * Normally we use the open zone limit that the device reports. If there is + * none let the user pick one from the command line. + * + * If the device doesn't report an open zone limit and there is no override, + * allow to hold about a quarter of the zones open. In theory we could allow + * all to be open, but at that point we run into GC deadlocks because we can't + * reclaim open zones. + * + * When used on conventional SSDs a lower open limit is advisable as we'll + * otherwise overwhelm the FTL just as much as a conventional block allocator. + * + * Note: To debug the open zone management code, force max_open to 1 here. 
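+ * The max_open_zones mount option overrides the computed default, but
+ * the result is always clamped to the hardware open zone limit and to
+ * xfs_max_open_zones().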
+ */ +static int +xfs_calc_open_zones( + struct xfs_mount *mp) +{ + struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; + unsigned int bdev_open_zones = bdev_max_open_zones(bdev); + + if (!mp->m_max_open_zones) { + if (bdev_open_zones) + mp->m_max_open_zones = bdev_open_zones; + else + mp->m_max_open_zones = xfs_max_open_zones(mp); + } + + if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) { + xfs_notice(mp, "need at least %u open zones.", + XFS_MIN_OPEN_ZONES); + return -EIO; + } + + if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) { + mp->m_max_open_zones = bdev_open_zones; + xfs_info(mp, "limiting open zones to %u due to hardware limit.\n", + bdev_open_zones); + } + + if (mp->m_max_open_zones > xfs_max_open_zones(mp)) { + mp->m_max_open_zones = xfs_max_open_zones(mp); + xfs_info(mp, +"limiting open zones to %u due to total zone count (%u)", + mp->m_max_open_zones, mp->m_sb.sb_rgcount); + } + + return 0; +} + +static unsigned long * +xfs_alloc_bucket_bitmap( + struct xfs_mount *mp) +{ + return kvmalloc_array(BITS_TO_LONGS(mp->m_sb.sb_rgcount), + sizeof(unsigned long), GFP_KERNEL | __GFP_ZERO); +} + +static struct xfs_zone_info * +xfs_alloc_zone_info( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi; + int i; + + zi = kzalloc(sizeof(*zi), GFP_KERNEL); + if (!zi) + return NULL; + INIT_LIST_HEAD(&zi->zi_open_zones); + INIT_LIST_HEAD(&zi->zi_reclaim_reservations); + spin_lock_init(&zi->zi_reset_list_lock); + spin_lock_init(&zi->zi_open_zones_lock); + spin_lock_init(&zi->zi_reservation_lock); + init_waitqueue_head(&zi->zi_zone_wait); + spin_lock_init(&zi->zi_used_buckets_lock); + for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) { + zi->zi_used_bucket_bitmap[i] = xfs_alloc_bucket_bitmap(mp); + if (!zi->zi_used_bucket_bitmap[i]) + goto out_free_bitmaps; + } + return zi; + +out_free_bitmaps: + while (--i > 0) + kvfree(zi->zi_used_bucket_bitmap[i]); + kfree(zi); + return NULL; +} + +static void +xfs_free_zone_info( + struct xfs_zone_info *zi) +{ + int i; + + xfs_free_open_zones(zi); + for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) + kvfree(zi->zi_used_bucket_bitmap[i]); + kfree(zi); +} + +int +xfs_mount_zones( + struct xfs_mount *mp) +{ + struct xfs_init_zones iz = { + .mp = mp, + }; + struct xfs_buftarg *bt = mp->m_rtdev_targp; + int error; + + if (!bt) { + xfs_notice(mp, "RT device missing."); + return -EINVAL; + } + + if (!xfs_has_rtgroups(mp) || !xfs_has_rmapbt(mp)) { + xfs_notice(mp, "invalid flag combination."); + return -EFSCORRUPTED; + } + if (mp->m_sb.sb_rextsize != 1) { + xfs_notice(mp, "zoned file systems do not support rextsize."); + return -EFSCORRUPTED; + } + if (mp->m_sb.sb_rgcount < XFS_MIN_ZONES) { + xfs_notice(mp, +"zoned file systems need to have at least %u zones.", XFS_MIN_ZONES); + return -EFSCORRUPTED; + } + + error = xfs_calc_open_zones(mp); + if (error) + return error; + + mp->m_zone_info = xfs_alloc_zone_info(mp); + if (!mp->m_zone_info) + return -ENOMEM; + + xfs_info(mp, "%u zones of %u blocks size (%u max open)", + mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks, + mp->m_max_open_zones); + trace_xfs_zones_mount(mp); + + if (bdev_is_zoned(bt->bt_bdev)) { + error = blkdev_report_zones(bt->bt_bdev, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart), + mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, &iz); + if (error < 0) + goto out_free_zone_info; + } else { + struct xfs_rtgroup *rtg = NULL; + + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + error = xfs_init_zone(&iz, rtg, NULL); + if (error) + goto out_free_zone_info; + } + } + + xfs_set_freecounter(mp, 
XC_FREE_RTAVAILABLE, iz.available); + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, + iz.available + iz.reclaimable); + + error = xfs_zone_gc_mount(mp); + if (error) + goto out_free_zone_info; + return 0; + +out_free_zone_info: + xfs_free_zone_info(mp->m_zone_info); + return error; +} + +void +xfs_unmount_zones( + struct xfs_mount *mp) +{ + xfs_zone_gc_unmount(mp); + xfs_free_zone_info(mp->m_zone_info); +} diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h new file mode 100644 index 000000000000..ecf39106704c --- /dev/null +++ b/fs/xfs/xfs_zone_alloc.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _XFS_ZONE_ALLOC_H +#define _XFS_ZONE_ALLOC_H + +struct iomap_ioend; +struct xfs_open_zone; + +struct xfs_zone_alloc_ctx { + struct xfs_open_zone *open_zone; + xfs_filblks_t reserved_blocks; +}; + +/* + * Grab any available space, even if it is less than what the caller asked for. + */ +#define XFS_ZR_GREEDY (1U << 0) +/* + * Only grab instantly available space, don't wait or GC. + */ +#define XFS_ZR_NOWAIT (1U << 1) +/* + * Dip into the reserved pool. + */ +#define XFS_ZR_RESERVED (1U << 2) + +int xfs_zoned_space_reserve(struct xfs_inode *ip, xfs_filblks_t count_fsb, + unsigned int flags, struct xfs_zone_alloc_ctx *ac); +void xfs_zoned_space_unreserve(struct xfs_inode *ip, + struct xfs_zone_alloc_ctx *ac); +void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb); + +void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend, + struct xfs_open_zone **oz); +int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg, + xfs_fsblock_t fsbno, xfs_filblks_t len); +int xfs_zoned_end_io(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count, + xfs_daddr_t daddr, struct xfs_open_zone *oz, + xfs_fsblock_t old_startblock); +void xfs_open_zone_put(struct xfs_open_zone *oz); + +void xfs_zoned_wake_all(struct xfs_mount *mp); +bool xfs_zone_rgbno_is_valid(struct xfs_rtgroup *rtg, xfs_rgnumber_t rgbno); +void xfs_mark_rtg_boundary(struct iomap_ioend *ioend); + +uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp, + enum xfs_free_counter ctr); +void xfs_zoned_show_stats(struct seq_file *m, struct xfs_mount *mp); + +#ifdef CONFIG_XFS_RT +int xfs_mount_zones(struct xfs_mount *mp); +void xfs_unmount_zones(struct xfs_mount *mp); +void xfs_zone_gc_start(struct xfs_mount *mp); +void xfs_zone_gc_stop(struct xfs_mount *mp); +#else +static inline int xfs_mount_zones(struct xfs_mount *mp) +{ + return -EIO; +} +static inline void xfs_unmount_zones(struct xfs_mount *mp) +{ +} +static inline void xfs_zone_gc_start(struct xfs_mount *mp) +{ +} +static inline void xfs_zone_gc_stop(struct xfs_mount *mp) +{ +} +#endif /* CONFIG_XFS_RT */ + +#endif /* _XFS_ZONE_ALLOC_H */ diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c new file mode 100644 index 000000000000..c5136ea9bb1d --- /dev/null +++ b/fs/xfs/xfs_zone_gc.c @@ -0,0 +1,1165 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. 
+ */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_trans.h" +#include "xfs_icache.h" +#include "xfs_rmap.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" +#include "xfs_zones.h" +#include "xfs_trace.h" + +/* + * Implement Garbage Collection (GC) of partially used zoned. + * + * To support the purely sequential writes in each zone, zoned XFS needs to be + * able to move data remaining in a zone out of it to reset the zone to prepare + * for writing to it again. + * + * This is done by the GC thread implemented in this file. To support that a + * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to + * write the garbage collected data into. + * + * Whenever the available space is below the chosen threshold, the GC thread + * looks for potential non-empty but not fully used zones that are worth + * reclaiming. Once found the rmap for the victim zone is queried, and after + * a bit of sorting to reduce fragmentation, the still live extents are read + * into memory and written to the GC target zone, and the bmap btree of the + * files is updated to point to the new location. To avoid taking the IOLOCK + * and MMAPLOCK for the entire GC process and thus affecting the latency of + * user reads and writes to the files, the GC writes are speculative and the + * I/O completion checks that no other writes happened for the affected regions + * before remapping. + * + * Once a zone does not contain any valid data, be that through GC or user + * block removal, it is queued for for a zone reset. The reset operation + * carefully ensures that the RT device cache is flushed and all transactions + * referencing the rmap have been committed to disk. + */ + +/* + * Size of each GC scratch pad. This is also the upper bound for each + * GC I/O, which helps to keep latency down. + */ +#define XFS_GC_CHUNK_SIZE SZ_1M + +/* + * Scratchpad data to read GCed data into. + * + * The offset member tracks where the next allocation starts, and freed tracks + * the amount of space that is not used anymore. + */ +#define XFS_ZONE_GC_NR_SCRATCH 2 +struct xfs_zone_scratch { + struct folio *folio; + unsigned int offset; + unsigned int freed; +}; + +/* + * Chunk that is read and written for each GC operation. + * + * Note that for writes to actual zoned devices, the chunk can be split when + * reaching the hardware limit. + */ +struct xfs_gc_bio { + struct xfs_zone_gc_data *data; + + /* + * Entry into the reading/writing/resetting list. Only accessed from + * the GC thread, so no locking needed. + */ + struct list_head entry; + + /* + * State of this gc_bio. Done means the current I/O completed. + * Set from the bio end I/O handler, read from the GC thread. + */ + enum { + XFS_GC_BIO_NEW, + XFS_GC_BIO_DONE, + } state; + + /* + * Pointer to the inode and byte range in the inode that this + * GC chunk is operating on. + */ + struct xfs_inode *ip; + loff_t offset; + unsigned int len; + + /* + * Existing startblock (in the zone to be freed) and newly assigned + * daddr in the zone GCed into. + */ + xfs_fsblock_t old_startblock; + xfs_daddr_t new_daddr; + struct xfs_zone_scratch *scratch; + + /* Are we writing to a sequential write required zone? 
*/ + bool is_seq; + + /* Open Zone being written to */ + struct xfs_open_zone *oz; + + /* Bio used for reads and writes, including the bvec used by it */ + struct bio_vec bv; + struct bio bio; /* must be last */ +}; + +#define XFS_ZONE_GC_RECS 1024 + +/* iterator, needs to be reinitialized for each victim zone */ +struct xfs_zone_gc_iter { + struct xfs_rtgroup *victim_rtg; + unsigned int rec_count; + unsigned int rec_idx; + xfs_agblock_t next_startblock; + struct xfs_rmap_irec *recs; +}; + +/* + * Per-mount GC state. + */ +struct xfs_zone_gc_data { + struct xfs_mount *mp; + + /* bioset used to allocate the gc_bios */ + struct bio_set bio_set; + + /* + * Scratchpad used, and index to indicated which one is used. + */ + struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH]; + unsigned int scratch_idx; + + /* + * List of bios currently being read, written and reset. + * These lists are only accessed by the GC thread itself, and must only + * be processed in order. + */ + struct list_head reading; + struct list_head writing; + struct list_head resetting; + + /* + * Iterator for the victim zone. + */ + struct xfs_zone_gc_iter iter; +}; + +/* + * We aim to keep enough zones free in stock to fully use the open zone limit + * for data placement purposes. + */ +bool +xfs_zoned_need_gc( + struct xfs_mount *mp) +{ + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE)) + return false; + if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) < + mp->m_groups[XG_TYPE_RTG].blocks * + (mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) + return true; + return false; +} + +static struct xfs_zone_gc_data * +xfs_zone_gc_data_alloc( + struct xfs_mount *mp) +{ + struct xfs_zone_gc_data *data; + int i; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return NULL; + data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs), + GFP_KERNEL); + if (!data->iter.recs) + goto out_free_data; + + /* + * We actually only need a single bio_vec. It would be nice to have + * a flag that only allocates the inline bvecs and not the separate + * bvec pool. + */ + if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio), + BIOSET_NEED_BVECS)) + goto out_free_recs; + for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) { + data->scratch[i].folio = + folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE)); + if (!data->scratch[i].folio) + goto out_free_scratch; + } + INIT_LIST_HEAD(&data->reading); + INIT_LIST_HEAD(&data->writing); + INIT_LIST_HEAD(&data->resetting); + data->mp = mp; + return data; + +out_free_scratch: + while (--i >= 0) + folio_put(data->scratch[i].folio); + bioset_exit(&data->bio_set); +out_free_recs: + kfree(data->iter.recs); +out_free_data: + kfree(data); + return NULL; +} + +static void +xfs_zone_gc_data_free( + struct xfs_zone_gc_data *data) +{ + int i; + + for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) + folio_put(data->scratch[i].folio); + bioset_exit(&data->bio_set); + kfree(data->iter.recs); + kfree(data); +} + +static void +xfs_zone_gc_iter_init( + struct xfs_zone_gc_iter *iter, + struct xfs_rtgroup *victim_rtg) + +{ + iter->next_startblock = 0; + iter->rec_count = 0; + iter->rec_idx = 0; + iter->victim_rtg = victim_rtg; +} + +/* + * Query the rmap of the victim zone to gather the records to evacuate. 
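[Editor's sketch] Referring back to xfs_zoned_need_gc() above: GC is triggered once the instantly writable space drops below one full zone per user-visible open zone. A standalone userspace arithmetic sketch, with invented numbers (zone capacity, open-zone limit and GC-zone count are all assumptions here):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t blocks_per_zone = 65536;       /* assumed zone capacity in fs blocks */
        unsigned int max_open_zones = 6;        /* assumed open zone limit */
        unsigned int gc_zones = 1;              /* stands in for XFS_OPEN_GC_ZONES */
        uint64_t threshold = blocks_per_zone * (max_open_zones - gc_zones);

        printf("GC starts once fewer than %llu blocks are instantly writable\n",
               (unsigned long long)threshold);
        return 0;
}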
+ */ +static int +xfs_zone_gc_query_cb( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *irec, + void *private) +{ + struct xfs_zone_gc_iter *iter = private; + + ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner)); + ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner)); + ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))); + + iter->recs[iter->rec_count] = *irec; + if (++iter->rec_count == XFS_ZONE_GC_RECS) { + iter->next_startblock = + irec->rm_startblock + irec->rm_blockcount; + return 1; + } + return 0; +} + +#define cmp_int(l, r) ((l > r) - (l < r)) + +static int +xfs_zone_gc_rmap_rec_cmp( + const void *a, + const void *b) +{ + const struct xfs_rmap_irec *reca = a; + const struct xfs_rmap_irec *recb = b; + int diff; + + diff = cmp_int(reca->rm_owner, recb->rm_owner); + if (diff) + return diff; + return cmp_int(reca->rm_offset, recb->rm_offset); +} + +static int +xfs_zone_gc_query( + struct xfs_mount *mp, + struct xfs_zone_gc_iter *iter) +{ + struct xfs_rtgroup *rtg = iter->victim_rtg; + struct xfs_rmap_irec ri_low = { }; + struct xfs_rmap_irec ri_high; + struct xfs_btree_cur *cur; + struct xfs_trans *tp; + int error; + + ASSERT(iter->next_startblock <= rtg_blocks(rtg)); + if (iter->next_startblock == rtg_blocks(rtg)) + goto done; + + ASSERT(iter->next_startblock < rtg_blocks(rtg)); + ri_low.rm_startblock = iter->next_startblock; + memset(&ri_high, 0xFF, sizeof(ri_high)); + + iter->rec_idx = 0; + iter->rec_count = 0; + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return error; + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + cur = xfs_rtrmapbt_init_cursor(tp, rtg); + error = xfs_rmap_query_range(cur, &ri_low, &ri_high, + xfs_zone_gc_query_cb, iter); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + xfs_btree_del_cursor(cur, error < 0 ? error : 0); + xfs_trans_cancel(tp); + + if (error < 0) + return error; + + /* + * Sort the rmap records by inode number and increasing offset to + * defragment the mappings. + * + * This could be further enhanced by an even bigger look ahead window, + * but that's better left until we have better detection of changes to + * inode mapping to avoid the potential of GCing already dead data. + */ + sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]), + xfs_zone_gc_rmap_rec_cmp, NULL); + + if (error == 0) { + /* + * We finished iterating through the zone. + */ + iter->next_startblock = rtg_blocks(rtg); + if (iter->rec_count == 0) + goto done; + } + + return 0; +done: + xfs_rtgroup_rele(iter->victim_rtg); + iter->victim_rtg = NULL; + return 0; +} + +static bool +xfs_zone_gc_iter_next( + struct xfs_mount *mp, + struct xfs_zone_gc_iter *iter, + struct xfs_rmap_irec *chunk_rec, + struct xfs_inode **ipp) +{ + struct xfs_rmap_irec *irec; + int error; + + if (!iter->victim_rtg) + return false; + +retry: + if (iter->rec_idx == iter->rec_count) { + error = xfs_zone_gc_query(mp, iter); + if (error) + goto fail; + if (!iter->victim_rtg) + return false; + } + + irec = &iter->recs[iter->rec_idx]; + error = xfs_iget(mp, NULL, irec->rm_owner, + XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp); + if (error) { + /* + * If the inode was already deleted, skip over it. 
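[Editor's sketch] The cmp_int() comparator above orders rmap records first by owner inode and then by file offset, so data belonging to one file is rewritten contiguously. A standalone userspace model (not kernel code) of that two-level ordering:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define cmp_int(l, r) (((l) > (r)) - ((l) < (r)))

struct rec { uint64_t owner; uint64_t offset; };

static int rec_cmp(const void *a, const void *b)
{
        const struct rec *ra = a, *rb = b;
        int diff = cmp_int(ra->owner, rb->owner);

        return diff ? diff : cmp_int(ra->offset, rb->offset);
}

int main(void)
{
        struct rec recs[] = { {2, 8}, {1, 16}, {2, 0}, {1, 0} };
        size_t i;

        qsort(recs, 4, sizeof(recs[0]), rec_cmp);
        for (i = 0; i < 4; i++)
                printf("owner %llu offset %llu\n",
                       (unsigned long long)recs[i].owner,
                       (unsigned long long)recs[i].offset);
        return 0;       /* prints 1/0, 1/16, 2/0, 2/8 */
}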
+ */ + if (error == -ENOENT) { + iter->rec_idx++; + goto retry; + } + goto fail; + } + + if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) { + iter->rec_idx++; + xfs_irele(*ipp); + goto retry; + } + + *chunk_rec = *irec; + return true; + +fail: + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + return false; +} + +static void +xfs_zone_gc_iter_advance( + struct xfs_zone_gc_iter *iter, + xfs_extlen_t count_fsb) +{ + struct xfs_rmap_irec *irec = &iter->recs[iter->rec_idx]; + + irec->rm_offset += count_fsb; + irec->rm_startblock += count_fsb; + irec->rm_blockcount -= count_fsb; + if (!irec->rm_blockcount) + iter->rec_idx++; +} + +static struct xfs_rtgroup * +xfs_zone_gc_pick_victim_from( + struct xfs_mount *mp, + uint32_t bucket) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + uint32_t victim_used = U32_MAX; + struct xfs_rtgroup *victim_rtg = NULL; + uint32_t bit; + + if (!zi->zi_used_bucket_entries[bucket]) + return NULL; + + for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket], + mp->m_sb.sb_rgcount) { + struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit); + + if (!rtg) + continue; + + /* skip zones that are just waiting for a reset */ + if (rtg_rmap(rtg)->i_used_blocks == 0 || + rtg_rmap(rtg)->i_used_blocks >= victim_used) { + xfs_rtgroup_rele(rtg); + continue; + } + + if (victim_rtg) + xfs_rtgroup_rele(victim_rtg); + victim_rtg = rtg; + victim_used = rtg_rmap(rtg)->i_used_blocks; + + /* + * Any zone that is less than 1 percent used is fair game for + * instant reclaim. All of these zones are in the last + * bucket, so avoid the expensive division for the zones + * in the other buckets. + */ + if (bucket == 0 && + rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100) + break; + } + + return victim_rtg; +} + +/* + * Iterate through all zones marked as reclaimable and find a candidate to + * reclaim. + */ +static bool +xfs_zone_gc_select_victim( + struct xfs_zone_gc_data *data) +{ + struct xfs_zone_gc_iter *iter = &data->iter; + struct xfs_mount *mp = data->mp; + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_rtgroup *victim_rtg = NULL; + unsigned int bucket; + + if (xfs_is_shutdown(mp)) + return false; + + if (iter->victim_rtg) + return true; + + /* + * Don't start new work if we are asked to stop or park. + */ + if (kthread_should_stop() || kthread_should_park()) + return false; + + if (!xfs_zoned_need_gc(mp)) + return false; + + spin_lock(&zi->zi_used_buckets_lock); + for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) { + victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket); + if (victim_rtg) + break; + } + spin_unlock(&zi->zi_used_buckets_lock); + + if (!victim_rtg) + return false; + + trace_xfs_zone_gc_select_victim(victim_rtg, bucket); + xfs_zone_gc_iter_init(iter, victim_rtg); + return true; +} + +static struct xfs_open_zone * +xfs_zone_gc_steal_open( + struct xfs_zone_info *zi) +{ + struct xfs_open_zone *oz, *found = NULL; + + spin_lock(&zi->zi_open_zones_lock); + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) { + if (!found || + oz->oz_write_pointer < found->oz_write_pointer) + found = oz; + } + + if (found) { + found->oz_is_gc = true; + list_del_init(&found->oz_entry); + zi->zi_nr_open_zones--; + } + + spin_unlock(&zi->zi_open_zones_lock); + return found; +} + +static struct xfs_open_zone * +xfs_zone_gc_select_target( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_open_zone *oz = zi->zi_open_gc_zone; + + /* + * We need to wait for pending writes to finish. 
+ */ + if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg)) + return NULL; + + ASSERT(zi->zi_nr_open_zones <= + mp->m_max_open_zones - XFS_OPEN_GC_ZONES); + oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true); + if (oz) + trace_xfs_zone_gc_target_opened(oz->oz_rtg); + spin_lock(&zi->zi_open_zones_lock); + zi->zi_open_gc_zone = oz; + spin_unlock(&zi->zi_open_zones_lock); + return oz; +} + +/* + * Ensure we have a valid open zone to write the GC data to. + * + * If the current target zone has space keep writing to it, else first wait for + * all pending writes and then pick a new one. + */ +static struct xfs_open_zone * +xfs_zone_gc_ensure_target( + struct xfs_mount *mp) +{ + struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone; + + if (!oz || oz->oz_write_pointer == rtg_blocks(oz->oz_rtg)) + return xfs_zone_gc_select_target(mp); + return oz; +} + +static unsigned int +xfs_zone_gc_scratch_available( + struct xfs_zone_gc_data *data) +{ + return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset; +} + +static bool +xfs_zone_gc_space_available( + struct xfs_zone_gc_data *data) +{ + struct xfs_open_zone *oz; + + oz = xfs_zone_gc_ensure_target(data->mp); + if (!oz) + return false; + return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) && + xfs_zone_gc_scratch_available(data); +} + +static void +xfs_zone_gc_end_io( + struct bio *bio) +{ + struct xfs_gc_bio *chunk = + container_of(bio, struct xfs_gc_bio, bio); + struct xfs_zone_gc_data *data = chunk->data; + + WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE); + wake_up_process(data->mp->m_zone_info->zi_gc_thread); +} + +static struct xfs_open_zone * +xfs_zone_gc_alloc_blocks( + struct xfs_zone_gc_data *data, + xfs_extlen_t *count_fsb, + xfs_daddr_t *daddr, + bool *is_seq) +{ + struct xfs_mount *mp = data->mp; + struct xfs_open_zone *oz; + + oz = xfs_zone_gc_ensure_target(mp); + if (!oz) + return NULL; + + *count_fsb = min(*count_fsb, + XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data))); + + /* + * Directly allocate GC blocks from the reserved pool. + * + * If we'd take them from the normal pool we could be stealing blocks + * from a regular writer, which would then have to wait for GC and + * deadlock. 
+ */ + spin_lock(&mp->m_sb_lock); + *count_fsb = min(*count_fsb, + rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer); + *count_fsb = min3(*count_fsb, + mp->m_free[XC_FREE_RTEXTENTS].res_avail, + mp->m_free[XC_FREE_RTAVAILABLE].res_avail); + mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb; + mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb; + spin_unlock(&mp->m_sb_lock); + + if (!*count_fsb) + return NULL; + + *daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0); + *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr); + if (!*is_seq) + *daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer); + oz->oz_write_pointer += *count_fsb; + atomic_inc(&oz->oz_ref); + return oz; +} + +static bool +xfs_zone_gc_start_chunk( + struct xfs_zone_gc_data *data) +{ + struct xfs_zone_gc_iter *iter = &data->iter; + struct xfs_mount *mp = data->mp; + struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; + struct xfs_open_zone *oz; + struct xfs_rmap_irec irec; + struct xfs_gc_bio *chunk; + struct xfs_inode *ip; + struct bio *bio; + xfs_daddr_t daddr; + bool is_seq; + + if (xfs_is_shutdown(mp)) + return false; + + if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip)) + return false; + oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr, + &is_seq); + if (!oz) { + xfs_irele(ip); + return false; + } + + bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set); + + chunk = container_of(bio, struct xfs_gc_bio, bio); + chunk->ip = ip; + chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset); + chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount); + chunk->old_startblock = + xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock); + chunk->new_daddr = daddr; + chunk->is_seq = is_seq; + chunk->scratch = &data->scratch[data->scratch_idx]; + chunk->data = data; + chunk->oz = oz; + + bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock); + bio->bi_end_io = xfs_zone_gc_end_io; + bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len, + chunk->scratch->offset); + chunk->scratch->offset += chunk->len; + if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) { + data->scratch_idx = + (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH; + } + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); + list_add_tail(&chunk->entry, &data->reading); + xfs_zone_gc_iter_advance(iter, irec.rm_blockcount); + + submit_bio(bio); + return true; +} + +static void +xfs_zone_gc_free_chunk( + struct xfs_gc_bio *chunk) +{ + list_del(&chunk->entry); + xfs_open_zone_put(chunk->oz); + xfs_irele(chunk->ip); + bio_put(&chunk->bio); +} + +static void +xfs_zone_gc_submit_write( + struct xfs_zone_gc_data *data, + struct xfs_gc_bio *chunk) +{ + if (chunk->is_seq) { + chunk->bio.bi_opf &= ~REQ_OP_WRITE; + chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND; + } + chunk->bio.bi_iter.bi_sector = chunk->new_daddr; + chunk->bio.bi_end_io = xfs_zone_gc_end_io; + submit_bio(&chunk->bio); +} + +static struct xfs_gc_bio * +xfs_zone_gc_split_write( + struct xfs_zone_gc_data *data, + struct xfs_gc_bio *chunk) +{ + struct queue_limits *lim = + &bdev_get_queue(chunk->bio.bi_bdev)->limits; + struct xfs_gc_bio *split_chunk; + int split_sectors; + unsigned int split_len; + struct bio *split; + unsigned int nsegs; + + if (!chunk->is_seq) + return NULL; + + split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs, + lim->max_zone_append_sectors << SECTOR_SHIFT); + if (!split_sectors) + return NULL; + + /* ensure the split chunk is still block size aligned */ + split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT, + data->mp->m_sb.sb_blocksize) >> 
SECTOR_SHIFT; + split_len = split_sectors << SECTOR_SHIFT; + + split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set); + split_chunk = container_of(split, struct xfs_gc_bio, bio); + split_chunk->data = data; + ihold(VFS_I(chunk->ip)); + split_chunk->ip = chunk->ip; + split_chunk->is_seq = chunk->is_seq; + split_chunk->scratch = chunk->scratch; + split_chunk->offset = chunk->offset; + split_chunk->len = split_len; + split_chunk->old_startblock = chunk->old_startblock; + split_chunk->new_daddr = chunk->new_daddr; + split_chunk->oz = chunk->oz; + atomic_inc(&chunk->oz->oz_ref); + + chunk->offset += split_len; + chunk->len -= split_len; + chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len); + + /* add right before the original chunk */ + WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW); + list_add_tail(&split_chunk->entry, &chunk->entry); + return split_chunk; +} + +static void +xfs_zone_gc_write_chunk( + struct xfs_gc_bio *chunk) +{ + struct xfs_zone_gc_data *data = chunk->data; + struct xfs_mount *mp = chunk->ip->i_mount; + unsigned int folio_offset = chunk->bio.bi_io_vec->bv_offset; + struct xfs_gc_bio *split_chunk; + + if (chunk->bio.bi_status) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + if (xfs_is_shutdown(mp)) { + xfs_zone_gc_free_chunk(chunk); + return; + } + + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); + list_move_tail(&chunk->entry, &data->writing); + + bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE); + bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len, + folio_offset); + + while ((split_chunk = xfs_zone_gc_split_write(data, chunk))) + xfs_zone_gc_submit_write(data, split_chunk); + xfs_zone_gc_submit_write(data, chunk); +} + +static void +xfs_zone_gc_finish_chunk( + struct xfs_gc_bio *chunk) +{ + uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + struct xfs_inode *ip = chunk->ip; + struct xfs_mount *mp = ip->i_mount; + int error; + + if (chunk->bio.bi_status) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + if (xfs_is_shutdown(mp)) { + xfs_zone_gc_free_chunk(chunk); + return; + } + + chunk->scratch->freed += chunk->len; + if (chunk->scratch->freed == chunk->scratch->offset) { + chunk->scratch->offset = 0; + chunk->scratch->freed = 0; + } + + /* + * Cycle through the iolock and wait for direct I/O and layouts to + * ensure no one is reading from the old mapping before it goes away. + * + * Note that xfs_zoned_end_io() below checks that no other writer raced + * with us to update the mapping by checking that the old startblock + * didn't change. 
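[Editor's sketch] The split point computed in xfs_zone_gc_split_write() above is rounded down so each child write stays a whole number of file system blocks. A standalone userspace arithmetic check with invented values (the block size and the sector count returned by bio_split_rw_at() are assumptions):

#include <stdio.h>

#define SECTOR_SHIFT 9

static unsigned int align_down(unsigned int x, unsigned int a)
{
        return x - (x % a);     /* local stand-in for the kernel's ALIGN_DOWN() */
}

int main(void)
{
        unsigned int blocksize = 4096;          /* assumed sb_blocksize */
        unsigned int split_sectors = 1001;      /* assumed bio_split_rw_at() result */

        split_sectors = align_down(split_sectors << SECTOR_SHIFT, blocksize)
                        >> SECTOR_SHIFT;
        printf("aligned split: %u sectors\n", split_sectors);   /* prints 1000 */
        return 0;
}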
+ */ + xfs_ilock(ip, iolock); + error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP); + if (!error) + inode_dio_wait(VFS_I(ip)); + xfs_iunlock(ip, iolock); + if (error) + goto free; + + if (chunk->is_seq) + chunk->new_daddr = chunk->bio.bi_iter.bi_sector; + error = xfs_zoned_end_io(ip, chunk->offset, chunk->len, + chunk->new_daddr, chunk->oz, chunk->old_startblock); +free: + if (error) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + xfs_zone_gc_free_chunk(chunk); +} + +static void +xfs_zone_gc_finish_reset( + struct xfs_gc_bio *chunk) +{ + struct xfs_rtgroup *rtg = chunk->bio.bi_private; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + + if (chunk->bio.bi_status) { + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + goto out; + } + + xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE); + atomic_inc(&zi->zi_nr_free_zones); + + xfs_zoned_add_available(mp, rtg_blocks(rtg)); + + wake_up_all(&zi->zi_zone_wait); +out: + list_del(&chunk->entry); + bio_put(&chunk->bio); +} + +static bool +xfs_zone_gc_prepare_reset( + struct bio *bio, + struct xfs_rtgroup *rtg) +{ + trace_xfs_zone_reset(rtg); + + ASSERT(rtg_rmap(rtg)->i_used_blocks == 0); + bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0); + if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) { + if (!bdev_max_discard_sectors(bio->bi_bdev)) + return false; + bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC; + bio->bi_iter.bi_size = + XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg)); + } + + return true; +} + +int +xfs_zone_gc_reset_sync( + struct xfs_rtgroup *rtg) +{ + int error = 0; + struct bio bio; + + bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0, + REQ_OP_ZONE_RESET); + if (xfs_zone_gc_prepare_reset(&bio, rtg)) + error = submit_bio_wait(&bio); + bio_uninit(&bio); + + return error; +} + +static void +xfs_zone_gc_reset_zones( + struct xfs_zone_gc_data *data, + struct xfs_group *reset_list) +{ + struct xfs_group *next = reset_list; + + if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) { + xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR); + return; + } + + do { + struct xfs_rtgroup *rtg = to_rtg(next); + struct xfs_gc_bio *chunk; + struct bio *bio; + + xfs_log_force_inode(rtg_rmap(rtg)); + + next = rtg_group(rtg)->xg_next_reset; + rtg_group(rtg)->xg_next_reset = NULL; + + bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev, + 0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set); + bio->bi_private = rtg; + bio->bi_end_io = xfs_zone_gc_end_io; + + chunk = container_of(bio, struct xfs_gc_bio, bio); + chunk->data = data; + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); + list_add_tail(&chunk->entry, &data->resetting); + + /* + * Also use the bio to drive the state machine when neither + * zone reset nor discard is supported to keep things simple. + */ + if (xfs_zone_gc_prepare_reset(bio, rtg)) + submit_bio(bio); + else + bio_endio(bio); + } while (next); +} + +/* + * Handle the work to read and write data for GC and to reset the zones, + * including handling all completions. + * + * Note that the order of the chunks is preserved so that we don't undo the + * optimal order established by xfs_zone_gc_query(). 
+ */ +static bool +xfs_zone_gc_handle_work( + struct xfs_zone_gc_data *data) +{ + struct xfs_zone_info *zi = data->mp->m_zone_info; + struct xfs_gc_bio *chunk, *next; + struct xfs_group *reset_list; + struct blk_plug plug; + + spin_lock(&zi->zi_reset_list_lock); + reset_list = zi->zi_reset_list; + zi->zi_reset_list = NULL; + spin_unlock(&zi->zi_reset_list_lock); + + if (!xfs_zone_gc_select_victim(data) || + !xfs_zone_gc_space_available(data)) { + if (list_empty(&data->reading) && + list_empty(&data->writing) && + list_empty(&data->resetting) && + !reset_list) + return false; + } + + __set_current_state(TASK_RUNNING); + try_to_freeze(); + + if (reset_list) + xfs_zone_gc_reset_zones(data, reset_list); + + list_for_each_entry_safe(chunk, next, &data->resetting, entry) { + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) + break; + xfs_zone_gc_finish_reset(chunk); + } + + list_for_each_entry_safe(chunk, next, &data->writing, entry) { + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) + break; + xfs_zone_gc_finish_chunk(chunk); + } + + blk_start_plug(&plug); + list_for_each_entry_safe(chunk, next, &data->reading, entry) { + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) + break; + xfs_zone_gc_write_chunk(chunk); + } + blk_finish_plug(&plug); + + blk_start_plug(&plug); + while (xfs_zone_gc_start_chunk(data)) + ; + blk_finish_plug(&plug); + return true; +} + +/* + * Note that the current GC algorithm would break reflinks and thus duplicate + * data that was shared by multiple owners before. Because of that reflinks + * are currently not supported on zoned file systems and can't be created or + * mounted. + */ +static int +xfs_zoned_gcd( + void *private) +{ + struct xfs_zone_gc_data *data = private; + struct xfs_mount *mp = data->mp; + struct xfs_zone_info *zi = mp->m_zone_info; + unsigned int nofs_flag; + + nofs_flag = memalloc_nofs_save(); + set_freezable(); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE); + xfs_set_zonegc_running(mp); + if (xfs_zone_gc_handle_work(data)) + continue; + + if (list_empty(&data->reading) && + list_empty(&data->writing) && + list_empty(&data->resetting) && + !zi->zi_reset_list) { + xfs_clear_zonegc_running(mp); + xfs_zoned_resv_wake_all(mp); + + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + break; + } + + if (kthread_should_park()) { + __set_current_state(TASK_RUNNING); + kthread_parkme(); + continue; + } + } + + schedule(); + } + xfs_clear_zonegc_running(mp); + + if (data->iter.victim_rtg) + xfs_rtgroup_rele(data->iter.victim_rtg); + + memalloc_nofs_restore(nofs_flag); + xfs_zone_gc_data_free(data); + return 0; +} + +void +xfs_zone_gc_start( + struct xfs_mount *mp) +{ + if (xfs_has_zoned(mp)) + kthread_unpark(mp->m_zone_info->zi_gc_thread); +} + +void +xfs_zone_gc_stop( + struct xfs_mount *mp) +{ + if (xfs_has_zoned(mp)) + kthread_park(mp->m_zone_info->zi_gc_thread); +} + +int +xfs_zone_gc_mount( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_zone_gc_data *data; + struct xfs_open_zone *oz; + int error; + + /* + * If there are no free zones available for GC, pick the open zone with + * the least used space to GC into. This should only happen after an + * unclean shutdown near ENOSPC while GC was ongoing. + * + * We also need to do this for the first gc zone allocation if we + * unmounted while at the open limit. 
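[Editor's sketch] The GC worker above is parked rather than stopped when the file system goes quiescent, which is why xfs_zone_gc_start()/xfs_zone_gc_stop() only call kthread_unpark()/kthread_park(). A hypothetical call site, not from this patch, just to show the pairing:

static void example_quiesce(struct xfs_mount *mp)
{
        xfs_zone_gc_stop(mp);           /* parks the zone GC kthread */
        /* ... freeze or remount read-only ... */
}

static void example_reactivate(struct xfs_mount *mp)
{
        /* ... thaw or remount read-write ... */
        xfs_zone_gc_start(mp);          /* unparks the zone GC kthread */
}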
+ */ + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) || + zi->zi_nr_open_zones == mp->m_max_open_zones) + oz = xfs_zone_gc_steal_open(zi); + else + oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true); + if (!oz) { + xfs_warn(mp, "unable to allocate a zone for gc"); + error = -EIO; + goto out; + } + + trace_xfs_zone_gc_target_opened(oz->oz_rtg); + zi->zi_open_gc_zone = oz; + + data = xfs_zone_gc_data_alloc(mp); + if (!data) { + error = -ENOMEM; + goto out_put_gc_zone; + } + + mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data, + "xfs-zone-gc/%s", mp->m_super->s_id); + if (IS_ERR(mp->m_zone_info->zi_gc_thread)) { + xfs_warn(mp, "unable to create zone gc thread"); + error = PTR_ERR(mp->m_zone_info->zi_gc_thread); + goto out_free_gc_data; + } + + /* xfs_zone_gc_start will unpark for rw mounts */ + kthread_park(mp->m_zone_info->zi_gc_thread); + return 0; + +out_free_gc_data: + kfree(data); +out_put_gc_zone: + xfs_open_zone_put(zi->zi_open_gc_zone); +out: + return error; +} + +void +xfs_zone_gc_unmount( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + + kthread_stop(zi->zi_gc_thread); + if (zi->zi_open_gc_zone) + xfs_open_zone_put(zi->zi_open_gc_zone); +} diff --git a/fs/xfs/xfs_zone_info.c b/fs/xfs/xfs_zone_info.c new file mode 100644 index 000000000000..733bcc2f8645 --- /dev/null +++ b/fs/xfs/xfs_zone_info.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" + +static const char xfs_write_hint_shorthand[6][16] = { + "NOT_SET", "NONE", "SHORT", "MEDIUM", "LONG", "EXTREME"}; + +static inline const char * +xfs_write_hint_to_str( + uint8_t write_hint) +{ + if (write_hint > WRITE_LIFE_EXTREME) + return "UNKNOWN"; + return xfs_write_hint_shorthand[write_hint]; +} + +static void +xfs_show_open_zone( + struct seq_file *m, + struct xfs_open_zone *oz) +{ + seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s\n", + rtg_rgno(oz->oz_rtg), + oz->oz_write_pointer, oz->oz_written, + rtg_rmap(oz->oz_rtg)->i_used_blocks, + xfs_write_hint_to_str(oz->oz_write_hint)); +} + +static void +xfs_show_full_zone_used_distribution( + struct seq_file *m, + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + unsigned int reclaimable = 0, full, i; + + spin_lock(&zi->zi_used_buckets_lock); + for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) { + unsigned int entries = zi->zi_used_bucket_entries[i]; + + seq_printf(m, "\t %2u..%2u%%: %u\n", + i * (100 / XFS_ZONE_USED_BUCKETS), + (i + 1) * (100 / XFS_ZONE_USED_BUCKETS) - 1, + entries); + reclaimable += entries; + } + spin_unlock(&zi->zi_used_buckets_lock); + + full = mp->m_sb.sb_rgcount; + if (zi->zi_open_gc_zone) + full--; + full -= zi->zi_nr_open_zones; + full -= atomic_read(&zi->zi_nr_free_zones); + full -= reclaimable; + + seq_printf(m, "\t 100%%: %u\n", full); +} + +void +xfs_zoned_show_stats( + struct seq_file *m, + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_open_zone *oz; + + seq_puts(m, "\n"); + + seq_printf(m, "\tuser free RT blocks: %lld\n", + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS)); + seq_printf(m, "\treserved free RT blocks: %lld\n", + mp->m_free[XC_FREE_RTEXTENTS].res_avail); + seq_printf(m, 
"\tuser available RT blocks: %lld\n", + xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE)); + seq_printf(m, "\treserved available RT blocks: %lld\n", + mp->m_free[XC_FREE_RTAVAILABLE].res_avail); + seq_printf(m, "\tRT reservations required: %d\n", + !list_empty_careful(&zi->zi_reclaim_reservations)); + seq_printf(m, "\tRT GC required: %d\n", + xfs_zoned_need_gc(mp)); + + seq_printf(m, "\tfree zones: %d\n", atomic_read(&zi->zi_nr_free_zones)); + seq_puts(m, "\topen zones:\n"); + spin_lock(&zi->zi_open_zones_lock); + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) + xfs_show_open_zone(m, oz); + if (zi->zi_open_gc_zone) { + seq_puts(m, "\topen gc zone:\n"); + xfs_show_open_zone(m, zi->zi_open_gc_zone); + } + spin_unlock(&zi->zi_open_zones_lock); + seq_puts(m, "\tused blocks distribution (fully written zones):\n"); + xfs_show_full_zone_used_distribution(m, mp); +} diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h new file mode 100644 index 000000000000..ab696975a993 --- /dev/null +++ b/fs/xfs/xfs_zone_priv.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _XFS_ZONE_PRIV_H +#define _XFS_ZONE_PRIV_H + +struct xfs_open_zone { + /* + * Entry in the open zone list and refcount. Protected by + * zi_open_zones_lock in struct xfs_zone_info. + */ + struct list_head oz_entry; + atomic_t oz_ref; + + /* + * oz_write_pointer is the write pointer at which space is handed out + * for conventional zones, or simple the count of blocks handed out + * so far for sequential write required zones and is protected by + * oz_alloc_lock/ + */ + spinlock_t oz_alloc_lock; + xfs_rgblock_t oz_write_pointer; + + /* + * oz_written is the number of blocks for which we've received a + * write completion. oz_written must always be <= oz_write_pointer + * and is protected by the ILOCK of the rmap inode. + */ + xfs_rgblock_t oz_written; + + /* + * Write hint (data temperature) assigned to this zone, or + * WRITE_LIFE_NOT_SET if none was set. + */ + enum rw_hint oz_write_hint; + + /* + * Is this open zone used for garbage collection? There can only be a + * single open GC zone, which is pointed to by zi_open_gc_zone in + * struct xfs_zone_info. Constant over the life time of an open zone. + */ + bool oz_is_gc; + + /* + * Pointer to the RT groups structure for this open zone. Constant over + * the life time of an open zone. + */ + struct xfs_rtgroup *oz_rtg; +}; + +/* + * Number of bitmap buckets to track reclaimable zones. There are 10 buckets + * so that each 10% of the usable capacity get their own bucket and GC can + * only has to walk the bitmaps of the lesser used zones if there are any. + */ +#define XFS_ZONE_USED_BUCKETS 10u + +struct xfs_zone_info { + /* + * List of pending space reservations: + */ + spinlock_t zi_reservation_lock; + struct list_head zi_reclaim_reservations; + + /* + * List and number of open zones: + */ + spinlock_t zi_open_zones_lock; + struct list_head zi_open_zones; + unsigned int zi_nr_open_zones; + + /* + * Free zone search cursor and number of free zones: + */ + unsigned long zi_free_zone_cursor; + atomic_t zi_nr_free_zones; + + /* + * Wait queue to wait for free zones or open zone resources to become + * available: + */ + wait_queue_head_t zi_zone_wait; + + /* + * Pointer to the GC thread, and the current open zone used by GC + * (if any). + * + * zi_open_gc_zone is mostly private to the GC thread, but can be read + * for debugging from other threads, in which case zi_open_zones_lock + * must be taken to access it. 
+ */ + struct task_struct *zi_gc_thread; + struct xfs_open_zone *zi_open_gc_zone; + + /* + * List of zones that need a reset: + */ + spinlock_t zi_reset_list_lock; + struct xfs_group *zi_reset_list; + + /* + * A set of bitmaps to bucket-sort reclaimable zones by used blocks to help + * garbage collection to quickly find the best candidate for reclaim. + */ + spinlock_t zi_used_buckets_lock; + unsigned int zi_used_bucket_entries[XFS_ZONE_USED_BUCKETS]; + unsigned long *zi_used_bucket_bitmap[XFS_ZONE_USED_BUCKETS]; + +}; + +struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, + enum rw_hint write_hint, bool is_gc); + +int xfs_zone_gc_reset_sync(struct xfs_rtgroup *rtg); +bool xfs_zoned_need_gc(struct xfs_mount *mp); +int xfs_zone_gc_mount(struct xfs_mount *mp); +void xfs_zone_gc_unmount(struct xfs_mount *mp); + +void xfs_zoned_resv_wake_all(struct xfs_mount *mp); + +#endif /* _XFS_ZONE_PRIV_H */ diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c new file mode 100644 index 000000000000..4bf1b18aa7a7 --- /dev/null +++ b/fs/xfs/xfs_zone_space_resv.c @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_rtbitmap.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" +#include "xfs_zones.h" + +/* + * Note: the zoned allocator does not support a rtextsize > 1, so this code and + * the allocator itself uses file system blocks interchangeable with realtime + * extents without doing the otherwise required conversions. + */ + +/* + * Per-task space reservation. + * + * Tasks that need to wait for GC to free up space allocate one of these + * on-stack and adds it to the per-mount zi_reclaim_reservations lists. + * The GC thread will then wake the tasks in order when space becomes available. + */ +struct xfs_zone_reservation { + struct list_head entry; + struct task_struct *task; + xfs_filblks_t count_fsb; +}; + +/* + * Calculate the number of reserved blocks. + * + * XC_FREE_RTEXTENTS counts the user available capacity, to which the file + * system can be filled, while XC_FREE_RTAVAILABLE counts the blocks instantly + * available for writes without waiting for GC. + * + * For XC_FREE_RTAVAILABLE only the smaller reservation required for GC and + * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS + * is further restricted by at least one zone as well as the optional + * persistently reserved blocks. This allows the allocator to run more + * smoothly by not always triggering GC. 
+ */ +uint64_t +xfs_zoned_default_resblks( + struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + switch (ctr) { + case XC_FREE_RTEXTENTS: + return (uint64_t)XFS_RESERVED_ZONES * + mp->m_groups[XG_TYPE_RTG].blocks + + mp->m_sb.sb_rtreserved; + case XC_FREE_RTAVAILABLE: + return (uint64_t)XFS_GC_ZONES * + mp->m_groups[XG_TYPE_RTG].blocks; + default: + ASSERT(0); + return 0; + } +} + +void +xfs_zoned_resv_wake_all( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_zone_reservation *reservation; + + spin_lock(&zi->zi_reservation_lock); + list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) + wake_up_process(reservation->task); + spin_unlock(&zi->zi_reservation_lock); +} + +void +xfs_zoned_add_available( + struct xfs_mount *mp, + xfs_filblks_t count_fsb) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_zone_reservation *reservation; + + if (list_empty_careful(&zi->zi_reclaim_reservations)) { + xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb); + return; + } + + spin_lock(&zi->zi_reservation_lock); + xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb); + count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE); + list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) { + if (reservation->count_fsb > count_fsb) + break; + wake_up_process(reservation->task); + count_fsb -= reservation->count_fsb; + + } + spin_unlock(&zi->zi_reservation_lock); +} + +static int +xfs_zoned_space_wait_error( + struct xfs_mount *mp) +{ + if (xfs_is_shutdown(mp)) + return -EIO; + if (fatal_signal_pending(current)) + return -EINTR; + return 0; +} + +static int +xfs_zoned_reserve_available( + struct xfs_inode *ip, + xfs_filblks_t count_fsb, + unsigned int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_zone_reservation reservation = { + .task = current, + .count_fsb = count_fsb, + }; + int error; + + /* + * If there are no waiters, try to directly grab the available blocks + * from the percpu counter. + * + * If the caller wants to dip into the reserved pool also bypass the + * wait list. This relies on the fact that we have a very graciously + * sized reserved pool that always has enough space. If the reserved + * allocations fail we're in trouble. + */ + if (likely(list_empty_careful(&zi->zi_reclaim_reservations) || + (flags & XFS_ZR_RESERVED))) { + error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb, + flags & XFS_ZR_RESERVED); + if (error != -ENOSPC) + return error; + } + + if (flags & XFS_ZR_NOWAIT) + return -EAGAIN; + + spin_lock(&zi->zi_reservation_lock); + list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations); + while ((error = xfs_zoned_space_wait_error(mp)) == 0) { + set_current_state(TASK_KILLABLE); + + error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb, + flags & XFS_ZR_RESERVED); + if (error != -ENOSPC) + break; + + /* + * If there is no reclaimable group left and we aren't still + * processing a pending GC request give up as we're fully out + * of space. 
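[Editor's sketch] xfs_zoned_add_available() above wakes waiters strictly in queue order and stops at the first reservation that the available space cannot cover, even if a later, smaller one would fit. A standalone userspace model (not kernel code) of that walk:

#include <stdio.h>

int main(void)
{
        unsigned long long waiters[] = { 8, 16, 4 };    /* assumed count_fsb per waiter */
        unsigned long long available = 20;              /* assumed RTAVAILABLE blocks */
        int i;

        for (i = 0; i < 3; i++) {
                if (waiters[i] > available)
                        break;          /* head of the queue cannot be satisfied */
                printf("wake waiter %d (%llu blocks)\n", i, waiters[i]);
                available -= waiters[i];
        }
        return 0;       /* only waiter 0 is woken */
}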
+ */ + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) && + !xfs_is_zonegc_running(mp)) + break; + + spin_unlock(&zi->zi_reservation_lock); + schedule(); + spin_lock(&zi->zi_reservation_lock); + } + list_del(&reservation.entry); + spin_unlock(&zi->zi_reservation_lock); + + __set_current_state(TASK_RUNNING); + return error; +} + +/* + * Implement greedy space allocation for short writes by trying to grab all + * that is left after locking out other threads from trying to do the same. + * + * This isn't exactly optimal and can hopefully be replaced by a proper + * percpu_counter primitive one day. + */ +static int +xfs_zoned_reserve_extents_greedy( + struct xfs_inode *ip, + xfs_filblks_t *count_fsb, + unsigned int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_zone_info *zi = mp->m_zone_info; + s64 len = *count_fsb; + int error = -ENOSPC; + + spin_lock(&zi->zi_reservation_lock); + len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS)); + if (len > 0) { + *count_fsb = len; + error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb, + flags & XFS_ZR_RESERVED); + } + spin_unlock(&zi->zi_reservation_lock); + return error; +} + +int +xfs_zoned_space_reserve( + struct xfs_inode *ip, + xfs_filblks_t count_fsb, + unsigned int flags, + struct xfs_zone_alloc_ctx *ac) +{ + struct xfs_mount *mp = ip->i_mount; + int error; + + ASSERT(ac->reserved_blocks == 0); + ASSERT(ac->open_zone == NULL); + + error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb, + flags & XFS_ZR_RESERVED); + if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1) + error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags); + if (error) + return error; + + error = xfs_zoned_reserve_available(ip, count_fsb, flags); + if (error) { + xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb); + return error; + } + ac->reserved_blocks = count_fsb; + return 0; +} + +void +xfs_zoned_space_unreserve( + struct xfs_inode *ip, + struct xfs_zone_alloc_ctx *ac) +{ + if (ac->reserved_blocks > 0) { + struct xfs_mount *mp = ip->i_mount; + + xfs_zoned_add_available(mp, ac->reserved_blocks); + xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks); + } + if (ac->open_zone) + xfs_open_zone_put(ac->open_zone); +} diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 35166c92420c..42e2c0065bb3 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -299,7 +299,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) /* Serialize against truncates */ filemap_invalidate_lock_shared(inode->i_mapping); - ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); + ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops, NULL); filemap_invalidate_unlock_shared(inode->i_mapping); sb_end_pagefault(inode->i_sb); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 75bf54e76f3b..9cd93530013c 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -56,6 +56,10 @@ struct vm_fault; * * IOMAP_F_BOUNDARY indicates that I/O and I/O completions for this iomap must * never be merged with the mapping before it. + * + * IOMAP_F_ANON_WRITE indicates that (write) I/O does not have a target block + * assigned to it yet and the file system will do that in the bio submission + * handler, splitting the I/O as needed. 
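[Editor's sketch] To connect the IOMAP_F_ANON_WRITE comment above with the ->iomap_begin side: a purely illustrative mapping callback for some hypothetical zoned file system might look like the sketch below. Every field value here is a guess (in particular the use of IOMAP_NULL_ADDR for the not-yet-assigned address), not a statement of how XFS fills it in.

#include <linux/iomap.h>

static int example_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
                unsigned flags, struct iomap *iomap, struct iomap *srcmap)
{
        /* no disk address yet; the bio submission hook assigns one later */
        iomap->addr = IOMAP_NULL_ADDR;
        iomap->type = IOMAP_MAPPED;
        iomap->flags = IOMAP_F_ANON_WRITE;
        iomap->offset = pos;
        iomap->length = length;
        iomap->bdev = inode->i_sb->s_bdev;
        return 0;
}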
*/ #define IOMAP_F_NEW (1U << 0) #define IOMAP_F_DIRTY (1U << 1) @@ -68,6 +72,7 @@ struct vm_fault; #endif /* CONFIG_BUFFER_HEAD */ #define IOMAP_F_XATTR (1U << 5) #define IOMAP_F_BOUNDARY (1U << 6) +#define IOMAP_F_ANON_WRITE (1U << 7) /* * Flags set by the core iomap code during operations: @@ -111,6 +116,8 @@ struct iomap { static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos) { + if (iomap->flags & IOMAP_F_ANON_WRITE) + return U64_MAX; /* invalid */ return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT; } @@ -182,7 +189,9 @@ struct iomap_folio_ops { #else #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ -#define IOMAP_ATOMIC (1 << 9) +#define IOMAP_ATOMIC_HW (1 << 9) /* HW-based torn-write protection */ +#define IOMAP_DONTCACHE (1 << 10) +#define IOMAP_ATOMIC_SW (1 << 11)/* SW-based torn-write protection */ struct iomap_ops { /* @@ -211,8 +220,10 @@ struct iomap_ops { * calls to iomap_iter(). Treat as read-only in the body. * @len: The remaining length of the file segment we're operating on. * It is updated at the same time as @pos. - * @processed: The number of bytes processed by the body in the most recent - * iteration, or a negative errno. 0 causes the iteration to stop. + * @iter_start_pos: The original start pos for the current iomap. Used for + * incremental iter advance. + * @status: Status of the most recent iteration. Zero on success or a negative + * errno on error. * @flags: Zero or more of the iomap_begin flags above. * @iomap: Map describing the I/O iteration * @srcmap: Source map for COW operations @@ -221,7 +232,8 @@ struct iomap_iter { struct inode *inode; loff_t pos; u64 len; - s64 processed; + loff_t iter_start_pos; + int status; unsigned flags; struct iomap iomap; struct iomap srcmap; @@ -229,20 +241,46 @@ struct iomap_iter { }; int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); +int iomap_iter_advance(struct iomap_iter *iter, u64 *count); /** - * iomap_length - length of the current iomap iteration + * iomap_length_trim - trimmed length of the current iomap iteration * @iter: iteration structure + * @pos: File position to trim from. + * @len: Length of the mapping to trim to. * - * Returns the length that the operation applies to for the current iteration. + * Returns a trimmed length that the operation applies to for the current + * iteration. */ -static inline u64 iomap_length(const struct iomap_iter *iter) +static inline u64 iomap_length_trim(const struct iomap_iter *iter, loff_t pos, + u64 len) { u64 end = iter->iomap.offset + iter->iomap.length; if (iter->srcmap.type != IOMAP_HOLE) end = min(end, iter->srcmap.offset + iter->srcmap.length); - return min(iter->len, end - iter->pos); + return min(len, end - pos); +} + +/** + * iomap_length - length of the current iomap iteration + * @iter: iteration structure + * + * Returns the length that the operation applies to for the current iteration. 
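[Editor's sketch] Tying the iterator changes above together (the status field replacing processed, plus iomap_iter_advance()): a hypothetical operation, not from this patch, would now be driven roughly like this.

static int example_op_iter(struct iomap_iter *iter)
{
        u64 count = iomap_length(iter);

        /* ... operate on [iter->pos, iter->pos + count) ... */

        /* advance explicitly instead of returning a byte count */
        return iomap_iter_advance(iter, &count);
}

static int example_op(struct inode *inode, loff_t pos, u64 len,
                const struct iomap_ops *ops)
{
        struct iomap_iter iter = {
                .inode  = inode,
                .pos    = pos,
                .len    = len,
        };
        int ret;

        while ((ret = iomap_iter(&iter, ops)) > 0)
                iter.status = example_op_iter(&iter);
        return ret;
}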
+ */ +static inline u64 iomap_length(const struct iomap_iter *iter) +{ + return iomap_length_trim(iter, iter->pos, iter->len); +} + +/** + * iomap_iter_advance_full - advance by the full length of current map + */ +static inline int iomap_iter_advance_full(struct iomap_iter *iter) +{ + u64 length = iomap_length(iter); + + return iomap_iter_advance(iter, &length); } /** @@ -306,12 +344,11 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, - bool *did_zero, const struct iomap_ops *ops); + bool *did_zero, const struct iomap_ops *ops, void *private); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - const struct iomap_ops *ops); -vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, - const struct iomap_ops *ops); - + const struct iomap_ops *ops, void *private); +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, + void *private); typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length, struct iomap *iomap); void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, @@ -328,16 +365,42 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, const struct iomap_ops *ops); /* + * Flags for iomap_ioend->io_flags. + */ +/* shared COW extent */ +#define IOMAP_IOEND_SHARED (1U << 0) +/* unwritten extent */ +#define IOMAP_IOEND_UNWRITTEN (1U << 1) +/* don't merge into previous ioend */ +#define IOMAP_IOEND_BOUNDARY (1U << 2) +/* is direct I/O */ +#define IOMAP_IOEND_DIRECT (1U << 3) + +/* + * Flags that if set on either ioend prevent the merge of two ioends. + * (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way) + */ +#define IOMAP_IOEND_NOMERGE_FLAGS \ + (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT) + +/* * Structure for writeback I/O completions. + * + * File systems implementing ->submit_ioend (for buffered I/O) or ->submit_io + * for direct I/O) can split a bio generated by iomap. In that case the parent + * ioend it was split from is recorded in ioend->io_parent. */ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ - u16 io_type; - u16 io_flags; /* IOMAP_F_* */ + u16 io_flags; /* IOMAP_IOEND_* */ struct inode *io_inode; /* file being written to */ - size_t io_size; /* size of data within eof */ + size_t io_size; /* size of the extent */ + atomic_t io_remaining; /* completetion defer count */ + int io_error; /* stashed away status */ + struct iomap_ioend *io_parent; /* parent for completions */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ + void *io_private; /* file system private data */ struct bio io_bio; /* MUST BE LAST! */ }; @@ -362,12 +425,14 @@ struct iomap_writeback_ops { loff_t offset, unsigned len); /* - * Optional, allows the file systems to perform actions just before - * submitting the bio and/or override the bio end_io handler for complex - * operations like copy on write extent manipulation or unwritten extent - * conversions. + * Optional, allows the file systems to hook into bio submission, + * including overriding the bi_end_io handler. + * + * Returns 0 if the bio was successfully submitted, or a negative + * error code if status was non-zero or another error happened and + * the bio could not be submitted. 
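[Editor's sketch] A purely illustrative ->submit_ioend implementation matching the contract described above; the end_io handler and the direct call to iomap_finish_ioends() are simplifications (a real file system would typically defer completion to a workqueue), and wpc->ioend and iomap_ioend_from_bio() are assumed to be available to the callback.

static void example_end_io(struct bio *bio)
{
        struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);

        /* completed inline for brevity; real code usually defers this */
        iomap_finish_ioends(ioend, blk_status_to_errno(bio->bi_status));
}

static int example_submit_ioend(struct iomap_writepage_ctx *wpc, int status)
{
        struct iomap_ioend *ioend = wpc->ioend;

        if (status)
                return status;  /* writeback already failed, do not submit */

        ioend->io_bio.bi_end_io = example_end_io;
        submit_bio(&ioend->io_bio);
        return 0;
}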
*/ - int (*prepare_ioend)(struct iomap_ioend *ioend, int status); + int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status); /* * Optional, allows the file system to discard state on a page where @@ -383,6 +448,10 @@ struct iomap_writepage_ctx { u32 nr_folios; /* folios added to the ioend */ }; +struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio, + loff_t file_offset, u16 ioend_flags); +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, + unsigned int max_len, bool is_append); void iomap_finish_ioends(struct iomap_ioend *ioend, int error); void iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends); @@ -434,6 +503,11 @@ struct iomap_dio_ops { */ #define IOMAP_DIO_PARTIAL (1 << 2) +/* + * Use software-based torn-write protection. + */ +#define IOMAP_DIO_ATOMIC_SW (1 << 3) + ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, void *private, size_t done_before); @@ -454,4 +528,6 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, # define iomap_swapfile_activate(sis, swapfile, pagespan, ops) (-EIO) #endif /* CONFIG_SWAP */ +extern struct bio_set iomap_ioend_bioset; + #endif /* LINUX_IOMAP_H */ |
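[Editor's sketch] Finally, a hypothetical helper (not from this patch) showing one way the newly exported iomap_split_ioend() might be used to cap the size of each submitted ioend; it assumes the function returns NULL once nothing is left to split and an ERR_PTR() on failure, so check the implementation before relying on either.

static int example_submit_capped(struct iomap_ioend *ioend, unsigned int max_len)
{
        struct iomap_ioend *split;

        while ((split = iomap_split_ioend(ioend, max_len, false)) != NULL) {
                if (IS_ERR(split))
                        return PTR_ERR(split);  /* a real caller would unwind here */
                submit_bio(&split->io_bio);
        }
        submit_bio(&ioend->io_bio);
        return 0;
}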