diff options
-rw-r--r-- | Documentation/filesystems/iomap/design.rst | 9 | ||||
-rw-r--r-- | Documentation/filesystems/iomap/operations.rst | 13 | ||||
-rw-r--r-- | fs/dax.c | 111 | ||||
-rw-r--r-- | fs/gfs2/bmap.c | 3 | ||||
-rw-r--r-- | fs/iomap/Makefile | 1 | ||||
-rw-r--r-- | fs/iomap/buffered-io.c | 347 | ||||
-rw-r--r-- | fs/iomap/direct-io.c | 154 | ||||
-rw-r--r-- | fs/iomap/fiemap.c | 21 | ||||
-rw-r--r-- | fs/iomap/internal.h | 10 | ||||
-rw-r--r-- | fs/iomap/ioend.c | 216 | ||||
-rw-r--r-- | fs/iomap/iter.c | 97 | ||||
-rw-r--r-- | fs/iomap/seek.c | 16 | ||||
-rw-r--r-- | fs/iomap/swapfile.c | 7 | ||||
-rw-r--r-- | fs/iomap/trace.h | 8 | ||||
-rw-r--r-- | fs/xfs/xfs_aops.c | 25 | ||||
-rw-r--r-- | fs/xfs/xfs_file.c | 6 | ||||
-rw-r--r-- | fs/xfs/xfs_iomap.c | 4 | ||||
-rw-r--r-- | fs/zonefs/file.c | 2 | ||||
-rw-r--r-- | include/linux/iomap.h | 110 |
19 files changed, 716 insertions, 444 deletions
diff --git a/Documentation/filesystems/iomap/design.rst b/Documentation/filesystems/iomap/design.rst index b0d0188a095e..e29651a42eec 100644 --- a/Documentation/filesystems/iomap/design.rst +++ b/Documentation/filesystems/iomap/design.rst @@ -246,6 +246,10 @@ The fields are as follows: * **IOMAP_F_PRIVATE**: Starting with this value, the upper bits can be set by the filesystem for its own purposes. + * **IOMAP_F_ANON_WRITE**: Indicates that (write) I/O does not have a target + block assigned to it yet and the file system will do that in the bio + submission handler, splitting the I/O as needed. + These flags can be set by iomap itself during file operations. The filesystem should supply an ``->iomap_end`` function if it needs to observe these flags: @@ -352,6 +356,11 @@ operations: ``IOMAP_NOWAIT`` is often set on behalf of ``IOCB_NOWAIT`` or ``RWF_NOWAIT``. + * ``IOMAP_DONTCACHE`` is set when the caller wishes to perform a + buffered file I/O and would like the kernel to drop the pagecache + after the I/O completes, if it isn't already being used by another + thread. + If it is necessary to read existing file contents from a `different <https://lore.kernel.org/all/20191008071527.29304-9-hch@lst.de/>`_ device or address range on a device, the filesystem should return that diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst index 2c7f5df9d8b0..d1535109587a 100644 --- a/Documentation/filesystems/iomap/operations.rst +++ b/Documentation/filesystems/iomap/operations.rst @@ -131,6 +131,8 @@ These ``struct kiocb`` flags are significant for buffered I/O with iomap: * ``IOCB_NOWAIT``: Turns on ``IOMAP_NOWAIT``. + * ``IOCB_DONTCACHE``: Turns on ``IOMAP_DONTCACHE``. + Internal per-Folio State ------------------------ @@ -283,7 +285,7 @@ The ``ops`` structure must be specified and is as follows: struct iomap_writeback_ops { int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode, loff_t offset, unsigned len); - int (*prepare_ioend)(struct iomap_ioend *ioend, int status); + int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status); void (*discard_folio)(struct folio *folio, loff_t pos); }; @@ -306,13 +308,12 @@ The fields are as follows: purpose. This function must be supplied by the filesystem. - - ``prepare_ioend``: Enables filesystems to transform the writeback - ioend or perform any other preparatory work before the writeback I/O - is submitted. + - ``submit_ioend``: Allows the file systems to hook into writeback bio + submission. This might include pre-write space accounting updates, or installing a custom ``->bi_end_io`` function for internal purposes, such as deferring the ioend completion to a workqueue to run metadata update - transactions from process context. + transactions from process context before submitting the bio. This function is optional. - ``discard_folio``: iomap calls this function after ``->map_blocks`` @@ -341,7 +342,7 @@ This can happen in interrupt or process context, depending on the storage device. Filesystems that need to update internal bookkeeping (e.g. unwritten -extent conversions) should provide a ``->prepare_ioend`` function to +extent conversions) should provide a ``->submit_ioend`` function to set ``struct iomap_end::bio::bi_end_io`` to its own function. This function should call ``iomap_finish_ioends`` after finishing its own work (e.g. unwritten extent conversion). @@ -1258,7 +1258,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, } #endif /* CONFIG_FS_DAX_PMD */ -static s64 dax_unshare_iter(struct iomap_iter *iter) +static int dax_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); @@ -1266,11 +1266,11 @@ static s64 dax_unshare_iter(struct iomap_iter *iter) u64 copy_len = iomap_length(iter); u32 mod; int id = 0; - s64 ret = 0; + s64 ret; void *daddr = NULL, *saddr = NULL; if (!iomap_want_unshare_iter(iter)) - return iomap_length(iter); + return iomap_iter_advance_full(iter); /* * Extend the file range to be aligned to fsblock/pagesize, because @@ -1300,14 +1300,14 @@ static s64 dax_unshare_iter(struct iomap_iter *iter) if (ret < 0) goto out_unlock; - if (copy_mc_to_kernel(daddr, saddr, copy_len) == 0) - ret = iomap_length(iter); - else + if (copy_mc_to_kernel(daddr, saddr, copy_len) != 0) ret = -EIO; out_unlock: dax_read_unlock(id); - return dax_mem2blk_err(ret); + if (ret < 0) + return dax_mem2blk_err(ret); + return iomap_iter_advance_full(iter); } int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, @@ -1326,7 +1326,7 @@ int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, iter.len = min(len, size - pos); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = dax_unshare_iter(&iter); + iter.status = dax_unshare_iter(&iter); return ret; } EXPORT_SYMBOL_GPL(dax_file_unshare); @@ -1354,17 +1354,16 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) return ret; } -static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) +static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero) { const struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); - loff_t pos = iter->pos; u64 length = iomap_length(iter); - s64 written = 0; + int ret; /* already zeroed? we're done. */ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) - return length; + return iomap_iter_advance(iter, &length); /* * invalidate the pages whose sharing state is to be changed @@ -1372,33 +1371,35 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) */ if (iomap->flags & IOMAP_F_SHARED) invalidate_inode_pages2_range(iter->inode->i_mapping, - pos >> PAGE_SHIFT, - (pos + length - 1) >> PAGE_SHIFT); + iter->pos >> PAGE_SHIFT, + (iter->pos + length - 1) >> PAGE_SHIFT); do { + loff_t pos = iter->pos; unsigned offset = offset_in_page(pos); - unsigned size = min_t(u64, PAGE_SIZE - offset, length); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); - long rc; int id; + length = min_t(u64, PAGE_SIZE - offset, length); + id = dax_read_lock(); - if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE) - rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1); + if (IS_ALIGNED(pos, PAGE_SIZE) && length == PAGE_SIZE) + ret = dax_zero_page_range(iomap->dax_dev, pgoff, 1); else - rc = dax_memzero(iter, pos, size); + ret = dax_memzero(iter, pos, length); dax_read_unlock(id); - if (rc < 0) - return rc; - pos += size; - length -= size; - written += size; + if (ret < 0) + return ret; + + ret = iomap_iter_advance(iter, &length); + if (ret) + return ret; } while (length > 0); if (did_zero) *did_zero = true; - return written; + return ret; } int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, @@ -1413,7 +1414,7 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, int ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = dax_zero_iter(&iter, did_zero); + iter.status = dax_zero_iter(&iter, did_zero); return ret; } EXPORT_SYMBOL_GPL(dax_zero_range); @@ -1431,8 +1432,7 @@ int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, } EXPORT_SYMBOL_GPL(dax_truncate_page); -static loff_t dax_iomap_iter(const struct iomap_iter *iomi, - struct iov_iter *iter) +static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter) { const struct iomap *iomap = &iomi->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iomi); @@ -1451,8 +1451,10 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, if (pos >= end) return 0; - if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) - return iov_iter_zero(min(length, end - pos), iter); + if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) { + done = iov_iter_zero(min(length, end - pos), iter); + return iomap_iter_advance(iomi, &done); + } } /* @@ -1485,7 +1487,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, } id = dax_read_lock(); - while (pos < end) { + while ((pos = iomi->pos) < end) { unsigned offset = pos & (PAGE_SIZE - 1); const size_t size = ALIGN(length + offset, PAGE_SIZE); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); @@ -1535,18 +1537,16 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr, map_len, iter); - pos += xfer; - length -= xfer; - done += xfer; - - if (xfer == 0) + length = xfer; + ret = iomap_iter_advance(iomi, &length); + if (!ret && xfer == 0) ret = -EFAULT; if (xfer < map_len) break; } dax_read_unlock(id); - return done ? done : ret; + return ret; } /** @@ -1586,7 +1586,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, iomi.flags |= IOMAP_NOWAIT; while ((ret = iomap_iter(&iomi, ops)) > 0) - iomi.processed = dax_iomap_iter(&iomi, iter); + iomi.status = dax_iomap_iter(&iomi, iter); done = iomi.pos - iocb->ki_pos; iocb->ki_pos = iomi.pos; @@ -1757,7 +1757,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, while ((error = iomap_iter(&iter, ops)) > 0) { if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) { - iter.processed = -EIO; /* fs corruption? */ + iter.status = -EIO; /* fs corruption? */ continue; } @@ -1769,8 +1769,10 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, ret |= VM_FAULT_MAJOR; } - if (!(ret & VM_FAULT_ERROR)) - iter.processed = PAGE_SIZE; + if (!(ret & VM_FAULT_ERROR)) { + u64 length = PAGE_SIZE; + iter.status = iomap_iter_advance(&iter, &length); + } } if (iomap_errp) @@ -1883,8 +1885,10 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, continue; /* actually breaks out of the loop */ ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true); - if (ret != VM_FAULT_FALLBACK) - iter.processed = PMD_SIZE; + if (ret != VM_FAULT_FALLBACK) { + u64 length = PMD_SIZE; + iter.status = iomap_iter_advance(&iter, &length); + } } unlock_entry: @@ -1999,12 +2003,13 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order, } EXPORT_SYMBOL_GPL(dax_finish_sync_fault); -static loff_t dax_range_compare_iter(struct iomap_iter *it_src, +static int dax_range_compare_iter(struct iomap_iter *it_src, struct iomap_iter *it_dest, u64 len, bool *same) { const struct iomap *smap = &it_src->iomap; const struct iomap *dmap = &it_dest->iomap; loff_t pos1 = it_src->pos, pos2 = it_dest->pos; + u64 dest_len; void *saddr, *daddr; int id, ret; @@ -2012,7 +2017,7 @@ static loff_t dax_range_compare_iter(struct iomap_iter *it_src, if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) { *same = true; - return len; + goto advance; } if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) { @@ -2035,7 +2040,13 @@ static loff_t dax_range_compare_iter(struct iomap_iter *it_src, if (!*same) len = 0; dax_read_unlock(id); - return len; + +advance: + dest_len = len; + ret = iomap_iter_advance(it_src, &len); + if (!ret) + ret = iomap_iter_advance(it_dest, &dest_len); + return ret; out_unlock: dax_read_unlock(id); @@ -2058,15 +2069,15 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, .len = len, .flags = IOMAP_DAX, }; - int ret, compared = 0; + int ret, status; while ((ret = iomap_iter(&src_iter, ops)) > 0 && (ret = iomap_iter(&dst_iter, ops)) > 0) { - compared = dax_range_compare_iter(&src_iter, &dst_iter, + status = dax_range_compare_iter(&src_iter, &dst_iter, min(src_iter.len, dst_iter.len), same); - if (compared < 0) + if (status < 0) return ret; - src_iter.processed = dst_iter.processed = compared; + src_iter.status = dst_iter.status = status; } return ret; } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 1795c4e8dbf6..366516b98b3f 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1300,7 +1300,8 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from, unsigned int length) { BUG_ON(current->journal_info); - return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops); + return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops, + NULL); } #define GFS2_JTRUNC_REVOKES 8192 diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile index 381d76c5c232..69e8ebb41302 100644 --- a/fs/iomap/Makefile +++ b/fs/iomap/Makefile @@ -12,6 +12,7 @@ iomap-y += trace.o \ iter.o iomap-$(CONFIG_BLOCK) += buffered-io.o \ direct-io.o \ + ioend.o \ fiemap.o \ seek.o iomap-$(CONFIG_SWAP) += swapfile.o diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d303e6c8900c..d52cfdc299c4 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -12,17 +12,15 @@ #include <linux/buffer_head.h> #include <linux/dax.h> #include <linux/writeback.h> -#include <linux/list_sort.h> #include <linux/swap.h> #include <linux/bio.h> #include <linux/sched/signal.h> #include <linux/migrate.h> +#include "internal.h" #include "trace.h" #include "../internal.h" -#define IOEND_BATCH_SIZE 4096 - /* * Structure allocated for each folio to track per-block uptodate, dirty state * and I/O completions. @@ -40,8 +38,6 @@ struct iomap_folio_state { unsigned long state[]; }; -static struct bio_set iomap_ioend_bioset; - static inline bool ifs_is_fully_uptodate(struct folio *folio, struct iomap_folio_state *ifs) { @@ -366,15 +362,14 @@ static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter, pos >= i_size_read(iter->inode); } -static loff_t iomap_readpage_iter(const struct iomap_iter *iter, - struct iomap_readpage_ctx *ctx, loff_t offset) +static int iomap_readpage_iter(struct iomap_iter *iter, + struct iomap_readpage_ctx *ctx) { const struct iomap *iomap = &iter->iomap; - loff_t pos = iter->pos + offset; - loff_t length = iomap_length(iter) - offset; + loff_t pos = iter->pos; + loff_t length = iomap_length(iter); struct folio *folio = ctx->cur_folio; struct iomap_folio_state *ifs; - loff_t orig_pos = pos; size_t poff, plen; sector_t sector; @@ -438,25 +433,22 @@ done: * we can skip trailing ones as they will be handled in the next * iteration. */ - return pos - orig_pos + plen; + length = pos - iter->pos + plen; + return iomap_iter_advance(iter, &length); } -static loff_t iomap_read_folio_iter(const struct iomap_iter *iter, +static int iomap_read_folio_iter(struct iomap_iter *iter, struct iomap_readpage_ctx *ctx) { - struct folio *folio = ctx->cur_folio; - size_t offset = offset_in_folio(folio, iter->pos); - loff_t length = min_t(loff_t, folio_size(folio) - offset, - iomap_length(iter)); - loff_t done, ret; - - for (done = 0; done < length; done += ret) { - ret = iomap_readpage_iter(iter, ctx, done); - if (ret <= 0) + int ret; + + while (iomap_length(iter)) { + ret = iomap_readpage_iter(iter, ctx); + if (ret) return ret; } - return done; + return 0; } int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) @@ -474,7 +466,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) trace_iomap_readpage(iter.inode, 1); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_read_folio_iter(&iter, &ctx); + iter.status = iomap_read_folio_iter(&iter, &ctx); if (ctx.bio) { submit_bio(ctx.bio); @@ -493,15 +485,14 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) } EXPORT_SYMBOL_GPL(iomap_read_folio); -static loff_t iomap_readahead_iter(const struct iomap_iter *iter, +static int iomap_readahead_iter(struct iomap_iter *iter, struct iomap_readpage_ctx *ctx) { - loff_t length = iomap_length(iter); - loff_t done, ret; + int ret; - for (done = 0; done < length; done += ret) { + while (iomap_length(iter)) { if (ctx->cur_folio && - offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) { + offset_in_folio(ctx->cur_folio, iter->pos) == 0) { if (!ctx->cur_folio_in_bio) folio_unlock(ctx->cur_folio); ctx->cur_folio = NULL; @@ -510,12 +501,12 @@ static loff_t iomap_readahead_iter(const struct iomap_iter *iter, ctx->cur_folio = readahead_folio(ctx->rac); ctx->cur_folio_in_bio = false; } - ret = iomap_readpage_iter(iter, ctx, done); - if (ret <= 0) + ret = iomap_readpage_iter(iter, ctx); + if (ret) return ret; } - return done; + return 0; } /** @@ -547,7 +538,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) trace_iomap_readahead(rac->mapping->host, readahead_count(rac)); while (iomap_iter(&iter, ops) > 0) - iter.processed = iomap_readahead_iter(&iter, &ctx); + iter.status = iomap_readahead_iter(&iter, &ctx); if (ctx.bio) submit_bio(ctx.bio); @@ -603,6 +594,8 @@ struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len) if (iter->flags & IOMAP_NOWAIT) fgp |= FGP_NOWAIT; + if (iter->flags & IOMAP_DONTCACHE) + fgp |= FGP_DONTCACHE; fgp |= fgf_set_order(len); return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, @@ -907,12 +900,10 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, return __iomap_write_end(iter->inode, pos, len, copied, folio); } -static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) +static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) { - loff_t length = iomap_length(iter); - loff_t pos = iter->pos; ssize_t total_written = 0; - long status = 0; + int status = 0; struct address_space *mapping = iter->inode->i_mapping; size_t chunk = mapping_max_folio_size(mapping); unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0; @@ -923,7 +914,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) size_t offset; /* Offset into folio */ size_t bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ - size_t written; /* Bytes have been written */ + u64 written; /* Bytes have been written */ + loff_t pos = iter->pos; bytes = iov_iter_count(i); retry: @@ -934,8 +926,8 @@ retry: if (unlikely(status)) break; - if (bytes > length) - bytes = length; + if (bytes > iomap_length(iter)) + bytes = iomap_length(iter); /* * Bring in the user page that we'll copy from _first_. @@ -1006,17 +998,12 @@ retry: goto retry; } } else { - pos += written; total_written += written; - length -= written; + iomap_iter_advance(iter, &written); } - } while (iov_iter_count(i) && length); + } while (iov_iter_count(i) && iomap_length(iter)); - if (status == -EAGAIN) { - iov_iter_revert(i, total_written); - return -EAGAIN; - } - return total_written ? total_written : status; + return total_written ? 0 : status; } ssize_t @@ -1034,9 +1021,11 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, if (iocb->ki_flags & IOCB_NOWAIT) iter.flags |= IOMAP_NOWAIT; + if (iocb->ki_flags & IOCB_DONTCACHE) + iter.flags |= IOMAP_DONTCACHE; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_write_iter(&iter, i); + iter.status = iomap_write_iter(&iter, i); if (unlikely(iter.pos == iocb->ki_pos)) return ret; @@ -1270,23 +1259,22 @@ void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, } EXPORT_SYMBOL_GPL(iomap_write_delalloc_release); -static loff_t iomap_unshare_iter(struct iomap_iter *iter) +static int iomap_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; - loff_t pos = iter->pos; - loff_t length = iomap_length(iter); - loff_t written = 0; + u64 bytes = iomap_length(iter); + int status; if (!iomap_want_unshare_iter(iter)) - return length; + return iomap_iter_advance(iter, &bytes); do { struct folio *folio; - int status; size_t offset; - size_t bytes = min_t(u64, SIZE_MAX, length); + loff_t pos = iter->pos; bool ret; + bytes = min_t(u64, SIZE_MAX, bytes); status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) return status; @@ -1304,14 +1292,14 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) cond_resched(); - pos += bytes; - written += bytes; - length -= bytes; - balance_dirty_pages_ratelimited(iter->inode->i_mapping); - } while (length > 0); - return written; + status = iomap_iter_advance(iter, &bytes); + if (status) + break; + } while (bytes > 0); + + return status; } int @@ -1331,7 +1319,7 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, iter.len = min(len, size - pos); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_unshare_iter(&iter); + iter.status = iomap_unshare_iter(&iter); return ret; } EXPORT_SYMBOL_GPL(iomap_file_unshare); @@ -1350,19 +1338,18 @@ static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i) return filemap_write_and_wait_range(mapping, i->pos, end); } -static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) +static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) { - loff_t pos = iter->pos; - loff_t length = iomap_length(iter); - loff_t written = 0; + u64 bytes = iomap_length(iter); + int status; do { struct folio *folio; - int status; size_t offset; - size_t bytes = min_t(u64, SIZE_MAX, length); + loff_t pos = iter->pos; bool ret; + bytes = min_t(u64, SIZE_MAX, bytes); status = iomap_write_begin(iter, pos, bytes, &folio); if (status) return status; @@ -1383,25 +1370,26 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) if (WARN_ON_ONCE(!ret)) return -EIO; - pos += bytes; - length -= bytes; - written += bytes; - } while (length > 0); + status = iomap_iter_advance(iter, &bytes); + if (status) + break; + } while (bytes > 0); if (did_zero) *did_zero = true; - return written; + return status; } int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, - const struct iomap_ops *ops) + const struct iomap_ops *ops, void *private) { struct iomap_iter iter = { .inode = inode, .pos = pos, .len = len, .flags = IOMAP_ZERO, + .private = private, }; struct address_space *mapping = inode->i_mapping; unsigned int blocksize = i_blocksize(inode); @@ -1424,7 +1412,7 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) { iter.len = plen; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_zero_iter(&iter, did_zero); + iter.status = iomap_zero_iter(&iter, did_zero); iter.len = len - (iter.pos - pos); if (ret || !iter.len) @@ -1443,17 +1431,19 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) { - loff_t proc = iomap_length(&iter); + s64 status; if (range_dirty) { range_dirty = false; - proc = iomap_zero_iter_flush_and_stale(&iter); + status = iomap_zero_iter_flush_and_stale(&iter); + } else { + status = iomap_iter_advance_full(&iter); } - iter.processed = proc; + iter.status = status; continue; } - iter.processed = iomap_zero_iter(&iter, did_zero); + iter.status = iomap_zero_iter(&iter, did_zero); } return ret; } @@ -1461,7 +1451,7 @@ EXPORT_SYMBOL_GPL(iomap_zero_range); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - const struct iomap_ops *ops) + const struct iomap_ops *ops, void *private) { unsigned int blocksize = i_blocksize(inode); unsigned int off = pos & (blocksize - 1); @@ -1469,11 +1459,12 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, /* Block boundary? Nothing to do */ if (!off) return 0; - return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops); + return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops, + private); } EXPORT_SYMBOL_GPL(iomap_truncate_page); -static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, +static int iomap_folio_mkwrite_iter(struct iomap_iter *iter, struct folio *folio) { loff_t length = iomap_length(iter); @@ -1490,14 +1481,16 @@ static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, folio_mark_dirty(folio); } - return length; + return iomap_iter_advance(iter, &length); } -vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, + void *private) { struct iomap_iter iter = { .inode = file_inode(vmf->vma->vm_file), .flags = IOMAP_WRITE | IOMAP_FAULT, + .private = private, }; struct folio *folio = page_folio(vmf->page); ssize_t ret; @@ -1509,7 +1502,7 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) iter.pos = folio_pos(folio); iter.len = ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_folio_mkwrite_iter(&iter, folio); + iter.status = iomap_folio_mkwrite_iter(&iter, folio); if (ret < 0) goto out_unlock; @@ -1538,16 +1531,15 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, * state, release holds on bios, and finally free up memory. Do not use the * ioend after this. */ -static u32 -iomap_finish_ioend(struct iomap_ioend *ioend, int error) +u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend) { struct inode *inode = ioend->io_inode; struct bio *bio = &ioend->io_bio; struct folio_iter fi; u32 folio_count = 0; - if (error) { - mapping_set_error(inode->i_mapping, error); + if (ioend->io_error) { + mapping_set_error(inode->i_mapping, ioend->io_error); if (!bio_flagged(bio, BIO_QUIET)) { pr_err_ratelimited( "%s: writeback error on inode %lu, offset %lld, sector %llu", @@ -1566,116 +1558,16 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) return folio_count; } -/* - * Ioend completion routine for merged bios. This can only be called from task - * contexts as merged ioends can be of unbound length. Hence we have to break up - * the writeback completions into manageable chunks to avoid long scheduler - * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get - * good batch processing throughput without creating adverse scheduler latency - * conditions. - */ -void -iomap_finish_ioends(struct iomap_ioend *ioend, int error) -{ - struct list_head tmp; - u32 completions; - - might_sleep(); - - list_replace_init(&ioend->io_list, &tmp); - completions = iomap_finish_ioend(ioend, error); - - while (!list_empty(&tmp)) { - if (completions > IOEND_BATCH_SIZE * 8) { - cond_resched(); - completions = 0; - } - ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); - list_del_init(&ioend->io_list); - completions += iomap_finish_ioend(ioend, error); - } -} -EXPORT_SYMBOL_GPL(iomap_finish_ioends); - -/* - * We can merge two adjacent ioends if they have the same set of work to do. - */ -static bool -iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) -{ - if (ioend->io_bio.bi_status != next->io_bio.bi_status) - return false; - if (next->io_flags & IOMAP_F_BOUNDARY) - return false; - if ((ioend->io_flags & IOMAP_F_SHARED) ^ - (next->io_flags & IOMAP_F_SHARED)) - return false; - if ((ioend->io_type == IOMAP_UNWRITTEN) ^ - (next->io_type == IOMAP_UNWRITTEN)) - return false; - if (ioend->io_offset + ioend->io_size != next->io_offset) - return false; - /* - * Do not merge physically discontiguous ioends. The filesystem - * completion functions will have to iterate the physical - * discontiguities even if we merge the ioends at a logical level, so - * we don't gain anything by merging physical discontiguities here. - * - * We cannot use bio->bi_iter.bi_sector here as it is modified during - * submission so does not point to the start sector of the bio at - * completion. - */ - if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector) - return false; - return true; -} - -void -iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends) -{ - struct iomap_ioend *next; - - INIT_LIST_HEAD(&ioend->io_list); - - while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend, - io_list))) { - if (!iomap_ioend_can_merge(ioend, next)) - break; - list_move_tail(&next->io_list, &ioend->io_list); - ioend->io_size += next->io_size; - } -} -EXPORT_SYMBOL_GPL(iomap_ioend_try_merge); - -static int -iomap_ioend_compare(void *priv, const struct list_head *a, - const struct list_head *b) -{ - struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list); - struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list); - - if (ia->io_offset < ib->io_offset) - return -1; - if (ia->io_offset > ib->io_offset) - return 1; - return 0; -} - -void -iomap_sort_ioends(struct list_head *ioend_list) -{ - list_sort(NULL, ioend_list, iomap_ioend_compare); -} -EXPORT_SYMBOL_GPL(iomap_sort_ioends); - static void iomap_writepage_end_bio(struct bio *bio) { - iomap_finish_ioend(iomap_ioend_from_bio(bio), - blk_status_to_errno(bio->bi_status)); + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); + + ioend->io_error = blk_status_to_errno(bio->bi_status); + iomap_finish_ioend_buffered(ioend); } /* - * Submit the final bio for an ioend. + * Submit an ioend. * * If @error is non-zero, it means that we have a situation where some part of * the submission process has failed after we've marked pages for writeback. @@ -1694,14 +1586,18 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) * failure happened so that the file system end I/O handler gets called * to clean up. */ - if (wpc->ops->prepare_ioend) - error = wpc->ops->prepare_ioend(wpc->ioend, error); + if (wpc->ops->submit_ioend) { + error = wpc->ops->submit_ioend(wpc, error); + } else { + if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE)) + error = -EIO; + if (!error) + submit_bio(&wpc->ioend->io_bio); + } if (error) { wpc->ioend->io_bio.bi_status = errno_to_blk_status(error); bio_endio(&wpc->ioend->io_bio); - } else { - submit_bio(&wpc->ioend->io_bio); } wpc->ioend = NULL; @@ -1709,9 +1605,9 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) } static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, - struct writeback_control *wbc, struct inode *inode, loff_t pos) + struct writeback_control *wbc, struct inode *inode, loff_t pos, + u16 ioend_flags) { - struct iomap_ioend *ioend; struct bio *bio; bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS, @@ -1719,36 +1615,24 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, GFP_NOFS, &iomap_ioend_bioset); bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos); bio->bi_end_io = iomap_writepage_end_bio; - wbc_init_bio(wbc, bio); bio->bi_write_hint = inode->i_write_hint; - - ioend = iomap_ioend_from_bio(bio); - INIT_LIST_HEAD(&ioend->io_list); - ioend->io_type = wpc->iomap.type; - ioend->io_flags = wpc->iomap.flags; - if (pos > wpc->iomap.offset) - wpc->iomap.flags &= ~IOMAP_F_BOUNDARY; - ioend->io_inode = inode; - ioend->io_size = 0; - ioend->io_offset = pos; - ioend->io_sector = bio->bi_iter.bi_sector; - + wbc_init_bio(wbc, bio); wpc->nr_folios = 0; - return ioend; + return iomap_init_ioend(inode, bio, pos, ioend_flags); } -static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) +static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos, + u16 ioend_flags) { - if (wpc->iomap.offset == pos && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) + if (ioend_flags & IOMAP_IOEND_BOUNDARY) return false; - if ((wpc->iomap.flags & IOMAP_F_SHARED) != - (wpc->ioend->io_flags & IOMAP_F_SHARED)) - return false; - if (wpc->iomap.type != wpc->ioend->io_type) + if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) != + (wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) return false; if (pos != wpc->ioend->io_offset + wpc->ioend->io_size) return false; - if (iomap_sector(&wpc->iomap, pos) != + if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) && + iomap_sector(&wpc->iomap, pos) != bio_end_sector(&wpc->ioend->io_bio)) return false; /* @@ -1779,14 +1663,23 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, { struct iomap_folio_state *ifs = folio->private; size_t poff = offset_in_folio(folio, pos); + unsigned int ioend_flags = 0; int error; - if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) { + if (wpc->iomap.type == IOMAP_UNWRITTEN) + ioend_flags |= IOMAP_IOEND_UNWRITTEN; + if (wpc->iomap.flags & IOMAP_F_SHARED) + ioend_flags |= IOMAP_IOEND_SHARED; + if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) + ioend_flags |= IOMAP_IOEND_BOUNDARY; + + if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) { new_ioend: error = iomap_submit_ioend(wpc, 0); if (error) return error; - wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos); + wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos, + ioend_flags); } if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff)) @@ -2062,11 +1955,3 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, return iomap_submit_ioend(wpc, error); } EXPORT_SYMBOL_GPL(iomap_writepages); - -static int __init iomap_buffered_init(void) -{ - return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), - offsetof(struct iomap_ioend, io_bio), - BIOSET_NEED_BVECS); -} -fs_initcall(iomap_buffered_init); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b521eb15759e..e1e32e2bb0bf 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. - * Copyright (c) 2016-2021 Christoph Hellwig. + * Copyright (c) 2016-2025 Christoph Hellwig. */ #include <linux/module.h> #include <linux/compiler.h> @@ -12,6 +12,7 @@ #include <linux/backing-dev.h> #include <linux/uio.h> #include <linux/task_io_accounting_ops.h> +#include "internal.h" #include "trace.h" #include "../internal.h" @@ -20,6 +21,7 @@ * Private flags for iomap_dio, must not overlap with the public ones in * iomap.h: */ +#define IOMAP_DIO_NO_INVALIDATE (1U << 25) #define IOMAP_DIO_CALLER_COMP (1U << 26) #define IOMAP_DIO_INLINE_COMP (1U << 27) #define IOMAP_DIO_WRITE_THROUGH (1U << 28) @@ -81,10 +83,12 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter, WRITE_ONCE(iocb->private, bio); } - if (dio->dops && dio->dops->submit_io) + if (dio->dops && dio->dops->submit_io) { dio->dops->submit_io(iter, bio, pos); - else + } else { + WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_ANON_WRITE); submit_bio(bio); + } } ssize_t iomap_dio_complete(struct iomap_dio *dio) @@ -117,7 +121,8 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) * ->end_io() when necessary, otherwise a racing buffer read would cache * zeros from unwritten extents. */ - if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE)) + if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) && + !(dio->flags & IOMAP_DIO_NO_INVALIDATE)) kiocb_invalidate_post_direct_write(iocb, dio->size); inode_dio_end(file_inode(iocb->ki_filp)); @@ -163,43 +168,31 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret) cmpxchg(&dio->error, 0, ret); } -void iomap_dio_bio_end_io(struct bio *bio) +/* + * Called when dio->ref reaches zero from an I/O completion. + */ +static void iomap_dio_done(struct iomap_dio *dio) { - struct iomap_dio *dio = bio->bi_private; - bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); struct kiocb *iocb = dio->iocb; - if (bio->bi_status) - iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); - if (!atomic_dec_and_test(&dio->ref)) - goto release_bio; - - /* - * Synchronous dio, task itself will handle any completion work - * that needs after IO. All we need to do is wake the task. - */ if (dio->wait_for_completion) { + /* + * Synchronous I/O, task itself will handle any completion work + * that needs after IO. All we need to do is wake the task. + */ struct task_struct *waiter = dio->submit.waiter; WRITE_ONCE(dio->submit.waiter, NULL); blk_wake_io_task(waiter); - goto release_bio; - } - - /* - * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline - */ - if (dio->flags & IOMAP_DIO_INLINE_COMP) { + } else if (dio->flags & IOMAP_DIO_INLINE_COMP) { WRITE_ONCE(iocb->private, NULL); iomap_dio_complete_work(&dio->aio.work); - goto release_bio; - } - - /* - * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule - * our completion that way to avoid an async punt to a workqueue. - */ - if (dio->flags & IOMAP_DIO_CALLER_COMP) { + } else if (dio->flags & IOMAP_DIO_CALLER_COMP) { + /* + * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then + * schedule our completion that way to avoid an async punt to a + * workqueue. + */ /* only polled IO cares about private cleared */ iocb->private = dio; iocb->dio_complete = iomap_dio_deferred_complete; @@ -217,19 +210,31 @@ void iomap_dio_bio_end_io(struct bio *bio) * issuer. */ iocb->ki_complete(iocb, 0); - goto release_bio; + } else { + struct inode *inode = file_inode(iocb->ki_filp); + + /* + * Async DIO completion that requires filesystem level + * completion work gets punted to a work queue to complete as + * the operation may require more IO to be issued to finalise + * filesystem metadata changes or guarantee data integrity. + */ + INIT_WORK(&dio->aio.work, iomap_dio_complete_work); + queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); } +} + +void iomap_dio_bio_end_io(struct bio *bio) +{ + struct iomap_dio *dio = bio->bi_private; + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + + if (bio->bi_status) + iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); + + if (atomic_dec_and_test(&dio->ref)) + iomap_dio_done(dio); - /* - * Async DIO completion that requires filesystem level completion work - * gets punted to a work queue to complete as the operation may require - * more IO to be issued to finalise filesystem metadata changes or - * guarantee data integrity. - */ - INIT_WORK(&dio->aio.work, iomap_dio_complete_work); - queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq, - &dio->aio.work); -release_bio: if (should_dirty) { bio_check_pages_dirty(bio); } else { @@ -239,6 +244,47 @@ release_bio: } EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io); +u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend) +{ + struct iomap_dio *dio = ioend->io_bio.bi_private; + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + u32 vec_count = ioend->io_bio.bi_vcnt; + + if (ioend->io_error) + iomap_dio_set_error(dio, ioend->io_error); + + if (atomic_dec_and_test(&dio->ref)) { + /* + * Try to avoid another context switch for the completion given + * that we are already called from the ioend completion + * workqueue, but never invalidate pages from this thread to + * avoid deadlocks with buffered I/O completions. Tough luck if + * you hit the tiny race with someone dirtying the range now + * between this check and the actual completion. + */ + if (!dio->iocb->ki_filp->f_mapping->nrpages) { + dio->flags |= IOMAP_DIO_INLINE_COMP; + dio->flags |= IOMAP_DIO_NO_INVALIDATE; + } + dio->flags &= ~IOMAP_DIO_CALLER_COMP; + iomap_dio_done(dio); + } + + if (should_dirty) { + bio_check_pages_dirty(&ioend->io_bio); + } else { + bio_release_pages(&ioend->io_bio, false); + bio_put(&ioend->io_bio); + } + + /* + * Return the number of bvecs completed as even direct I/O completions + * do significant per-folio work and we'll still want to give up the + * CPU after a lot of completions. + */ + return vec_count; +} + static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, loff_t pos, unsigned len) { @@ -289,8 +335,7 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, return opflags; } -static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, - struct iomap_dio *dio) +static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) { const struct iomap *iomap = &iter->iomap; struct inode *inode = iter->inode; @@ -303,7 +348,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, bool need_zeroout = false; bool use_fua = false; int nr_pages, ret = 0; - size_t copied = 0; + u64 copied = 0; size_t orig_count; if (atomic && length != fs_block_size) @@ -467,30 +512,28 @@ out: /* Undo iter limitation to current extent */ iov_iter_reexpand(dio->submit.iter, orig_count - copied); if (copied) - return copied; + return iomap_iter_advance(iter, &copied); return ret; } -static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter, - struct iomap_dio *dio) +static int iomap_dio_hole_iter(struct iomap_iter *iter, struct iomap_dio *dio) { loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter); dio->size += length; if (!length) return -EFAULT; - return length; + return iomap_iter_advance(iter, &length); } -static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi, - struct iomap_dio *dio) +static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio) { const struct iomap *iomap = &iomi->iomap; struct iov_iter *iter = dio->submit.iter; void *inline_data = iomap_inline_data(iomap, iomi->pos); loff_t length = iomap_length(iomi); loff_t pos = iomi->pos; - size_t copied; + u64 copied; if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap))) return -EIO; @@ -512,11 +555,10 @@ static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi, dio->size += copied; if (!copied) return -EFAULT; - return copied; + return iomap_iter_advance(iomi, &copied); } -static loff_t iomap_dio_iter(const struct iomap_iter *iter, - struct iomap_dio *dio) +static int iomap_dio_iter(struct iomap_iter *iter, struct iomap_dio *dio) { switch (iter->iomap.type) { case IOMAP_HOLE: @@ -700,7 +742,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, blk_start_plug(&plug); while ((ret = iomap_iter(&iomi, ops)) > 0) { - iomi.processed = iomap_dio_iter(&iomi, dio); + iomi.status = iomap_dio_iter(&iomi, dio); /* * We can only poll for single bio I/Os. diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index 610ca6f1ec9b..80675c42e94e 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -39,24 +39,23 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, iomap->length, flags); } -static loff_t iomap_fiemap_iter(const struct iomap_iter *iter, +static int iomap_fiemap_iter(struct iomap_iter *iter, struct fiemap_extent_info *fi, struct iomap *prev) { int ret; if (iter->iomap.type == IOMAP_HOLE) - return iomap_length(iter); + goto advance; ret = iomap_to_fiemap(fi, prev, 0); *prev = iter->iomap; - switch (ret) { - case 0: /* success */ - return iomap_length(iter); - case 1: /* extent array full */ - return 0; - default: /* error */ + if (ret < 0) return ret; - } + if (ret == 1) /* extent array full */ + return 0; + +advance: + return iomap_iter_advance_full(iter); } int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, @@ -78,7 +77,7 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, return ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_fiemap_iter(&iter, fi, &prev); + iter.status = iomap_fiemap_iter(&iter, fi, &prev); if (prev.type != IOMAP_HOLE) { ret = iomap_to_fiemap(fi, &prev, FIEMAP_EXTENT_LAST); @@ -114,7 +113,7 @@ iomap_bmap(struct address_space *mapping, sector_t bno, while ((ret = iomap_iter(&iter, ops)) > 0) { if (iter.iomap.type == IOMAP_MAPPED) bno = iomap_sector(&iter.iomap, iter.pos) >> blkshift; - /* leave iter.processed unset to abort loop */ + /* leave iter.status unset to abort loop */ } if (ret) return 0; diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h new file mode 100644 index 000000000000..f6992a3bf66a --- /dev/null +++ b/fs/iomap/internal.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _IOMAP_INTERNAL_H +#define _IOMAP_INTERNAL_H 1 + +#define IOEND_BATCH_SIZE 4096 + +u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend); +u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend); + +#endif /* _IOMAP_INTERNAL_H */ diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c new file mode 100644 index 000000000000..18894ebba6db --- /dev/null +++ b/fs/iomap/ioend.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2024-2025 Christoph Hellwig. + */ +#include <linux/iomap.h> +#include <linux/list_sort.h> +#include "internal.h" + +struct bio_set iomap_ioend_bioset; +EXPORT_SYMBOL_GPL(iomap_ioend_bioset); + +struct iomap_ioend *iomap_init_ioend(struct inode *inode, + struct bio *bio, loff_t file_offset, u16 ioend_flags) +{ + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); + + atomic_set(&ioend->io_remaining, 1); + ioend->io_error = 0; + ioend->io_parent = NULL; + INIT_LIST_HEAD(&ioend->io_list); + ioend->io_flags = ioend_flags; + ioend->io_inode = inode; + ioend->io_offset = file_offset; + ioend->io_size = bio->bi_iter.bi_size; + ioend->io_sector = bio->bi_iter.bi_sector; + ioend->io_private = NULL; + return ioend; +} +EXPORT_SYMBOL_GPL(iomap_init_ioend); + +static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) +{ + if (ioend->io_parent) { + struct bio *bio = &ioend->io_bio; + + ioend = ioend->io_parent; + bio_put(bio); + } + + if (error) + cmpxchg(&ioend->io_error, 0, error); + + if (!atomic_dec_and_test(&ioend->io_remaining)) + return 0; + if (ioend->io_flags & IOMAP_IOEND_DIRECT) + return iomap_finish_ioend_direct(ioend); + return iomap_finish_ioend_buffered(ioend); +} + +/* + * Ioend completion routine for merged bios. This can only be called from task + * contexts as merged ioends can be of unbound length. Hence we have to break up + * the writeback completions into manageable chunks to avoid long scheduler + * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get + * good batch processing throughput without creating adverse scheduler latency + * conditions. + */ +void iomap_finish_ioends(struct iomap_ioend *ioend, int error) +{ + struct list_head tmp; + u32 completions; + + might_sleep(); + + list_replace_init(&ioend->io_list, &tmp); + completions = iomap_finish_ioend(ioend, error); + + while (!list_empty(&tmp)) { + if (completions > IOEND_BATCH_SIZE * 8) { + cond_resched(); + completions = 0; + } + ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); + list_del_init(&ioend->io_list); + completions += iomap_finish_ioend(ioend, error); + } +} +EXPORT_SYMBOL_GPL(iomap_finish_ioends); + +/* + * We can merge two adjacent ioends if they have the same set of work to do. + */ +static bool iomap_ioend_can_merge(struct iomap_ioend *ioend, + struct iomap_ioend *next) +{ + if (ioend->io_bio.bi_status != next->io_bio.bi_status) + return false; + if (next->io_flags & IOMAP_IOEND_BOUNDARY) + return false; + if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) != + (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) + return false; + if (ioend->io_offset + ioend->io_size != next->io_offset) + return false; + /* + * Do not merge physically discontiguous ioends. The filesystem + * completion functions will have to iterate the physical + * discontiguities even if we merge the ioends at a logical level, so + * we don't gain anything by merging physical discontiguities here. + * + * We cannot use bio->bi_iter.bi_sector here as it is modified during + * submission so does not point to the start sector of the bio at + * completion. + */ + if (ioend->io_sector + (ioend->io_size >> SECTOR_SHIFT) != + next->io_sector) + return false; + return true; +} + +void iomap_ioend_try_merge(struct iomap_ioend *ioend, + struct list_head *more_ioends) +{ + struct iomap_ioend *next; + + INIT_LIST_HEAD(&ioend->io_list); + + while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend, + io_list))) { + if (!iomap_ioend_can_merge(ioend, next)) + break; + list_move_tail(&next->io_list, &ioend->io_list); + ioend->io_size += next->io_size; + } +} +EXPORT_SYMBOL_GPL(iomap_ioend_try_merge); + +static int iomap_ioend_compare(void *priv, const struct list_head *a, + const struct list_head *b) +{ + struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list); + struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list); + + if (ia->io_offset < ib->io_offset) + return -1; + if (ia->io_offset > ib->io_offset) + return 1; + return 0; +} + +void iomap_sort_ioends(struct list_head *ioend_list) +{ + list_sort(NULL, ioend_list, iomap_ioend_compare); +} +EXPORT_SYMBOL_GPL(iomap_sort_ioends); + +/* + * Split up to the first @max_len bytes from @ioend if the ioend covers more + * than @max_len bytes. + * + * If @is_append is set, the split will be based on the hardware limits for + * REQ_OP_ZONE_APPEND commands and can be less than @max_len if the hardware + * limits don't allow the entire @max_len length. + * + * The bio embedded into @ioend must be a REQ_OP_WRITE because the block layer + * does not allow splitting REQ_OP_ZONE_APPEND bios. The file systems has to + * switch the operation after this call, but before submitting the bio. + */ +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, + unsigned int max_len, bool is_append) +{ + struct bio *bio = &ioend->io_bio; + struct iomap_ioend *split_ioend; + unsigned int nr_segs; + int sector_offset; + struct bio *split; + + if (is_append) { + struct queue_limits *lim = bdev_limits(bio->bi_bdev); + + max_len = min(max_len, + lim->max_zone_append_sectors << SECTOR_SHIFT); + + sector_offset = bio_split_rw_at(bio, lim, &nr_segs, max_len); + if (unlikely(sector_offset < 0)) + return ERR_PTR(sector_offset); + if (!sector_offset) + return NULL; + } else { + if (bio->bi_iter.bi_size <= max_len) + return NULL; + sector_offset = max_len >> SECTOR_SHIFT; + } + + /* ensure the split ioend is still block size aligned */ + sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT, + i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT; + + split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset); + if (IS_ERR(split)) + return ERR_CAST(split); + split->bi_private = bio->bi_private; + split->bi_end_io = bio->bi_end_io; + + split_ioend = iomap_init_ioend(ioend->io_inode, split, ioend->io_offset, + ioend->io_flags); + split_ioend->io_parent = ioend; + + atomic_inc(&ioend->io_remaining); + ioend->io_offset += split_ioend->io_size; + ioend->io_size -= split_ioend->io_size; + + split_ioend->io_sector = ioend->io_sector; + if (!is_append) + ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT); + return split_ioend; +} +EXPORT_SYMBOL_GPL(iomap_split_ioend); + +static int __init iomap_ioend_init(void) +{ + return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), + offsetof(struct iomap_ioend, io_bio), + BIOSET_NEED_BVECS); +} +fs_initcall(iomap_ioend_init); diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index 3790918646af..6ffc6a7b9ba5 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -7,40 +7,25 @@ #include <linux/iomap.h> #include "trace.h" -/* - * Advance to the next range we need to map. - * - * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully - * processed - it was aborted because the extent the iomap spanned may have been - * changed during the operation. In this case, the iteration behaviour is to - * remap the unprocessed range of the iter, and that means we may need to remap - * even when we've made no progress (i.e. iter->processed = 0). Hence the - * "finished iterating" case needs to distinguish between - * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we - * need to remap the entire remaining range. - */ -static inline int iomap_iter_advance(struct iomap_iter *iter) +static inline void iomap_iter_reset_iomap(struct iomap_iter *iter) { - bool stale = iter->iomap.flags & IOMAP_F_STALE; - int ret = 1; - - /* handle the previous iteration (if any) */ - if (iter->iomap.length) { - if (iter->processed < 0) - return iter->processed; - if (WARN_ON_ONCE(iter->processed > iomap_length(iter))) - return -EIO; - iter->pos += iter->processed; - iter->len -= iter->processed; - if (!iter->len || (!iter->processed && !stale)) - ret = 0; - } - - /* clear the per iteration state */ - iter->processed = 0; + iter->status = 0; memset(&iter->iomap, 0, sizeof(iter->iomap)); memset(&iter->srcmap, 0, sizeof(iter->srcmap)); - return ret; +} + +/* + * Advance the current iterator position and output the length remaining for the + * current mapping. + */ +int iomap_iter_advance(struct iomap_iter *iter, u64 *count) +{ + if (WARN_ON_ONCE(*count > iomap_length(iter))) + return -EIO; + iter->pos += *count; + iter->len -= *count; + *count = iomap_length(iter); + return 0; } static inline void iomap_iter_done(struct iomap_iter *iter) @@ -50,6 +35,8 @@ static inline void iomap_iter_done(struct iomap_iter *iter) WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos); WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE); + iter->iter_start_pos = iter->pos; + trace_iomap_iter_dstmap(iter->inode, &iter->iomap); if (iter->srcmap.type != IOMAP_HOLE) trace_iomap_iter_srcmap(iter->inode, &iter->srcmap); @@ -67,26 +54,58 @@ static inline void iomap_iter_done(struct iomap_iter *iter) * function must be called in a loop that continues as long it returns a * positive value. If 0 or a negative value is returned, the caller must not * return to the loop body. Within a loop body, there are two ways to break out - * of the loop body: leave @iter.processed unchanged, or set it to a negative + * of the loop body: leave @iter.status unchanged, or set it to a negative * errno. */ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) { + bool stale = iter->iomap.flags & IOMAP_F_STALE; + ssize_t advanced; + u64 olen; int ret; - if (iter->iomap.length && ops->iomap_end) { - ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter), - iter->processed > 0 ? iter->processed : 0, - iter->flags, &iter->iomap); - if (ret < 0 && !iter->processed) + trace_iomap_iter(iter, ops, _RET_IP_); + + if (!iter->iomap.length) + goto begin; + + /* + * Calculate how far the iter was advanced and the original length bytes + * for ->iomap_end(). + */ + advanced = iter->pos - iter->iter_start_pos; + olen = iter->len + advanced; + + if (ops->iomap_end) { + ret = ops->iomap_end(iter->inode, iter->iter_start_pos, + iomap_length_trim(iter, iter->iter_start_pos, + olen), + advanced, iter->flags, &iter->iomap); + if (ret < 0 && !advanced) return ret; } - trace_iomap_iter(iter, ops, _RET_IP_); - ret = iomap_iter_advance(iter); + /* detect old return semantics where this would advance */ + if (WARN_ON_ONCE(iter->status > 0)) + iter->status = -EIO; + + /* + * Use iter->len to determine whether to continue onto the next mapping. + * Explicitly terminate on error status or if the current iter has not + * advanced at all (i.e. no work was done for some reason) unless the + * mapping has been marked stale and needs to be reprocessed. + */ + if (iter->status < 0) + ret = iter->status; + else if (iter->len == 0 || (!advanced && !stale)) + ret = 0; + else + ret = 1; + iomap_iter_reset_iomap(iter); if (ret <= 0) return ret; +begin: ret = ops->iomap_begin(iter->inode, iter->pos, iter->len, iter->flags, &iter->iomap, &iter->srcmap); if (ret < 0) diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c index a845c012b50c..04d7919636c1 100644 --- a/fs/iomap/seek.c +++ b/fs/iomap/seek.c @@ -10,7 +10,7 @@ #include <linux/pagemap.h> #include <linux/pagevec.h> -static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter, +static int iomap_seek_hole_iter(struct iomap_iter *iter, loff_t *hole_pos) { loff_t length = iomap_length(iter); @@ -20,13 +20,13 @@ static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter, *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, iter->pos, iter->pos + length, SEEK_HOLE); if (*hole_pos == iter->pos + length) - return length; + return iomap_iter_advance(iter, &length); return 0; case IOMAP_HOLE: *hole_pos = iter->pos; return 0; default: - return length; + return iomap_iter_advance(iter, &length); } } @@ -47,7 +47,7 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops) iter.len = size - pos; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_seek_hole_iter(&iter, &pos); + iter.status = iomap_seek_hole_iter(&iter, &pos); if (ret < 0) return ret; if (iter.len) /* found hole before EOF */ @@ -56,19 +56,19 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops) } EXPORT_SYMBOL_GPL(iomap_seek_hole); -static loff_t iomap_seek_data_iter(const struct iomap_iter *iter, +static int iomap_seek_data_iter(struct iomap_iter *iter, loff_t *hole_pos) { loff_t length = iomap_length(iter); switch (iter->iomap.type) { case IOMAP_HOLE: - return length; + return iomap_iter_advance(iter, &length); case IOMAP_UNWRITTEN: *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, iter->pos, iter->pos + length, SEEK_DATA); if (*hole_pos < 0) - return length; + return iomap_iter_advance(iter, &length); return 0; default: *hole_pos = iter->pos; @@ -93,7 +93,7 @@ iomap_seek_data(struct inode *inode, loff_t pos, const struct iomap_ops *ops) iter.len = size - pos; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_seek_data_iter(&iter, &pos); + iter.status = iomap_seek_data_iter(&iter, &pos); if (ret < 0) return ret; if (iter.len) /* found data before EOF */ diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index b90d0eda9e51..c1a762c10ce4 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -94,7 +94,7 @@ static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str) * swap only cares about contiguous page-aligned physical extents and makes no * distinction between written and unwritten extents. */ -static loff_t iomap_swapfile_iter(const struct iomap_iter *iter, +static int iomap_swapfile_iter(struct iomap_iter *iter, struct iomap *iomap, struct iomap_swapfile_info *isi) { switch (iomap->type) { @@ -132,7 +132,8 @@ static loff_t iomap_swapfile_iter(const struct iomap_iter *iter, return error; memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); } - return iomap_length(iter); + + return iomap_iter_advance_full(iter); } /* @@ -166,7 +167,7 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, return ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_swapfile_iter(&iter, &iter.iomap, &isi); + iter.status = iomap_swapfile_iter(&iter, &iter.iomap, &isi); if (ret < 0) return ret; diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 4118a42cdab0..9eab2c8ac3c5 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -207,7 +207,7 @@ TRACE_EVENT(iomap_iter, __field(u64, ino) __field(loff_t, pos) __field(u64, length) - __field(s64, processed) + __field(int, status) __field(unsigned int, flags) __field(const void *, ops) __field(unsigned long, caller) @@ -217,17 +217,17 @@ TRACE_EVENT(iomap_iter, __entry->ino = iter->inode->i_ino; __entry->pos = iter->pos; __entry->length = iomap_length(iter); - __entry->processed = iter->processed; + __entry->status = iter->status; __entry->flags = iter->flags; __entry->ops = ops; __entry->caller = caller; ), - TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx processed %lld flags %s (0x%x) ops %ps caller %pS", + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx status %d flags %s (0x%x) ops %ps caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pos, __entry->length, - __entry->processed, + __entry->status, __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS), __entry->flags, __entry->ops, diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 6d9965b546cb..5077d52a775d 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -115,7 +115,7 @@ xfs_end_ioend( */ error = blk_status_to_errno(ioend->io_bio.bi_status); if (unlikely(error)) { - if (ioend->io_flags & IOMAP_F_SHARED) { + if (ioend->io_flags & IOMAP_IOEND_SHARED) { xfs_reflink_cancel_cow_range(ip, offset, size, true); xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset, offset + size); @@ -126,9 +126,9 @@ xfs_end_ioend( /* * Success: commit the COW or unwritten blocks if needed. */ - if (ioend->io_flags & IOMAP_F_SHARED) + if (ioend->io_flags & IOMAP_IOEND_SHARED) error = xfs_reflink_end_cow(ip, offset, size); - else if (ioend->io_type == IOMAP_UNWRITTEN) + else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN) error = xfs_iomap_write_unwritten(ip, offset, size, false); if (!error && xfs_ioend_is_append(ioend)) @@ -396,10 +396,11 @@ allocate_blocks: } static int -xfs_prepare_ioend( - struct iomap_ioend *ioend, +xfs_submit_ioend( + struct iomap_writepage_ctx *wpc, int status) { + struct iomap_ioend *ioend = wpc->ioend; unsigned int nofs_flag; /* @@ -410,7 +411,7 @@ xfs_prepare_ioend( nofs_flag = memalloc_nofs_save(); /* Convert CoW extents to regular */ - if (!status && (ioend->io_flags & IOMAP_F_SHARED)) { + if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) { status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), ioend->io_offset, ioend->io_size); } @@ -418,10 +419,14 @@ xfs_prepare_ioend( memalloc_nofs_restore(nofs_flag); /* send ioends that might require a transaction to the completion wq */ - if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN || - (ioend->io_flags & IOMAP_F_SHARED)) + if (xfs_ioend_is_append(ioend) || + (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))) ioend->io_bio.bi_end_io = xfs_end_bio; - return status; + + if (status) + return status; + submit_bio(&ioend->io_bio); + return 0; } /* @@ -463,7 +468,7 @@ xfs_discard_folio( static const struct iomap_writeback_ops xfs_writeback_ops = { .map_blocks = xfs_map_blocks, - .prepare_ioend = xfs_prepare_ioend, + .submit_ioend = xfs_submit_ioend, .discard_folio = xfs_discard_folio, }; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f7a7d89c345e..a81c3e943f20 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1511,7 +1511,8 @@ xfs_write_fault( if (IS_DAX(inode)) ret = xfs_dax_fault_locked(vmf, order, true); else - ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops); + ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops, + NULL); xfs_iunlock(ip, lock_mode); sb_end_pagefault(inode->i_sb); @@ -1626,7 +1627,8 @@ const struct file_operations xfs_file_operations = { .fadvise = xfs_file_fadvise, .remap_file_range = xfs_file_remap_range, .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | - FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE, + FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE | + FOP_DONTCACHE, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index d61460309a78..46acf727cbe7 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1495,7 +1495,7 @@ xfs_zero_range( return dax_zero_range(inode, pos, len, did_zero, &xfs_dax_write_iomap_ops); return iomap_zero_range(inode, pos, len, did_zero, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, NULL); } int @@ -1510,5 +1510,5 @@ xfs_truncate_page( return dax_truncate_page(inode, pos, did_zero, &xfs_dax_write_iomap_ops); return iomap_truncate_page(inode, pos, did_zero, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, NULL); } diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 35166c92420c..42e2c0065bb3 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -299,7 +299,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) /* Serialize against truncates */ filemap_invalidate_lock_shared(inode->i_mapping); - ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); + ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops, NULL); filemap_invalidate_unlock_shared(inode->i_mapping); sb_end_pagefault(inode->i_sb); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 75bf54e76f3b..ea29388b2fba 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -56,6 +56,10 @@ struct vm_fault; * * IOMAP_F_BOUNDARY indicates that I/O and I/O completions for this iomap must * never be merged with the mapping before it. + * + * IOMAP_F_ANON_WRITE indicates that (write) I/O does not have a target block + * assigned to it yet and the file system will do that in the bio submission + * handler, splitting the I/O as needed. */ #define IOMAP_F_NEW (1U << 0) #define IOMAP_F_DIRTY (1U << 1) @@ -68,6 +72,7 @@ struct vm_fault; #endif /* CONFIG_BUFFER_HEAD */ #define IOMAP_F_XATTR (1U << 5) #define IOMAP_F_BOUNDARY (1U << 6) +#define IOMAP_F_ANON_WRITE (1U << 7) /* * Flags set by the core iomap code during operations: @@ -111,6 +116,8 @@ struct iomap { static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos) { + if (iomap->flags & IOMAP_F_ANON_WRITE) + return U64_MAX; /* invalid */ return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT; } @@ -183,6 +190,7 @@ struct iomap_folio_ops { #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ #define IOMAP_ATOMIC (1 << 9) +#define IOMAP_DONTCACHE (1 << 10) struct iomap_ops { /* @@ -211,8 +219,10 @@ struct iomap_ops { * calls to iomap_iter(). Treat as read-only in the body. * @len: The remaining length of the file segment we're operating on. * It is updated at the same time as @pos. - * @processed: The number of bytes processed by the body in the most recent - * iteration, or a negative errno. 0 causes the iteration to stop. + * @iter_start_pos: The original start pos for the current iomap. Used for + * incremental iter advance. + * @status: Status of the most recent iteration. Zero on success or a negative + * errno on error. * @flags: Zero or more of the iomap_begin flags above. * @iomap: Map describing the I/O iteration * @srcmap: Source map for COW operations @@ -221,7 +231,8 @@ struct iomap_iter { struct inode *inode; loff_t pos; u64 len; - s64 processed; + loff_t iter_start_pos; + int status; unsigned flags; struct iomap iomap; struct iomap srcmap; @@ -229,20 +240,46 @@ struct iomap_iter { }; int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); +int iomap_iter_advance(struct iomap_iter *iter, u64 *count); /** - * iomap_length - length of the current iomap iteration + * iomap_length_trim - trimmed length of the current iomap iteration * @iter: iteration structure + * @pos: File position to trim from. + * @len: Length of the mapping to trim to. * - * Returns the length that the operation applies to for the current iteration. + * Returns a trimmed length that the operation applies to for the current + * iteration. */ -static inline u64 iomap_length(const struct iomap_iter *iter) +static inline u64 iomap_length_trim(const struct iomap_iter *iter, loff_t pos, + u64 len) { u64 end = iter->iomap.offset + iter->iomap.length; if (iter->srcmap.type != IOMAP_HOLE) end = min(end, iter->srcmap.offset + iter->srcmap.length); - return min(iter->len, end - iter->pos); + return min(len, end - pos); +} + +/** + * iomap_length - length of the current iomap iteration + * @iter: iteration structure + * + * Returns the length that the operation applies to for the current iteration. + */ +static inline u64 iomap_length(const struct iomap_iter *iter) +{ + return iomap_length_trim(iter, iter->pos, iter->len); +} + +/** + * iomap_iter_advance_full - advance by the full length of current map + */ +static inline int iomap_iter_advance_full(struct iomap_iter *iter) +{ + u64 length = iomap_length(iter); + + return iomap_iter_advance(iter, &length); } /** @@ -306,12 +343,11 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, - bool *did_zero, const struct iomap_ops *ops); + bool *did_zero, const struct iomap_ops *ops, void *private); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - const struct iomap_ops *ops); -vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, - const struct iomap_ops *ops); - + const struct iomap_ops *ops, void *private); +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, + void *private); typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length, struct iomap *iomap); void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, @@ -328,16 +364,42 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, const struct iomap_ops *ops); /* + * Flags for iomap_ioend->io_flags. + */ +/* shared COW extent */ +#define IOMAP_IOEND_SHARED (1U << 0) +/* unwritten extent */ +#define IOMAP_IOEND_UNWRITTEN (1U << 1) +/* don't merge into previous ioend */ +#define IOMAP_IOEND_BOUNDARY (1U << 2) +/* is direct I/O */ +#define IOMAP_IOEND_DIRECT (1U << 3) + +/* + * Flags that if set on either ioend prevent the merge of two ioends. + * (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way) + */ +#define IOMAP_IOEND_NOMERGE_FLAGS \ + (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT) + +/* * Structure for writeback I/O completions. + * + * File systems implementing ->submit_ioend (for buffered I/O) or ->submit_io + * for direct I/O) can split a bio generated by iomap. In that case the parent + * ioend it was split from is recorded in ioend->io_parent. */ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ - u16 io_type; - u16 io_flags; /* IOMAP_F_* */ + u16 io_flags; /* IOMAP_IOEND_* */ struct inode *io_inode; /* file being written to */ - size_t io_size; /* size of data within eof */ + size_t io_size; /* size of the extent */ + atomic_t io_remaining; /* completetion defer count */ + int io_error; /* stashed away status */ + struct iomap_ioend *io_parent; /* parent for completions */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ + void *io_private; /* file system private data */ struct bio io_bio; /* MUST BE LAST! */ }; @@ -362,12 +424,14 @@ struct iomap_writeback_ops { loff_t offset, unsigned len); /* - * Optional, allows the file systems to perform actions just before - * submitting the bio and/or override the bio end_io handler for complex - * operations like copy on write extent manipulation or unwritten extent - * conversions. + * Optional, allows the file systems to hook into bio submission, + * including overriding the bi_end_io handler. + * + * Returns 0 if the bio was successfully submitted, or a negative + * error code if status was non-zero or another error happened and + * the bio could not be submitted. */ - int (*prepare_ioend)(struct iomap_ioend *ioend, int status); + int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status); /* * Optional, allows the file system to discard state on a page where @@ -383,6 +447,10 @@ struct iomap_writepage_ctx { u32 nr_folios; /* folios added to the ioend */ }; +struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio, + loff_t file_offset, u16 ioend_flags); +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, + unsigned int max_len, bool is_append); void iomap_finish_ioends(struct iomap_ioend *ioend, int error); void iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends); @@ -454,4 +522,6 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, # define iomap_swapfile_activate(sis, swapfile, pagespan, ops) (-EIO) #endif /* CONFIG_SWAP */ +extern struct bio_set iomap_ioend_bioset; + #endif /* LINUX_IOMAP_H */ |