summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/iomap/design.rst9
-rw-r--r--Documentation/filesystems/iomap/operations.rst13
-rw-r--r--fs/dax.c111
-rw-r--r--fs/gfs2/bmap.c3
-rw-r--r--fs/iomap/Makefile1
-rw-r--r--fs/iomap/buffered-io.c347
-rw-r--r--fs/iomap/direct-io.c154
-rw-r--r--fs/iomap/fiemap.c21
-rw-r--r--fs/iomap/internal.h10
-rw-r--r--fs/iomap/ioend.c216
-rw-r--r--fs/iomap/iter.c97
-rw-r--r--fs/iomap/seek.c16
-rw-r--r--fs/iomap/swapfile.c7
-rw-r--r--fs/iomap/trace.h8
-rw-r--r--fs/xfs/xfs_aops.c25
-rw-r--r--fs/xfs/xfs_file.c6
-rw-r--r--fs/xfs/xfs_iomap.c4
-rw-r--r--fs/zonefs/file.c2
-rw-r--r--include/linux/iomap.h110
19 files changed, 716 insertions, 444 deletions
diff --git a/Documentation/filesystems/iomap/design.rst b/Documentation/filesystems/iomap/design.rst
index b0d0188a095e..e29651a42eec 100644
--- a/Documentation/filesystems/iomap/design.rst
+++ b/Documentation/filesystems/iomap/design.rst
@@ -246,6 +246,10 @@ The fields are as follows:
* **IOMAP_F_PRIVATE**: Starting with this value, the upper bits can
be set by the filesystem for its own purposes.
+ * **IOMAP_F_ANON_WRITE**: Indicates that (write) I/O does not have a target
+ block assigned to it yet and the file system will do that in the bio
+ submission handler, splitting the I/O as needed.
+
These flags can be set by iomap itself during file operations.
The filesystem should supply an ``->iomap_end`` function if it needs
to observe these flags:
@@ -352,6 +356,11 @@ operations:
``IOMAP_NOWAIT`` is often set on behalf of ``IOCB_NOWAIT`` or
``RWF_NOWAIT``.
+ * ``IOMAP_DONTCACHE`` is set when the caller wishes to perform a
+ buffered file I/O and would like the kernel to drop the pagecache
+ after the I/O completes, if it isn't already being used by another
+ thread.
+
If it is necessary to read existing file contents from a `different
<https://lore.kernel.org/all/20191008071527.29304-9-hch@lst.de/>`_
device or address range on a device, the filesystem should return that
diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
index 2c7f5df9d8b0..d1535109587a 100644
--- a/Documentation/filesystems/iomap/operations.rst
+++ b/Documentation/filesystems/iomap/operations.rst
@@ -131,6 +131,8 @@ These ``struct kiocb`` flags are significant for buffered I/O with iomap:
* ``IOCB_NOWAIT``: Turns on ``IOMAP_NOWAIT``.
+ * ``IOCB_DONTCACHE``: Turns on ``IOMAP_DONTCACHE``.
+
Internal per-Folio State
------------------------
@@ -283,7 +285,7 @@ The ``ops`` structure must be specified and is as follows:
struct iomap_writeback_ops {
int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
loff_t offset, unsigned len);
- int (*prepare_ioend)(struct iomap_ioend *ioend, int status);
+ int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
void (*discard_folio)(struct folio *folio, loff_t pos);
};
@@ -306,13 +308,12 @@ The fields are as follows:
purpose.
This function must be supplied by the filesystem.
- - ``prepare_ioend``: Enables filesystems to transform the writeback
- ioend or perform any other preparatory work before the writeback I/O
- is submitted.
+ - ``submit_ioend``: Allows the file systems to hook into writeback bio
+ submission.
This might include pre-write space accounting updates, or installing
a custom ``->bi_end_io`` function for internal purposes, such as
deferring the ioend completion to a workqueue to run metadata update
- transactions from process context.
+ transactions from process context before submitting the bio.
This function is optional.
- ``discard_folio``: iomap calls this function after ``->map_blocks``
@@ -341,7 +342,7 @@ This can happen in interrupt or process context, depending on the
storage device.
Filesystems that need to update internal bookkeeping (e.g. unwritten
-extent conversions) should provide a ``->prepare_ioend`` function to
+extent conversions) should provide a ``->submit_ioend`` function to
set ``struct iomap_end::bio::bi_end_io`` to its own function.
This function should call ``iomap_finish_ioends`` after finishing its
own work (e.g. unwritten extent conversion).
diff --git a/fs/dax.c b/fs/dax.c
index 21b47402b3dc..7fd4cd9a51f2 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1258,7 +1258,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
}
#endif /* CONFIG_FS_DAX_PMD */
-static s64 dax_unshare_iter(struct iomap_iter *iter)
+static int dax_unshare_iter(struct iomap_iter *iter)
{
struct iomap *iomap = &iter->iomap;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
@@ -1266,11 +1266,11 @@ static s64 dax_unshare_iter(struct iomap_iter *iter)
u64 copy_len = iomap_length(iter);
u32 mod;
int id = 0;
- s64 ret = 0;
+ s64 ret;
void *daddr = NULL, *saddr = NULL;
if (!iomap_want_unshare_iter(iter))
- return iomap_length(iter);
+ return iomap_iter_advance_full(iter);
/*
* Extend the file range to be aligned to fsblock/pagesize, because
@@ -1300,14 +1300,14 @@ static s64 dax_unshare_iter(struct iomap_iter *iter)
if (ret < 0)
goto out_unlock;
- if (copy_mc_to_kernel(daddr, saddr, copy_len) == 0)
- ret = iomap_length(iter);
- else
+ if (copy_mc_to_kernel(daddr, saddr, copy_len) != 0)
ret = -EIO;
out_unlock:
dax_read_unlock(id);
- return dax_mem2blk_err(ret);
+ if (ret < 0)
+ return dax_mem2blk_err(ret);
+ return iomap_iter_advance_full(iter);
}
int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
@@ -1326,7 +1326,7 @@ int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
iter.len = min(len, size - pos);
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = dax_unshare_iter(&iter);
+ iter.status = dax_unshare_iter(&iter);
return ret;
}
EXPORT_SYMBOL_GPL(dax_file_unshare);
@@ -1354,17 +1354,16 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
return ret;
}
-static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
+static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
const struct iomap *iomap = &iter->iomap;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
- loff_t pos = iter->pos;
u64 length = iomap_length(iter);
- s64 written = 0;
+ int ret;
/* already zeroed? we're done. */
if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
- return length;
+ return iomap_iter_advance(iter, &length);
/*
* invalidate the pages whose sharing state is to be changed
@@ -1372,33 +1371,35 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
*/
if (iomap->flags & IOMAP_F_SHARED)
invalidate_inode_pages2_range(iter->inode->i_mapping,
- pos >> PAGE_SHIFT,
- (pos + length - 1) >> PAGE_SHIFT);
+ iter->pos >> PAGE_SHIFT,
+ (iter->pos + length - 1) >> PAGE_SHIFT);
do {
+ loff_t pos = iter->pos;
unsigned offset = offset_in_page(pos);
- unsigned size = min_t(u64, PAGE_SIZE - offset, length);
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
- long rc;
int id;
+ length = min_t(u64, PAGE_SIZE - offset, length);
+
id = dax_read_lock();
- if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
- rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
+ if (IS_ALIGNED(pos, PAGE_SIZE) && length == PAGE_SIZE)
+ ret = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
else
- rc = dax_memzero(iter, pos, size);
+ ret = dax_memzero(iter, pos, length);
dax_read_unlock(id);
- if (rc < 0)
- return rc;
- pos += size;
- length -= size;
- written += size;
+ if (ret < 0)
+ return ret;
+
+ ret = iomap_iter_advance(iter, &length);
+ if (ret)
+ return ret;
} while (length > 0);
if (did_zero)
*did_zero = true;
- return written;
+ return ret;
}
int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
@@ -1413,7 +1414,7 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
int ret;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = dax_zero_iter(&iter, did_zero);
+ iter.status = dax_zero_iter(&iter, did_zero);
return ret;
}
EXPORT_SYMBOL_GPL(dax_zero_range);
@@ -1431,8 +1432,7 @@ int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
-static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
- struct iov_iter *iter)
+static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter)
{
const struct iomap *iomap = &iomi->iomap;
const struct iomap *srcmap = iomap_iter_srcmap(iomi);
@@ -1451,8 +1451,10 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
if (pos >= end)
return 0;
- if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
- return iov_iter_zero(min(length, end - pos), iter);
+ if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) {
+ done = iov_iter_zero(min(length, end - pos), iter);
+ return iomap_iter_advance(iomi, &done);
+ }
}
/*
@@ -1485,7 +1487,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
}
id = dax_read_lock();
- while (pos < end) {
+ while ((pos = iomi->pos) < end) {
unsigned offset = pos & (PAGE_SIZE - 1);
const size_t size = ALIGN(length + offset, PAGE_SIZE);
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
@@ -1535,18 +1537,16 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
map_len, iter);
- pos += xfer;
- length -= xfer;
- done += xfer;
-
- if (xfer == 0)
+ length = xfer;
+ ret = iomap_iter_advance(iomi, &length);
+ if (!ret && xfer == 0)
ret = -EFAULT;
if (xfer < map_len)
break;
}
dax_read_unlock(id);
- return done ? done : ret;
+ return ret;
}
/**
@@ -1586,7 +1586,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
iomi.flags |= IOMAP_NOWAIT;
while ((ret = iomap_iter(&iomi, ops)) > 0)
- iomi.processed = dax_iomap_iter(&iomi, iter);
+ iomi.status = dax_iomap_iter(&iomi, iter);
done = iomi.pos - iocb->ki_pos;
iocb->ki_pos = iomi.pos;
@@ -1757,7 +1757,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
while ((error = iomap_iter(&iter, ops)) > 0) {
if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) {
- iter.processed = -EIO; /* fs corruption? */
+ iter.status = -EIO; /* fs corruption? */
continue;
}
@@ -1769,8 +1769,10 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
ret |= VM_FAULT_MAJOR;
}
- if (!(ret & VM_FAULT_ERROR))
- iter.processed = PAGE_SIZE;
+ if (!(ret & VM_FAULT_ERROR)) {
+ u64 length = PAGE_SIZE;
+ iter.status = iomap_iter_advance(&iter, &length);
+ }
}
if (iomap_errp)
@@ -1883,8 +1885,10 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
continue; /* actually breaks out of the loop */
ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
- if (ret != VM_FAULT_FALLBACK)
- iter.processed = PMD_SIZE;
+ if (ret != VM_FAULT_FALLBACK) {
+ u64 length = PMD_SIZE;
+ iter.status = iomap_iter_advance(&iter, &length);
+ }
}
unlock_entry:
@@ -1999,12 +2003,13 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
-static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
+static int dax_range_compare_iter(struct iomap_iter *it_src,
struct iomap_iter *it_dest, u64 len, bool *same)
{
const struct iomap *smap = &it_src->iomap;
const struct iomap *dmap = &it_dest->iomap;
loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
+ u64 dest_len;
void *saddr, *daddr;
int id, ret;
@@ -2012,7 +2017,7 @@ static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
*same = true;
- return len;
+ goto advance;
}
if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
@@ -2035,7 +2040,13 @@ static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
if (!*same)
len = 0;
dax_read_unlock(id);
- return len;
+
+advance:
+ dest_len = len;
+ ret = iomap_iter_advance(it_src, &len);
+ if (!ret)
+ ret = iomap_iter_advance(it_dest, &dest_len);
+ return ret;
out_unlock:
dax_read_unlock(id);
@@ -2058,15 +2069,15 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
.len = len,
.flags = IOMAP_DAX,
};
- int ret, compared = 0;
+ int ret, status;
while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
(ret = iomap_iter(&dst_iter, ops)) > 0) {
- compared = dax_range_compare_iter(&src_iter, &dst_iter,
+ status = dax_range_compare_iter(&src_iter, &dst_iter,
min(src_iter.len, dst_iter.len), same);
- if (compared < 0)
+ if (status < 0)
return ret;
- src_iter.processed = dst_iter.processed = compared;
+ src_iter.status = dst_iter.status = status;
}
return ret;
}
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 1795c4e8dbf6..366516b98b3f 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1300,7 +1300,8 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from,
unsigned int length)
{
BUG_ON(current->journal_info);
- return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops);
+ return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops,
+ NULL);
}
#define GFS2_JTRUNC_REVOKES 8192
diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
index 381d76c5c232..69e8ebb41302 100644
--- a/fs/iomap/Makefile
+++ b/fs/iomap/Makefile
@@ -12,6 +12,7 @@ iomap-y += trace.o \
iter.o
iomap-$(CONFIG_BLOCK) += buffered-io.o \
direct-io.o \
+ ioend.o \
fiemap.o \
seek.o
iomap-$(CONFIG_SWAP) += swapfile.o
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index d303e6c8900c..d52cfdc299c4 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -12,17 +12,15 @@
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/writeback.h>
-#include <linux/list_sort.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/sched/signal.h>
#include <linux/migrate.h>
+#include "internal.h"
#include "trace.h"
#include "../internal.h"
-#define IOEND_BATCH_SIZE 4096
-
/*
* Structure allocated for each folio to track per-block uptodate, dirty state
* and I/O completions.
@@ -40,8 +38,6 @@ struct iomap_folio_state {
unsigned long state[];
};
-static struct bio_set iomap_ioend_bioset;
-
static inline bool ifs_is_fully_uptodate(struct folio *folio,
struct iomap_folio_state *ifs)
{
@@ -366,15 +362,14 @@ static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
pos >= i_size_read(iter->inode);
}
-static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
- struct iomap_readpage_ctx *ctx, loff_t offset)
+static int iomap_readpage_iter(struct iomap_iter *iter,
+ struct iomap_readpage_ctx *ctx)
{
const struct iomap *iomap = &iter->iomap;
- loff_t pos = iter->pos + offset;
- loff_t length = iomap_length(iter) - offset;
+ loff_t pos = iter->pos;
+ loff_t length = iomap_length(iter);
struct folio *folio = ctx->cur_folio;
struct iomap_folio_state *ifs;
- loff_t orig_pos = pos;
size_t poff, plen;
sector_t sector;
@@ -438,25 +433,22 @@ done:
* we can skip trailing ones as they will be handled in the next
* iteration.
*/
- return pos - orig_pos + plen;
+ length = pos - iter->pos + plen;
+ return iomap_iter_advance(iter, &length);
}
-static loff_t iomap_read_folio_iter(const struct iomap_iter *iter,
+static int iomap_read_folio_iter(struct iomap_iter *iter,
struct iomap_readpage_ctx *ctx)
{
- struct folio *folio = ctx->cur_folio;
- size_t offset = offset_in_folio(folio, iter->pos);
- loff_t length = min_t(loff_t, folio_size(folio) - offset,
- iomap_length(iter));
- loff_t done, ret;
-
- for (done = 0; done < length; done += ret) {
- ret = iomap_readpage_iter(iter, ctx, done);
- if (ret <= 0)
+ int ret;
+
+ while (iomap_length(iter)) {
+ ret = iomap_readpage_iter(iter, ctx);
+ if (ret)
return ret;
}
- return done;
+ return 0;
}
int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
@@ -474,7 +466,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
trace_iomap_readpage(iter.inode, 1);
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_read_folio_iter(&iter, &ctx);
+ iter.status = iomap_read_folio_iter(&iter, &ctx);
if (ctx.bio) {
submit_bio(ctx.bio);
@@ -493,15 +485,14 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
}
EXPORT_SYMBOL_GPL(iomap_read_folio);
-static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
+static int iomap_readahead_iter(struct iomap_iter *iter,
struct iomap_readpage_ctx *ctx)
{
- loff_t length = iomap_length(iter);
- loff_t done, ret;
+ int ret;
- for (done = 0; done < length; done += ret) {
+ while (iomap_length(iter)) {
if (ctx->cur_folio &&
- offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) {
+ offset_in_folio(ctx->cur_folio, iter->pos) == 0) {
if (!ctx->cur_folio_in_bio)
folio_unlock(ctx->cur_folio);
ctx->cur_folio = NULL;
@@ -510,12 +501,12 @@ static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
ctx->cur_folio = readahead_folio(ctx->rac);
ctx->cur_folio_in_bio = false;
}
- ret = iomap_readpage_iter(iter, ctx, done);
- if (ret <= 0)
+ ret = iomap_readpage_iter(iter, ctx);
+ if (ret)
return ret;
}
- return done;
+ return 0;
}
/**
@@ -547,7 +538,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
trace_iomap_readahead(rac->mapping->host, readahead_count(rac));
while (iomap_iter(&iter, ops) > 0)
- iter.processed = iomap_readahead_iter(&iter, &ctx);
+ iter.status = iomap_readahead_iter(&iter, &ctx);
if (ctx.bio)
submit_bio(ctx.bio);
@@ -603,6 +594,8 @@ struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
if (iter->flags & IOMAP_NOWAIT)
fgp |= FGP_NOWAIT;
+ if (iter->flags & IOMAP_DONTCACHE)
+ fgp |= FGP_DONTCACHE;
fgp |= fgf_set_order(len);
return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
@@ -907,12 +900,10 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
return __iomap_write_end(iter->inode, pos, len, copied, folio);
}
-static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
+static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
{
- loff_t length = iomap_length(iter);
- loff_t pos = iter->pos;
ssize_t total_written = 0;
- long status = 0;
+ int status = 0;
struct address_space *mapping = iter->inode->i_mapping;
size_t chunk = mapping_max_folio_size(mapping);
unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
@@ -923,7 +914,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
size_t offset; /* Offset into folio */
size_t bytes; /* Bytes to write to folio */
size_t copied; /* Bytes copied from user */
- size_t written; /* Bytes have been written */
+ u64 written; /* Bytes have been written */
+ loff_t pos = iter->pos;
bytes = iov_iter_count(i);
retry:
@@ -934,8 +926,8 @@ retry:
if (unlikely(status))
break;
- if (bytes > length)
- bytes = length;
+ if (bytes > iomap_length(iter))
+ bytes = iomap_length(iter);
/*
* Bring in the user page that we'll copy from _first_.
@@ -1006,17 +998,12 @@ retry:
goto retry;
}
} else {
- pos += written;
total_written += written;
- length -= written;
+ iomap_iter_advance(iter, &written);
}
- } while (iov_iter_count(i) && length);
+ } while (iov_iter_count(i) && iomap_length(iter));
- if (status == -EAGAIN) {
- iov_iter_revert(i, total_written);
- return -EAGAIN;
- }
- return total_written ? total_written : status;
+ return total_written ? 0 : status;
}
ssize_t
@@ -1034,9 +1021,11 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
if (iocb->ki_flags & IOCB_NOWAIT)
iter.flags |= IOMAP_NOWAIT;
+ if (iocb->ki_flags & IOCB_DONTCACHE)
+ iter.flags |= IOMAP_DONTCACHE;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_write_iter(&iter, i);
+ iter.status = iomap_write_iter(&iter, i);
if (unlikely(iter.pos == iocb->ki_pos))
return ret;
@@ -1270,23 +1259,22 @@ void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
}
EXPORT_SYMBOL_GPL(iomap_write_delalloc_release);
-static loff_t iomap_unshare_iter(struct iomap_iter *iter)
+static int iomap_unshare_iter(struct iomap_iter *iter)
{
struct iomap *iomap = &iter->iomap;
- loff_t pos = iter->pos;
- loff_t length = iomap_length(iter);
- loff_t written = 0;
+ u64 bytes = iomap_length(iter);
+ int status;
if (!iomap_want_unshare_iter(iter))
- return length;
+ return iomap_iter_advance(iter, &bytes);
do {
struct folio *folio;
- int status;
size_t offset;
- size_t bytes = min_t(u64, SIZE_MAX, length);
+ loff_t pos = iter->pos;
bool ret;
+ bytes = min_t(u64, SIZE_MAX, bytes);
status = iomap_write_begin(iter, pos, bytes, &folio);
if (unlikely(status))
return status;
@@ -1304,14 +1292,14 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
cond_resched();
- pos += bytes;
- written += bytes;
- length -= bytes;
-
balance_dirty_pages_ratelimited(iter->inode->i_mapping);
- } while (length > 0);
- return written;
+ status = iomap_iter_advance(iter, &bytes);
+ if (status)
+ break;
+ } while (bytes > 0);
+
+ return status;
}
int
@@ -1331,7 +1319,7 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
iter.len = min(len, size - pos);
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_unshare_iter(&iter);
+ iter.status = iomap_unshare_iter(&iter);
return ret;
}
EXPORT_SYMBOL_GPL(iomap_file_unshare);
@@ -1350,19 +1338,18 @@ static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i)
return filemap_write_and_wait_range(mapping, i->pos, end);
}
-static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
+static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
- loff_t pos = iter->pos;
- loff_t length = iomap_length(iter);
- loff_t written = 0;
+ u64 bytes = iomap_length(iter);
+ int status;
do {
struct folio *folio;
- int status;
size_t offset;
- size_t bytes = min_t(u64, SIZE_MAX, length);
+ loff_t pos = iter->pos;
bool ret;
+ bytes = min_t(u64, SIZE_MAX, bytes);
status = iomap_write_begin(iter, pos, bytes, &folio);
if (status)
return status;
@@ -1383,25 +1370,26 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
if (WARN_ON_ONCE(!ret))
return -EIO;
- pos += bytes;
- length -= bytes;
- written += bytes;
- } while (length > 0);
+ status = iomap_iter_advance(iter, &bytes);
+ if (status)
+ break;
+ } while (bytes > 0);
if (did_zero)
*did_zero = true;
- return written;
+ return status;
}
int
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
- const struct iomap_ops *ops)
+ const struct iomap_ops *ops, void *private)
{
struct iomap_iter iter = {
.inode = inode,
.pos = pos,
.len = len,
.flags = IOMAP_ZERO,
+ .private = private,
};
struct address_space *mapping = inode->i_mapping;
unsigned int blocksize = i_blocksize(inode);
@@ -1424,7 +1412,7 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) {
iter.len = plen;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_zero_iter(&iter, did_zero);
+ iter.status = iomap_zero_iter(&iter, did_zero);
iter.len = len - (iter.pos - pos);
if (ret || !iter.len)
@@ -1443,17 +1431,19 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
if (srcmap->type == IOMAP_HOLE ||
srcmap->type == IOMAP_UNWRITTEN) {
- loff_t proc = iomap_length(&iter);
+ s64 status;
if (range_dirty) {
range_dirty = false;
- proc = iomap_zero_iter_flush_and_stale(&iter);
+ status = iomap_zero_iter_flush_and_stale(&iter);
+ } else {
+ status = iomap_iter_advance_full(&iter);
}
- iter.processed = proc;
+ iter.status = status;
continue;
}
- iter.processed = iomap_zero_iter(&iter, did_zero);
+ iter.status = iomap_zero_iter(&iter, did_zero);
}
return ret;
}
@@ -1461,7 +1451,7 @@ EXPORT_SYMBOL_GPL(iomap_zero_range);
int
iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
- const struct iomap_ops *ops)
+ const struct iomap_ops *ops, void *private)
{
unsigned int blocksize = i_blocksize(inode);
unsigned int off = pos & (blocksize - 1);
@@ -1469,11 +1459,12 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
/* Block boundary? Nothing to do */
if (!off)
return 0;
- return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
+ return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops,
+ private);
}
EXPORT_SYMBOL_GPL(iomap_truncate_page);
-static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
+static int iomap_folio_mkwrite_iter(struct iomap_iter *iter,
struct folio *folio)
{
loff_t length = iomap_length(iter);
@@ -1490,14 +1481,16 @@ static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
folio_mark_dirty(folio);
}
- return length;
+ return iomap_iter_advance(iter, &length);
}
-vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
+vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
+ void *private)
{
struct iomap_iter iter = {
.inode = file_inode(vmf->vma->vm_file),
.flags = IOMAP_WRITE | IOMAP_FAULT,
+ .private = private,
};
struct folio *folio = page_folio(vmf->page);
ssize_t ret;
@@ -1509,7 +1502,7 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
iter.pos = folio_pos(folio);
iter.len = ret;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_folio_mkwrite_iter(&iter, folio);
+ iter.status = iomap_folio_mkwrite_iter(&iter, folio);
if (ret < 0)
goto out_unlock;
@@ -1538,16 +1531,15 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
* state, release holds on bios, and finally free up memory. Do not use the
* ioend after this.
*/
-static u32
-iomap_finish_ioend(struct iomap_ioend *ioend, int error)
+u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
{
struct inode *inode = ioend->io_inode;
struct bio *bio = &ioend->io_bio;
struct folio_iter fi;
u32 folio_count = 0;
- if (error) {
- mapping_set_error(inode->i_mapping, error);
+ if (ioend->io_error) {
+ mapping_set_error(inode->i_mapping, ioend->io_error);
if (!bio_flagged(bio, BIO_QUIET)) {
pr_err_ratelimited(
"%s: writeback error on inode %lu, offset %lld, sector %llu",
@@ -1566,116 +1558,16 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error)
return folio_count;
}
-/*
- * Ioend completion routine for merged bios. This can only be called from task
- * contexts as merged ioends can be of unbound length. Hence we have to break up
- * the writeback completions into manageable chunks to avoid long scheduler
- * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
- * good batch processing throughput without creating adverse scheduler latency
- * conditions.
- */
-void
-iomap_finish_ioends(struct iomap_ioend *ioend, int error)
-{
- struct list_head tmp;
- u32 completions;
-
- might_sleep();
-
- list_replace_init(&ioend->io_list, &tmp);
- completions = iomap_finish_ioend(ioend, error);
-
- while (!list_empty(&tmp)) {
- if (completions > IOEND_BATCH_SIZE * 8) {
- cond_resched();
- completions = 0;
- }
- ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
- list_del_init(&ioend->io_list);
- completions += iomap_finish_ioend(ioend, error);
- }
-}
-EXPORT_SYMBOL_GPL(iomap_finish_ioends);
-
-/*
- * We can merge two adjacent ioends if they have the same set of work to do.
- */
-static bool
-iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
-{
- if (ioend->io_bio.bi_status != next->io_bio.bi_status)
- return false;
- if (next->io_flags & IOMAP_F_BOUNDARY)
- return false;
- if ((ioend->io_flags & IOMAP_F_SHARED) ^
- (next->io_flags & IOMAP_F_SHARED))
- return false;
- if ((ioend->io_type == IOMAP_UNWRITTEN) ^
- (next->io_type == IOMAP_UNWRITTEN))
- return false;
- if (ioend->io_offset + ioend->io_size != next->io_offset)
- return false;
- /*
- * Do not merge physically discontiguous ioends. The filesystem
- * completion functions will have to iterate the physical
- * discontiguities even if we merge the ioends at a logical level, so
- * we don't gain anything by merging physical discontiguities here.
- *
- * We cannot use bio->bi_iter.bi_sector here as it is modified during
- * submission so does not point to the start sector of the bio at
- * completion.
- */
- if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector)
- return false;
- return true;
-}
-
-void
-iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
-{
- struct iomap_ioend *next;
-
- INIT_LIST_HEAD(&ioend->io_list);
-
- while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
- io_list))) {
- if (!iomap_ioend_can_merge(ioend, next))
- break;
- list_move_tail(&next->io_list, &ioend->io_list);
- ioend->io_size += next->io_size;
- }
-}
-EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
-
-static int
-iomap_ioend_compare(void *priv, const struct list_head *a,
- const struct list_head *b)
-{
- struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
- struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
-
- if (ia->io_offset < ib->io_offset)
- return -1;
- if (ia->io_offset > ib->io_offset)
- return 1;
- return 0;
-}
-
-void
-iomap_sort_ioends(struct list_head *ioend_list)
-{
- list_sort(NULL, ioend_list, iomap_ioend_compare);
-}
-EXPORT_SYMBOL_GPL(iomap_sort_ioends);
-
static void iomap_writepage_end_bio(struct bio *bio)
{
- iomap_finish_ioend(iomap_ioend_from_bio(bio),
- blk_status_to_errno(bio->bi_status));
+ struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
+
+ ioend->io_error = blk_status_to_errno(bio->bi_status);
+ iomap_finish_ioend_buffered(ioend);
}
/*
- * Submit the final bio for an ioend.
+ * Submit an ioend.
*
* If @error is non-zero, it means that we have a situation where some part of
* the submission process has failed after we've marked pages for writeback.
@@ -1694,14 +1586,18 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
* failure happened so that the file system end I/O handler gets called
* to clean up.
*/
- if (wpc->ops->prepare_ioend)
- error = wpc->ops->prepare_ioend(wpc->ioend, error);
+ if (wpc->ops->submit_ioend) {
+ error = wpc->ops->submit_ioend(wpc, error);
+ } else {
+ if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE))
+ error = -EIO;
+ if (!error)
+ submit_bio(&wpc->ioend->io_bio);
+ }
if (error) {
wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
bio_endio(&wpc->ioend->io_bio);
- } else {
- submit_bio(&wpc->ioend->io_bio);
}
wpc->ioend = NULL;
@@ -1709,9 +1605,9 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
}
static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct inode *inode, loff_t pos)
+ struct writeback_control *wbc, struct inode *inode, loff_t pos,
+ u16 ioend_flags)
{
- struct iomap_ioend *ioend;
struct bio *bio;
bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
@@ -1719,36 +1615,24 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
GFP_NOFS, &iomap_ioend_bioset);
bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
bio->bi_end_io = iomap_writepage_end_bio;
- wbc_init_bio(wbc, bio);
bio->bi_write_hint = inode->i_write_hint;
-
- ioend = iomap_ioend_from_bio(bio);
- INIT_LIST_HEAD(&ioend->io_list);
- ioend->io_type = wpc->iomap.type;
- ioend->io_flags = wpc->iomap.flags;
- if (pos > wpc->iomap.offset)
- wpc->iomap.flags &= ~IOMAP_F_BOUNDARY;
- ioend->io_inode = inode;
- ioend->io_size = 0;
- ioend->io_offset = pos;
- ioend->io_sector = bio->bi_iter.bi_sector;
-
+ wbc_init_bio(wbc, bio);
wpc->nr_folios = 0;
- return ioend;
+ return iomap_init_ioend(inode, bio, pos, ioend_flags);
}
-static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
+static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
+ u16 ioend_flags)
{
- if (wpc->iomap.offset == pos && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
+ if (ioend_flags & IOMAP_IOEND_BOUNDARY)
return false;
- if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
- (wpc->ioend->io_flags & IOMAP_F_SHARED))
- return false;
- if (wpc->iomap.type != wpc->ioend->io_type)
+ if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
+ (wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
return false;
if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
return false;
- if (iomap_sector(&wpc->iomap, pos) !=
+ if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) &&
+ iomap_sector(&wpc->iomap, pos) !=
bio_end_sector(&wpc->ioend->io_bio))
return false;
/*
@@ -1779,14 +1663,23 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
{
struct iomap_folio_state *ifs = folio->private;
size_t poff = offset_in_folio(folio, pos);
+ unsigned int ioend_flags = 0;
int error;
- if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
+ if (wpc->iomap.type == IOMAP_UNWRITTEN)
+ ioend_flags |= IOMAP_IOEND_UNWRITTEN;
+ if (wpc->iomap.flags & IOMAP_F_SHARED)
+ ioend_flags |= IOMAP_IOEND_SHARED;
+ if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
+ ioend_flags |= IOMAP_IOEND_BOUNDARY;
+
+ if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
new_ioend:
error = iomap_submit_ioend(wpc, 0);
if (error)
return error;
- wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
+ wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos,
+ ioend_flags);
}
if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
@@ -2062,11 +1955,3 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
return iomap_submit_ioend(wpc, error);
}
EXPORT_SYMBOL_GPL(iomap_writepages);
-
-static int __init iomap_buffered_init(void)
-{
- return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
- offsetof(struct iomap_ioend, io_bio),
- BIOSET_NEED_BVECS);
-}
-fs_initcall(iomap_buffered_init);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index b521eb15759e..e1e32e2bb0bf 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2010 Red Hat, Inc.
- * Copyright (c) 2016-2021 Christoph Hellwig.
+ * Copyright (c) 2016-2025 Christoph Hellwig.
*/
#include <linux/module.h>
#include <linux/compiler.h>
@@ -12,6 +12,7 @@
#include <linux/backing-dev.h>
#include <linux/uio.h>
#include <linux/task_io_accounting_ops.h>
+#include "internal.h"
#include "trace.h"
#include "../internal.h"
@@ -20,6 +21,7 @@
* Private flags for iomap_dio, must not overlap with the public ones in
* iomap.h:
*/
+#define IOMAP_DIO_NO_INVALIDATE (1U << 25)
#define IOMAP_DIO_CALLER_COMP (1U << 26)
#define IOMAP_DIO_INLINE_COMP (1U << 27)
#define IOMAP_DIO_WRITE_THROUGH (1U << 28)
@@ -81,10 +83,12 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter,
WRITE_ONCE(iocb->private, bio);
}
- if (dio->dops && dio->dops->submit_io)
+ if (dio->dops && dio->dops->submit_io) {
dio->dops->submit_io(iter, bio, pos);
- else
+ } else {
+ WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_ANON_WRITE);
submit_bio(bio);
+ }
}
ssize_t iomap_dio_complete(struct iomap_dio *dio)
@@ -117,7 +121,8 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
* ->end_io() when necessary, otherwise a racing buffer read would cache
* zeros from unwritten extents.
*/
- if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE))
+ if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) &&
+ !(dio->flags & IOMAP_DIO_NO_INVALIDATE))
kiocb_invalidate_post_direct_write(iocb, dio->size);
inode_dio_end(file_inode(iocb->ki_filp));
@@ -163,43 +168,31 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
cmpxchg(&dio->error, 0, ret);
}
-void iomap_dio_bio_end_io(struct bio *bio)
+/*
+ * Called when dio->ref reaches zero from an I/O completion.
+ */
+static void iomap_dio_done(struct iomap_dio *dio)
{
- struct iomap_dio *dio = bio->bi_private;
- bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
struct kiocb *iocb = dio->iocb;
- if (bio->bi_status)
- iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
- if (!atomic_dec_and_test(&dio->ref))
- goto release_bio;
-
- /*
- * Synchronous dio, task itself will handle any completion work
- * that needs after IO. All we need to do is wake the task.
- */
if (dio->wait_for_completion) {
+ /*
+ * Synchronous I/O, task itself will handle any completion work
+ * that needs after IO. All we need to do is wake the task.
+ */
struct task_struct *waiter = dio->submit.waiter;
WRITE_ONCE(dio->submit.waiter, NULL);
blk_wake_io_task(waiter);
- goto release_bio;
- }
-
- /*
- * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline
- */
- if (dio->flags & IOMAP_DIO_INLINE_COMP) {
+ } else if (dio->flags & IOMAP_DIO_INLINE_COMP) {
WRITE_ONCE(iocb->private, NULL);
iomap_dio_complete_work(&dio->aio.work);
- goto release_bio;
- }
-
- /*
- * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
- * our completion that way to avoid an async punt to a workqueue.
- */
- if (dio->flags & IOMAP_DIO_CALLER_COMP) {
+ } else if (dio->flags & IOMAP_DIO_CALLER_COMP) {
+ /*
+ * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then
+ * schedule our completion that way to avoid an async punt to a
+ * workqueue.
+ */
/* only polled IO cares about private cleared */
iocb->private = dio;
iocb->dio_complete = iomap_dio_deferred_complete;
@@ -217,19 +210,31 @@ void iomap_dio_bio_end_io(struct bio *bio)
* issuer.
*/
iocb->ki_complete(iocb, 0);
- goto release_bio;
+ } else {
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ /*
+ * Async DIO completion that requires filesystem level
+ * completion work gets punted to a work queue to complete as
+ * the operation may require more IO to be issued to finalise
+ * filesystem metadata changes or guarantee data integrity.
+ */
+ INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
+ queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
}
+}
+
+void iomap_dio_bio_end_io(struct bio *bio)
+{
+ struct iomap_dio *dio = bio->bi_private;
+ bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+
+ if (bio->bi_status)
+ iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
+
+ if (atomic_dec_and_test(&dio->ref))
+ iomap_dio_done(dio);
- /*
- * Async DIO completion that requires filesystem level completion work
- * gets punted to a work queue to complete as the operation may require
- * more IO to be issued to finalise filesystem metadata changes or
- * guarantee data integrity.
- */
- INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
- queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq,
- &dio->aio.work);
-release_bio:
if (should_dirty) {
bio_check_pages_dirty(bio);
} else {
@@ -239,6 +244,47 @@ release_bio:
}
EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
+u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
+{
+ struct iomap_dio *dio = ioend->io_bio.bi_private;
+ bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+ u32 vec_count = ioend->io_bio.bi_vcnt;
+
+ if (ioend->io_error)
+ iomap_dio_set_error(dio, ioend->io_error);
+
+ if (atomic_dec_and_test(&dio->ref)) {
+ /*
+ * Try to avoid another context switch for the completion given
+ * that we are already called from the ioend completion
+ * workqueue, but never invalidate pages from this thread to
+ * avoid deadlocks with buffered I/O completions. Tough luck if
+ * you hit the tiny race with someone dirtying the range now
+ * between this check and the actual completion.
+ */
+ if (!dio->iocb->ki_filp->f_mapping->nrpages) {
+ dio->flags |= IOMAP_DIO_INLINE_COMP;
+ dio->flags |= IOMAP_DIO_NO_INVALIDATE;
+ }
+ dio->flags &= ~IOMAP_DIO_CALLER_COMP;
+ iomap_dio_done(dio);
+ }
+
+ if (should_dirty) {
+ bio_check_pages_dirty(&ioend->io_bio);
+ } else {
+ bio_release_pages(&ioend->io_bio, false);
+ bio_put(&ioend->io_bio);
+ }
+
+ /*
+ * Return the number of bvecs completed as even direct I/O completions
+ * do significant per-folio work and we'll still want to give up the
+ * CPU after a lot of completions.
+ */
+ return vec_count;
+}
+
static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
loff_t pos, unsigned len)
{
@@ -289,8 +335,7 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
return opflags;
}
-static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
- struct iomap_dio *dio)
+static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
{
const struct iomap *iomap = &iter->iomap;
struct inode *inode = iter->inode;
@@ -303,7 +348,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
bool need_zeroout = false;
bool use_fua = false;
int nr_pages, ret = 0;
- size_t copied = 0;
+ u64 copied = 0;
size_t orig_count;
if (atomic && length != fs_block_size)
@@ -467,30 +512,28 @@ out:
/* Undo iter limitation to current extent */
iov_iter_reexpand(dio->submit.iter, orig_count - copied);
if (copied)
- return copied;
+ return iomap_iter_advance(iter, &copied);
return ret;
}
-static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter,
- struct iomap_dio *dio)
+static int iomap_dio_hole_iter(struct iomap_iter *iter, struct iomap_dio *dio)
{
loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter);
dio->size += length;
if (!length)
return -EFAULT;
- return length;
+ return iomap_iter_advance(iter, &length);
}
-static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
- struct iomap_dio *dio)
+static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio)
{
const struct iomap *iomap = &iomi->iomap;
struct iov_iter *iter = dio->submit.iter;
void *inline_data = iomap_inline_data(iomap, iomi->pos);
loff_t length = iomap_length(iomi);
loff_t pos = iomi->pos;
- size_t copied;
+ u64 copied;
if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap)))
return -EIO;
@@ -512,11 +555,10 @@ static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
dio->size += copied;
if (!copied)
return -EFAULT;
- return copied;
+ return iomap_iter_advance(iomi, &copied);
}
-static loff_t iomap_dio_iter(const struct iomap_iter *iter,
- struct iomap_dio *dio)
+static int iomap_dio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
{
switch (iter->iomap.type) {
case IOMAP_HOLE:
@@ -700,7 +742,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
blk_start_plug(&plug);
while ((ret = iomap_iter(&iomi, ops)) > 0) {
- iomi.processed = iomap_dio_iter(&iomi, dio);
+ iomi.status = iomap_dio_iter(&iomi, dio);
/*
* We can only poll for single bio I/Os.
diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c
index 610ca6f1ec9b..80675c42e94e 100644
--- a/fs/iomap/fiemap.c
+++ b/fs/iomap/fiemap.c
@@ -39,24 +39,23 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
iomap->length, flags);
}
-static loff_t iomap_fiemap_iter(const struct iomap_iter *iter,
+static int iomap_fiemap_iter(struct iomap_iter *iter,
struct fiemap_extent_info *fi, struct iomap *prev)
{
int ret;
if (iter->iomap.type == IOMAP_HOLE)
- return iomap_length(iter);
+ goto advance;
ret = iomap_to_fiemap(fi, prev, 0);
*prev = iter->iomap;
- switch (ret) {
- case 0: /* success */
- return iomap_length(iter);
- case 1: /* extent array full */
- return 0;
- default: /* error */
+ if (ret < 0)
return ret;
- }
+ if (ret == 1) /* extent array full */
+ return 0;
+
+advance:
+ return iomap_iter_advance_full(iter);
}
int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
@@ -78,7 +77,7 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
return ret;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_fiemap_iter(&iter, fi, &prev);
+ iter.status = iomap_fiemap_iter(&iter, fi, &prev);
if (prev.type != IOMAP_HOLE) {
ret = iomap_to_fiemap(fi, &prev, FIEMAP_EXTENT_LAST);
@@ -114,7 +113,7 @@ iomap_bmap(struct address_space *mapping, sector_t bno,
while ((ret = iomap_iter(&iter, ops)) > 0) {
if (iter.iomap.type == IOMAP_MAPPED)
bno = iomap_sector(&iter.iomap, iter.pos) >> blkshift;
- /* leave iter.processed unset to abort loop */
+ /* leave iter.status unset to abort loop */
}
if (ret)
return 0;
diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h
new file mode 100644
index 000000000000..f6992a3bf66a
--- /dev/null
+++ b/fs/iomap/internal.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _IOMAP_INTERNAL_H
+#define _IOMAP_INTERNAL_H 1
+
+#define IOEND_BATCH_SIZE 4096
+
+u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend);
+u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend);
+
+#endif /* _IOMAP_INTERNAL_H */
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
new file mode 100644
index 000000000000..18894ebba6db
--- /dev/null
+++ b/fs/iomap/ioend.c
@@ -0,0 +1,216 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2024-2025 Christoph Hellwig.
+ */
+#include <linux/iomap.h>
+#include <linux/list_sort.h>
+#include "internal.h"
+
+struct bio_set iomap_ioend_bioset;
+EXPORT_SYMBOL_GPL(iomap_ioend_bioset);
+
+struct iomap_ioend *iomap_init_ioend(struct inode *inode,
+ struct bio *bio, loff_t file_offset, u16 ioend_flags)
+{
+ struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
+
+ atomic_set(&ioend->io_remaining, 1);
+ ioend->io_error = 0;
+ ioend->io_parent = NULL;
+ INIT_LIST_HEAD(&ioend->io_list);
+ ioend->io_flags = ioend_flags;
+ ioend->io_inode = inode;
+ ioend->io_offset = file_offset;
+ ioend->io_size = bio->bi_iter.bi_size;
+ ioend->io_sector = bio->bi_iter.bi_sector;
+ ioend->io_private = NULL;
+ return ioend;
+}
+EXPORT_SYMBOL_GPL(iomap_init_ioend);
+
+static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error)
+{
+ if (ioend->io_parent) {
+ struct bio *bio = &ioend->io_bio;
+
+ ioend = ioend->io_parent;
+ bio_put(bio);
+ }
+
+ if (error)
+ cmpxchg(&ioend->io_error, 0, error);
+
+ if (!atomic_dec_and_test(&ioend->io_remaining))
+ return 0;
+ if (ioend->io_flags & IOMAP_IOEND_DIRECT)
+ return iomap_finish_ioend_direct(ioend);
+ return iomap_finish_ioend_buffered(ioend);
+}
+
+/*
+ * Ioend completion routine for merged bios. This can only be called from task
+ * contexts as merged ioends can be of unbound length. Hence we have to break up
+ * the writeback completions into manageable chunks to avoid long scheduler
+ * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
+ * good batch processing throughput without creating adverse scheduler latency
+ * conditions.
+ */
+void iomap_finish_ioends(struct iomap_ioend *ioend, int error)
+{
+ struct list_head tmp;
+ u32 completions;
+
+ might_sleep();
+
+ list_replace_init(&ioend->io_list, &tmp);
+ completions = iomap_finish_ioend(ioend, error);
+
+ while (!list_empty(&tmp)) {
+ if (completions > IOEND_BATCH_SIZE * 8) {
+ cond_resched();
+ completions = 0;
+ }
+ ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
+ list_del_init(&ioend->io_list);
+ completions += iomap_finish_ioend(ioend, error);
+ }
+}
+EXPORT_SYMBOL_GPL(iomap_finish_ioends);
+
+/*
+ * We can merge two adjacent ioends if they have the same set of work to do.
+ */
+static bool iomap_ioend_can_merge(struct iomap_ioend *ioend,
+ struct iomap_ioend *next)
+{
+ if (ioend->io_bio.bi_status != next->io_bio.bi_status)
+ return false;
+ if (next->io_flags & IOMAP_IOEND_BOUNDARY)
+ return false;
+ if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
+ (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
+ return false;
+ if (ioend->io_offset + ioend->io_size != next->io_offset)
+ return false;
+ /*
+ * Do not merge physically discontiguous ioends. The filesystem
+ * completion functions will have to iterate the physical
+ * discontiguities even if we merge the ioends at a logical level, so
+ * we don't gain anything by merging physical discontiguities here.
+ *
+ * We cannot use bio->bi_iter.bi_sector here as it is modified during
+ * submission so does not point to the start sector of the bio at
+ * completion.
+ */
+ if (ioend->io_sector + (ioend->io_size >> SECTOR_SHIFT) !=
+ next->io_sector)
+ return false;
+ return true;
+}
+
+void iomap_ioend_try_merge(struct iomap_ioend *ioend,
+ struct list_head *more_ioends)
+{
+ struct iomap_ioend *next;
+
+ INIT_LIST_HEAD(&ioend->io_list);
+
+ while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
+ io_list))) {
+ if (!iomap_ioend_can_merge(ioend, next))
+ break;
+ list_move_tail(&next->io_list, &ioend->io_list);
+ ioend->io_size += next->io_size;
+ }
+}
+EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
+
+static int iomap_ioend_compare(void *priv, const struct list_head *a,
+ const struct list_head *b)
+{
+ struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
+ struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
+
+ if (ia->io_offset < ib->io_offset)
+ return -1;
+ if (ia->io_offset > ib->io_offset)
+ return 1;
+ return 0;
+}
+
+void iomap_sort_ioends(struct list_head *ioend_list)
+{
+ list_sort(NULL, ioend_list, iomap_ioend_compare);
+}
+EXPORT_SYMBOL_GPL(iomap_sort_ioends);
+
+/*
+ * Split up to the first @max_len bytes from @ioend if the ioend covers more
+ * than @max_len bytes.
+ *
+ * If @is_append is set, the split will be based on the hardware limits for
+ * REQ_OP_ZONE_APPEND commands and can be less than @max_len if the hardware
+ * limits don't allow the entire @max_len length.
+ *
+ * The bio embedded into @ioend must be a REQ_OP_WRITE because the block layer
+ * does not allow splitting REQ_OP_ZONE_APPEND bios. The file systems has to
+ * switch the operation after this call, but before submitting the bio.
+ */
+struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend,
+ unsigned int max_len, bool is_append)
+{
+ struct bio *bio = &ioend->io_bio;
+ struct iomap_ioend *split_ioend;
+ unsigned int nr_segs;
+ int sector_offset;
+ struct bio *split;
+
+ if (is_append) {
+ struct queue_limits *lim = bdev_limits(bio->bi_bdev);
+
+ max_len = min(max_len,
+ lim->max_zone_append_sectors << SECTOR_SHIFT);
+
+ sector_offset = bio_split_rw_at(bio, lim, &nr_segs, max_len);
+ if (unlikely(sector_offset < 0))
+ return ERR_PTR(sector_offset);
+ if (!sector_offset)
+ return NULL;
+ } else {
+ if (bio->bi_iter.bi_size <= max_len)
+ return NULL;
+ sector_offset = max_len >> SECTOR_SHIFT;
+ }
+
+ /* ensure the split ioend is still block size aligned */
+ sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT,
+ i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT;
+
+ split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset);
+ if (IS_ERR(split))
+ return ERR_CAST(split);
+ split->bi_private = bio->bi_private;
+ split->bi_end_io = bio->bi_end_io;
+
+ split_ioend = iomap_init_ioend(ioend->io_inode, split, ioend->io_offset,
+ ioend->io_flags);
+ split_ioend->io_parent = ioend;
+
+ atomic_inc(&ioend->io_remaining);
+ ioend->io_offset += split_ioend->io_size;
+ ioend->io_size -= split_ioend->io_size;
+
+ split_ioend->io_sector = ioend->io_sector;
+ if (!is_append)
+ ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT);
+ return split_ioend;
+}
+EXPORT_SYMBOL_GPL(iomap_split_ioend);
+
+static int __init iomap_ioend_init(void)
+{
+ return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
+ offsetof(struct iomap_ioend, io_bio),
+ BIOSET_NEED_BVECS);
+}
+fs_initcall(iomap_ioend_init);
diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c
index 3790918646af..6ffc6a7b9ba5 100644
--- a/fs/iomap/iter.c
+++ b/fs/iomap/iter.c
@@ -7,40 +7,25 @@
#include <linux/iomap.h>
#include "trace.h"
-/*
- * Advance to the next range we need to map.
- *
- * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully
- * processed - it was aborted because the extent the iomap spanned may have been
- * changed during the operation. In this case, the iteration behaviour is to
- * remap the unprocessed range of the iter, and that means we may need to remap
- * even when we've made no progress (i.e. iter->processed = 0). Hence the
- * "finished iterating" case needs to distinguish between
- * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we
- * need to remap the entire remaining range.
- */
-static inline int iomap_iter_advance(struct iomap_iter *iter)
+static inline void iomap_iter_reset_iomap(struct iomap_iter *iter)
{
- bool stale = iter->iomap.flags & IOMAP_F_STALE;
- int ret = 1;
-
- /* handle the previous iteration (if any) */
- if (iter->iomap.length) {
- if (iter->processed < 0)
- return iter->processed;
- if (WARN_ON_ONCE(iter->processed > iomap_length(iter)))
- return -EIO;
- iter->pos += iter->processed;
- iter->len -= iter->processed;
- if (!iter->len || (!iter->processed && !stale))
- ret = 0;
- }
-
- /* clear the per iteration state */
- iter->processed = 0;
+ iter->status = 0;
memset(&iter->iomap, 0, sizeof(iter->iomap));
memset(&iter->srcmap, 0, sizeof(iter->srcmap));
- return ret;
+}
+
+/*
+ * Advance the current iterator position and output the length remaining for the
+ * current mapping.
+ */
+int iomap_iter_advance(struct iomap_iter *iter, u64 *count)
+{
+ if (WARN_ON_ONCE(*count > iomap_length(iter)))
+ return -EIO;
+ iter->pos += *count;
+ iter->len -= *count;
+ *count = iomap_length(iter);
+ return 0;
}
static inline void iomap_iter_done(struct iomap_iter *iter)
@@ -50,6 +35,8 @@ static inline void iomap_iter_done(struct iomap_iter *iter)
WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos);
WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE);
+ iter->iter_start_pos = iter->pos;
+
trace_iomap_iter_dstmap(iter->inode, &iter->iomap);
if (iter->srcmap.type != IOMAP_HOLE)
trace_iomap_iter_srcmap(iter->inode, &iter->srcmap);
@@ -67,26 +54,58 @@ static inline void iomap_iter_done(struct iomap_iter *iter)
* function must be called in a loop that continues as long it returns a
* positive value. If 0 or a negative value is returned, the caller must not
* return to the loop body. Within a loop body, there are two ways to break out
- * of the loop body: leave @iter.processed unchanged, or set it to a negative
+ * of the loop body: leave @iter.status unchanged, or set it to a negative
* errno.
*/
int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops)
{
+ bool stale = iter->iomap.flags & IOMAP_F_STALE;
+ ssize_t advanced;
+ u64 olen;
int ret;
- if (iter->iomap.length && ops->iomap_end) {
- ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter),
- iter->processed > 0 ? iter->processed : 0,
- iter->flags, &iter->iomap);
- if (ret < 0 && !iter->processed)
+ trace_iomap_iter(iter, ops, _RET_IP_);
+
+ if (!iter->iomap.length)
+ goto begin;
+
+ /*
+ * Calculate how far the iter was advanced and the original length bytes
+ * for ->iomap_end().
+ */
+ advanced = iter->pos - iter->iter_start_pos;
+ olen = iter->len + advanced;
+
+ if (ops->iomap_end) {
+ ret = ops->iomap_end(iter->inode, iter->iter_start_pos,
+ iomap_length_trim(iter, iter->iter_start_pos,
+ olen),
+ advanced, iter->flags, &iter->iomap);
+ if (ret < 0 && !advanced)
return ret;
}
- trace_iomap_iter(iter, ops, _RET_IP_);
- ret = iomap_iter_advance(iter);
+ /* detect old return semantics where this would advance */
+ if (WARN_ON_ONCE(iter->status > 0))
+ iter->status = -EIO;
+
+ /*
+ * Use iter->len to determine whether to continue onto the next mapping.
+ * Explicitly terminate on error status or if the current iter has not
+ * advanced at all (i.e. no work was done for some reason) unless the
+ * mapping has been marked stale and needs to be reprocessed.
+ */
+ if (iter->status < 0)
+ ret = iter->status;
+ else if (iter->len == 0 || (!advanced && !stale))
+ ret = 0;
+ else
+ ret = 1;
+ iomap_iter_reset_iomap(iter);
if (ret <= 0)
return ret;
+begin:
ret = ops->iomap_begin(iter->inode, iter->pos, iter->len, iter->flags,
&iter->iomap, &iter->srcmap);
if (ret < 0)
diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c
index a845c012b50c..04d7919636c1 100644
--- a/fs/iomap/seek.c
+++ b/fs/iomap/seek.c
@@ -10,7 +10,7 @@
#include <linux/pagemap.h>
#include <linux/pagevec.h>
-static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter,
+static int iomap_seek_hole_iter(struct iomap_iter *iter,
loff_t *hole_pos)
{
loff_t length = iomap_length(iter);
@@ -20,13 +20,13 @@ static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter,
*hole_pos = mapping_seek_hole_data(iter->inode->i_mapping,
iter->pos, iter->pos + length, SEEK_HOLE);
if (*hole_pos == iter->pos + length)
- return length;
+ return iomap_iter_advance(iter, &length);
return 0;
case IOMAP_HOLE:
*hole_pos = iter->pos;
return 0;
default:
- return length;
+ return iomap_iter_advance(iter, &length);
}
}
@@ -47,7 +47,7 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops)
iter.len = size - pos;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_seek_hole_iter(&iter, &pos);
+ iter.status = iomap_seek_hole_iter(&iter, &pos);
if (ret < 0)
return ret;
if (iter.len) /* found hole before EOF */
@@ -56,19 +56,19 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops)
}
EXPORT_SYMBOL_GPL(iomap_seek_hole);
-static loff_t iomap_seek_data_iter(const struct iomap_iter *iter,
+static int iomap_seek_data_iter(struct iomap_iter *iter,
loff_t *hole_pos)
{
loff_t length = iomap_length(iter);
switch (iter->iomap.type) {
case IOMAP_HOLE:
- return length;
+ return iomap_iter_advance(iter, &length);
case IOMAP_UNWRITTEN:
*hole_pos = mapping_seek_hole_data(iter->inode->i_mapping,
iter->pos, iter->pos + length, SEEK_DATA);
if (*hole_pos < 0)
- return length;
+ return iomap_iter_advance(iter, &length);
return 0;
default:
*hole_pos = iter->pos;
@@ -93,7 +93,7 @@ iomap_seek_data(struct inode *inode, loff_t pos, const struct iomap_ops *ops)
iter.len = size - pos;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_seek_data_iter(&iter, &pos);
+ iter.status = iomap_seek_data_iter(&iter, &pos);
if (ret < 0)
return ret;
if (iter.len) /* found data before EOF */
diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c
index b90d0eda9e51..c1a762c10ce4 100644
--- a/fs/iomap/swapfile.c
+++ b/fs/iomap/swapfile.c
@@ -94,7 +94,7 @@ static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str)
* swap only cares about contiguous page-aligned physical extents and makes no
* distinction between written and unwritten extents.
*/
-static loff_t iomap_swapfile_iter(const struct iomap_iter *iter,
+static int iomap_swapfile_iter(struct iomap_iter *iter,
struct iomap *iomap, struct iomap_swapfile_info *isi)
{
switch (iomap->type) {
@@ -132,7 +132,8 @@ static loff_t iomap_swapfile_iter(const struct iomap_iter *iter,
return error;
memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
}
- return iomap_length(iter);
+
+ return iomap_iter_advance_full(iter);
}
/*
@@ -166,7 +167,7 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
return ret;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_swapfile_iter(&iter, &iter.iomap, &isi);
+ iter.status = iomap_swapfile_iter(&iter, &iter.iomap, &isi);
if (ret < 0)
return ret;
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 4118a42cdab0..9eab2c8ac3c5 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -207,7 +207,7 @@ TRACE_EVENT(iomap_iter,
__field(u64, ino)
__field(loff_t, pos)
__field(u64, length)
- __field(s64, processed)
+ __field(int, status)
__field(unsigned int, flags)
__field(const void *, ops)
__field(unsigned long, caller)
@@ -217,17 +217,17 @@ TRACE_EVENT(iomap_iter,
__entry->ino = iter->inode->i_ino;
__entry->pos = iter->pos;
__entry->length = iomap_length(iter);
- __entry->processed = iter->processed;
+ __entry->status = iter->status;
__entry->flags = iter->flags;
__entry->ops = ops;
__entry->caller = caller;
),
- TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx processed %lld flags %s (0x%x) ops %ps caller %pS",
+ TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx status %d flags %s (0x%x) ops %ps caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->pos,
__entry->length,
- __entry->processed,
+ __entry->status,
__print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS),
__entry->flags,
__entry->ops,
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 6d9965b546cb..5077d52a775d 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -115,7 +115,7 @@ xfs_end_ioend(
*/
error = blk_status_to_errno(ioend->io_bio.bi_status);
if (unlikely(error)) {
- if (ioend->io_flags & IOMAP_F_SHARED) {
+ if (ioend->io_flags & IOMAP_IOEND_SHARED) {
xfs_reflink_cancel_cow_range(ip, offset, size, true);
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
offset + size);
@@ -126,9 +126,9 @@ xfs_end_ioend(
/*
* Success: commit the COW or unwritten blocks if needed.
*/
- if (ioend->io_flags & IOMAP_F_SHARED)
+ if (ioend->io_flags & IOMAP_IOEND_SHARED)
error = xfs_reflink_end_cow(ip, offset, size);
- else if (ioend->io_type == IOMAP_UNWRITTEN)
+ else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
if (!error && xfs_ioend_is_append(ioend))
@@ -396,10 +396,11 @@ allocate_blocks:
}
static int
-xfs_prepare_ioend(
- struct iomap_ioend *ioend,
+xfs_submit_ioend(
+ struct iomap_writepage_ctx *wpc,
int status)
{
+ struct iomap_ioend *ioend = wpc->ioend;
unsigned int nofs_flag;
/*
@@ -410,7 +411,7 @@ xfs_prepare_ioend(
nofs_flag = memalloc_nofs_save();
/* Convert CoW extents to regular */
- if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
+ if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
ioend->io_offset, ioend->io_size);
}
@@ -418,10 +419,14 @@ xfs_prepare_ioend(
memalloc_nofs_restore(nofs_flag);
/* send ioends that might require a transaction to the completion wq */
- if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
- (ioend->io_flags & IOMAP_F_SHARED))
+ if (xfs_ioend_is_append(ioend) ||
+ (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED)))
ioend->io_bio.bi_end_io = xfs_end_bio;
- return status;
+
+ if (status)
+ return status;
+ submit_bio(&ioend->io_bio);
+ return 0;
}
/*
@@ -463,7 +468,7 @@ xfs_discard_folio(
static const struct iomap_writeback_ops xfs_writeback_ops = {
.map_blocks = xfs_map_blocks,
- .prepare_ioend = xfs_prepare_ioend,
+ .submit_ioend = xfs_submit_ioend,
.discard_folio = xfs_discard_folio,
};
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f7a7d89c345e..a81c3e943f20 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1511,7 +1511,8 @@ xfs_write_fault(
if (IS_DAX(inode))
ret = xfs_dax_fault_locked(vmf, order, true);
else
- ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops);
+ ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
+ NULL);
xfs_iunlock(ip, lock_mode);
sb_end_pagefault(inode->i_sb);
@@ -1626,7 +1627,8 @@ const struct file_operations xfs_file_operations = {
.fadvise = xfs_file_fadvise,
.remap_file_range = xfs_file_remap_range,
.fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
- FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE,
+ FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
+ FOP_DONTCACHE,
};
const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d61460309a78..46acf727cbe7 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1495,7 +1495,7 @@ xfs_zero_range(
return dax_zero_range(inode, pos, len, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_zero_range(inode, pos, len, did_zero,
- &xfs_buffered_write_iomap_ops);
+ &xfs_buffered_write_iomap_ops, NULL);
}
int
@@ -1510,5 +1510,5 @@ xfs_truncate_page(
return dax_truncate_page(inode, pos, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_truncate_page(inode, pos, did_zero,
- &xfs_buffered_write_iomap_ops);
+ &xfs_buffered_write_iomap_ops, NULL);
}
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 35166c92420c..42e2c0065bb3 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -299,7 +299,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
/* Serialize against truncates */
filemap_invalidate_lock_shared(inode->i_mapping);
- ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
+ ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops, NULL);
filemap_invalidate_unlock_shared(inode->i_mapping);
sb_end_pagefault(inode->i_sb);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 75bf54e76f3b..ea29388b2fba 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -56,6 +56,10 @@ struct vm_fault;
*
* IOMAP_F_BOUNDARY indicates that I/O and I/O completions for this iomap must
* never be merged with the mapping before it.
+ *
+ * IOMAP_F_ANON_WRITE indicates that (write) I/O does not have a target block
+ * assigned to it yet and the file system will do that in the bio submission
+ * handler, splitting the I/O as needed.
*/
#define IOMAP_F_NEW (1U << 0)
#define IOMAP_F_DIRTY (1U << 1)
@@ -68,6 +72,7 @@ struct vm_fault;
#endif /* CONFIG_BUFFER_HEAD */
#define IOMAP_F_XATTR (1U << 5)
#define IOMAP_F_BOUNDARY (1U << 6)
+#define IOMAP_F_ANON_WRITE (1U << 7)
/*
* Flags set by the core iomap code during operations:
@@ -111,6 +116,8 @@ struct iomap {
static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos)
{
+ if (iomap->flags & IOMAP_F_ANON_WRITE)
+ return U64_MAX; /* invalid */
return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
}
@@ -183,6 +190,7 @@ struct iomap_folio_ops {
#define IOMAP_DAX 0
#endif /* CONFIG_FS_DAX */
#define IOMAP_ATOMIC (1 << 9)
+#define IOMAP_DONTCACHE (1 << 10)
struct iomap_ops {
/*
@@ -211,8 +219,10 @@ struct iomap_ops {
* calls to iomap_iter(). Treat as read-only in the body.
* @len: The remaining length of the file segment we're operating on.
* It is updated at the same time as @pos.
- * @processed: The number of bytes processed by the body in the most recent
- * iteration, or a negative errno. 0 causes the iteration to stop.
+ * @iter_start_pos: The original start pos for the current iomap. Used for
+ * incremental iter advance.
+ * @status: Status of the most recent iteration. Zero on success or a negative
+ * errno on error.
* @flags: Zero or more of the iomap_begin flags above.
* @iomap: Map describing the I/O iteration
* @srcmap: Source map for COW operations
@@ -221,7 +231,8 @@ struct iomap_iter {
struct inode *inode;
loff_t pos;
u64 len;
- s64 processed;
+ loff_t iter_start_pos;
+ int status;
unsigned flags;
struct iomap iomap;
struct iomap srcmap;
@@ -229,20 +240,46 @@ struct iomap_iter {
};
int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops);
+int iomap_iter_advance(struct iomap_iter *iter, u64 *count);
/**
- * iomap_length - length of the current iomap iteration
+ * iomap_length_trim - trimmed length of the current iomap iteration
* @iter: iteration structure
+ * @pos: File position to trim from.
+ * @len: Length of the mapping to trim to.
*
- * Returns the length that the operation applies to for the current iteration.
+ * Returns a trimmed length that the operation applies to for the current
+ * iteration.
*/
-static inline u64 iomap_length(const struct iomap_iter *iter)
+static inline u64 iomap_length_trim(const struct iomap_iter *iter, loff_t pos,
+ u64 len)
{
u64 end = iter->iomap.offset + iter->iomap.length;
if (iter->srcmap.type != IOMAP_HOLE)
end = min(end, iter->srcmap.offset + iter->srcmap.length);
- return min(iter->len, end - iter->pos);
+ return min(len, end - pos);
+}
+
+/**
+ * iomap_length - length of the current iomap iteration
+ * @iter: iteration structure
+ *
+ * Returns the length that the operation applies to for the current iteration.
+ */
+static inline u64 iomap_length(const struct iomap_iter *iter)
+{
+ return iomap_length_trim(iter, iter->pos, iter->len);
+}
+
+/**
+ * iomap_iter_advance_full - advance by the full length of current map
+ */
+static inline int iomap_iter_advance_full(struct iomap_iter *iter)
+{
+ u64 length = iomap_length(iter);
+
+ return iomap_iter_advance(iter, &length);
}
/**
@@ -306,12 +343,11 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
const struct iomap_ops *ops);
int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
- bool *did_zero, const struct iomap_ops *ops);
+ bool *did_zero, const struct iomap_ops *ops, void *private);
int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
- const struct iomap_ops *ops);
-vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf,
- const struct iomap_ops *ops);
-
+ const struct iomap_ops *ops, void *private);
+vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
+ void *private);
typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length,
struct iomap *iomap);
void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
@@ -328,16 +364,42 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
const struct iomap_ops *ops);
/*
+ * Flags for iomap_ioend->io_flags.
+ */
+/* shared COW extent */
+#define IOMAP_IOEND_SHARED (1U << 0)
+/* unwritten extent */
+#define IOMAP_IOEND_UNWRITTEN (1U << 1)
+/* don't merge into previous ioend */
+#define IOMAP_IOEND_BOUNDARY (1U << 2)
+/* is direct I/O */
+#define IOMAP_IOEND_DIRECT (1U << 3)
+
+/*
+ * Flags that if set on either ioend prevent the merge of two ioends.
+ * (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way)
+ */
+#define IOMAP_IOEND_NOMERGE_FLAGS \
+ (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT)
+
+/*
* Structure for writeback I/O completions.
+ *
+ * File systems implementing ->submit_ioend (for buffered I/O) or ->submit_io
+ * for direct I/O) can split a bio generated by iomap. In that case the parent
+ * ioend it was split from is recorded in ioend->io_parent.
*/
struct iomap_ioend {
struct list_head io_list; /* next ioend in chain */
- u16 io_type;
- u16 io_flags; /* IOMAP_F_* */
+ u16 io_flags; /* IOMAP_IOEND_* */
struct inode *io_inode; /* file being written to */
- size_t io_size; /* size of data within eof */
+ size_t io_size; /* size of the extent */
+ atomic_t io_remaining; /* completetion defer count */
+ int io_error; /* stashed away status */
+ struct iomap_ioend *io_parent; /* parent for completions */
loff_t io_offset; /* offset in the file */
sector_t io_sector; /* start sector of ioend */
+ void *io_private; /* file system private data */
struct bio io_bio; /* MUST BE LAST! */
};
@@ -362,12 +424,14 @@ struct iomap_writeback_ops {
loff_t offset, unsigned len);
/*
- * Optional, allows the file systems to perform actions just before
- * submitting the bio and/or override the bio end_io handler for complex
- * operations like copy on write extent manipulation or unwritten extent
- * conversions.
+ * Optional, allows the file systems to hook into bio submission,
+ * including overriding the bi_end_io handler.
+ *
+ * Returns 0 if the bio was successfully submitted, or a negative
+ * error code if status was non-zero or another error happened and
+ * the bio could not be submitted.
*/
- int (*prepare_ioend)(struct iomap_ioend *ioend, int status);
+ int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
/*
* Optional, allows the file system to discard state on a page where
@@ -383,6 +447,10 @@ struct iomap_writepage_ctx {
u32 nr_folios; /* folios added to the ioend */
};
+struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio,
+ loff_t file_offset, u16 ioend_flags);
+struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend,
+ unsigned int max_len, bool is_append);
void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
void iomap_ioend_try_merge(struct iomap_ioend *ioend,
struct list_head *more_ioends);
@@ -454,4 +522,6 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
# define iomap_swapfile_activate(sis, swapfile, pagespan, ops) (-EIO)
#endif /* CONFIG_SWAP */
+extern struct bio_set iomap_ioend_bioset;
+
#endif /* LINUX_IOMAP_H */