summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--fs/iomap/buffered-io.c118
-rw-r--r--fs/iomap/iter.c6
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h6
-rw-r--r--fs/xfs/xfs_file.c29
-rw-r--r--fs/xfs/xfs_iomap.c38
-rw-r--r--include/linux/iomap.h4
-rw-r--r--include/linux/pagemap.h2
-rw-r--r--mm/filemap.c58
8 files changed, 211 insertions, 50 deletions
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 1dbcac17fefd..6ae031ac8058 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -772,6 +772,28 @@ static struct folio *__iomap_get_folio(struct iomap_iter *iter,
if (!mapping_large_folio_support(iter->inode->i_mapping))
len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));
+ if (iter->fbatch) {
+ struct folio *folio = folio_batch_next(iter->fbatch);
+
+ if (!folio)
+ return NULL;
+
+ /*
+ * The folio mapping generally shouldn't have changed based on
+ * fs locks, but be consistent with filemap lookup and retry
+ * the iter if it does.
+ */
+ folio_lock(folio);
+ if (unlikely(folio->mapping != iter->inode->i_mapping)) {
+ iter->iomap.flags |= IOMAP_F_STALE;
+ folio_unlock(folio);
+ return NULL;
+ }
+
+ folio_get(folio);
+ return folio;
+ }
+
if (write_ops && write_ops->get_folio)
return write_ops->get_folio(iter, pos, len);
return iomap_get_folio(iter, pos, len);
@@ -826,15 +848,14 @@ static int iomap_write_begin(struct iomap_iter *iter,
size_t *poffset, u64 *plen)
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
- loff_t pos = iter->pos;
+ loff_t pos;
u64 len = min_t(u64, SIZE_MAX, iomap_length(iter));
struct folio *folio;
int status = 0;
len = min_not_zero(len, *plen);
- BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
- if (srcmap != &iter->iomap)
- BUG_ON(pos + len > srcmap->offset + srcmap->length);
+ *foliop = NULL;
+ *plen = 0;
if (fatal_signal_pending(current))
return -EINTR;
@@ -844,6 +865,15 @@ static int iomap_write_begin(struct iomap_iter *iter,
return PTR_ERR(folio);
/*
+ * No folio means we're done with a batch. We still have range to
+ * process so return and let the caller iterate and refill the batch.
+ */
+ if (!folio) {
+ WARN_ON_ONCE(!iter->fbatch);
+ return 0;
+ }
+
+ /*
* Now we have a locked folio, before we do anything with it we need to
* check that the iomap we have cached is not stale. The inode extent
* mapping can change due to concurrent IO in flight (e.g.
@@ -863,6 +893,22 @@ static int iomap_write_begin(struct iomap_iter *iter,
}
}
+ /*
+ * The folios in a batch may not be contiguous. If we've skipped
+ * forward, advance the iter to the pos of the current folio. If the
+ * folio starts beyond the end of the mapping, it may have been trimmed
+ * since the lookup for whatever reason. Return a NULL folio to
+ * terminate the op.
+ */
+ if (folio_pos(folio) > iter->pos) {
+ len = min_t(u64, folio_pos(folio) - iter->pos,
+ iomap_length(iter));
+ status = iomap_iter_advance(iter, len);
+ len = iomap_length(iter);
+ if (status || !len)
+ goto out_unlock;
+ }
+
pos = iomap_trim_folio_range(iter, folio, poffset, &len);
if (srcmap->type == IOMAP_INLINE)
@@ -1409,6 +1455,12 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
if (iter->iomap.flags & IOMAP_F_STALE)
break;
+ /* a NULL folio means we're done with a folio batch */
+ if (!folio) {
+ status = iomap_iter_advance_full(iter);
+ break;
+ }
+
/* warn about zeroing folios beyond eof that won't write back */
WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size);
@@ -1433,6 +1485,26 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
return status;
}
+loff_t
+iomap_fill_dirty_folios(
+ struct iomap_iter *iter,
+ loff_t offset,
+ loff_t length)
+{
+ struct address_space *mapping = iter->inode->i_mapping;
+ pgoff_t start = offset >> PAGE_SHIFT;
+ pgoff_t end = (offset + length - 1) >> PAGE_SHIFT;
+
+ iter->fbatch = kmalloc(sizeof(struct folio_batch), GFP_KERNEL);
+ if (!iter->fbatch)
+ return offset + length;
+ folio_batch_init(iter->fbatch);
+
+ filemap_get_folios_dirty(mapping, &start, end, iter->fbatch);
+ return (start << PAGE_SHIFT);
+}
+EXPORT_SYMBOL_GPL(iomap_fill_dirty_folios);
+
int
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
const struct iomap_ops *ops,
@@ -1446,46 +1518,26 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
.private = private,
};
struct address_space *mapping = inode->i_mapping;
- unsigned int blocksize = i_blocksize(inode);
- unsigned int off = pos & (blocksize - 1);
- loff_t plen = min_t(loff_t, len, blocksize - off);
int ret;
bool range_dirty;
/*
- * Zero range can skip mappings that are zero on disk so long as
- * pagecache is clean. If pagecache was dirty prior to zero range, the
- * mapping converts on writeback completion and so must be zeroed.
- *
- * The simplest way to deal with this across a range is to flush
- * pagecache and process the updated mappings. To avoid excessive
- * flushing on partial eof zeroing, special case it to zero the
- * unaligned start portion if already dirty in pagecache.
- */
- if (off &&
- filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) {
- iter.len = plen;
- while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.status = iomap_zero_iter(&iter, did_zero,
- write_ops);
-
- iter.len = len - (iter.pos - pos);
- if (ret || !iter.len)
- return ret;
- }
-
- /*
* To avoid an unconditional flush, check pagecache state and only flush
* if dirty and the fs returns a mapping that might convert on
* writeback.
*/
- range_dirty = filemap_range_needs_writeback(inode->i_mapping,
- iter.pos, iter.pos + iter.len - 1);
+ range_dirty = filemap_range_needs_writeback(mapping, iter.pos,
+ iter.pos + iter.len - 1);
while ((ret = iomap_iter(&iter, ops)) > 0) {
const struct iomap *srcmap = iomap_iter_srcmap(&iter);
- if (srcmap->type == IOMAP_HOLE ||
- srcmap->type == IOMAP_UNWRITTEN) {
+ if (WARN_ON_ONCE(iter.fbatch &&
+ srcmap->type != IOMAP_UNWRITTEN))
+ return -EIO;
+
+ if (!iter.fbatch &&
+ (srcmap->type == IOMAP_HOLE ||
+ srcmap->type == IOMAP_UNWRITTEN)) {
s64 status;
if (range_dirty) {
diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c
index 91d2024e00da..8692e5e41c6d 100644
--- a/fs/iomap/iter.c
+++ b/fs/iomap/iter.c
@@ -8,6 +8,12 @@
static inline void iomap_iter_reset_iomap(struct iomap_iter *iter)
{
+ if (iter->fbatch) {
+ folio_batch_release(iter->fbatch);
+ kfree(iter->fbatch);
+ iter->fbatch = NULL;
+ }
+
iter->status = 0;
memset(&iter->iomap, 0, sizeof(iter->iomap));
memset(&iter->srcmap, 0, sizeof(iter->srcmap));
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index de840abc0bcd..57e47077c75a 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -73,7 +73,8 @@
#define XFS_ERRTAG_WRITE_DELAY_MS 43
#define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44
#define XFS_ERRTAG_METAFILE_RESV_CRITICAL 45
-#define XFS_ERRTAG_MAX 46
+#define XFS_ERRTAG_FORCE_ZERO_RANGE 46
+#define XFS_ERRTAG_MAX 47
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -133,7 +134,8 @@ XFS_ERRTAG(ATTR_LEAF_TO_NODE, attr_leaf_to_node, 1) \
XFS_ERRTAG(WB_DELAY_MS, wb_delay_ms, 3000) \
XFS_ERRTAG(WRITE_DELAY_MS, write_delay_ms, 3000) \
XFS_ERRTAG(EXCHMAPS_FINISH_ONE, exchmaps_finish_one, 1) \
-XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4)
+XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) \
+XFS_ERRTAG(FORCE_ZERO_RANGE, force_zero_range, 4)
#endif /* XFS_ERRTAG */
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 2702fef2c90c..5b9864c8582e 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -27,6 +27,8 @@
#include "xfs_file.h"
#include "xfs_aops.h"
#include "xfs_zone_alloc.h"
+#include "xfs_error.h"
+#include "xfs_errortag.h"
#include <linux/dax.h>
#include <linux/falloc.h>
@@ -1254,23 +1256,36 @@ xfs_falloc_zero_range(
struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
+ struct xfs_inode *ip = XFS_I(inode);
unsigned int blksize = i_blocksize(inode);
loff_t new_size = 0;
int error;
- trace_xfs_zero_file_space(XFS_I(inode));
+ trace_xfs_zero_file_space(ip);
error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
if (error)
return error;
- error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
- if (error)
- return error;
+ /*
+ * Zero range implements a full zeroing mechanism but is only used in
+ * limited situations. It is more efficient to allocate unwritten
+ * extents than to perform zeroing here, so use an errortag to randomly
+ * force zeroing on DEBUG kernels for added test coverage.
+ */
+ if (XFS_TEST_ERROR(ip->i_mount,
+ XFS_ERRTAG_FORCE_ZERO_RANGE)) {
+ error = xfs_zero_range(ip, offset, len, ac, NULL);
+ } else {
+ error = xfs_free_file_space(ip, offset, len, ac);
+ if (error)
+ return error;
- len = round_up(offset + len, blksize) - round_down(offset, blksize);
- offset = round_down(offset, blksize);
- error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+ len = round_up(offset + len, blksize) -
+ round_down(offset, blksize);
+ offset = round_down(offset, blksize);
+ error = xfs_alloc_file_space(ip, offset, len);
+ }
if (error)
return error;
return xfs_falloc_setsize(file, new_size);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d3f6e3e42a11..535bf3b8705d 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1702,6 +1702,8 @@ xfs_buffered_write_iomap_begin(
struct iomap *iomap,
struct iomap *srcmap)
{
+ struct iomap_iter *iter = container_of(iomap, struct iomap_iter,
+ iomap);
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -1767,21 +1769,41 @@ xfs_buffered_write_iomap_begin(
}
/*
- * For zeroing, trim a delalloc extent that extends beyond the EOF
- * block. If it starts beyond the EOF block, convert it to an
+ * For zeroing, trim extents that extend beyond the EOF block. If a
+ * delalloc extent starts beyond the EOF block, convert it to an
* unwritten extent.
*/
- if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb &&
- isnullstartblock(imap.br_startblock)) {
+ if (flags & IOMAP_ZERO) {
xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
+ u64 end;
- if (offset_fsb >= eof_fsb)
+ if (isnullstartblock(imap.br_startblock) &&
+ offset_fsb >= eof_fsb)
goto convert_delay;
- if (end_fsb > eof_fsb) {
+ if (offset_fsb < eof_fsb && end_fsb > eof_fsb)
end_fsb = eof_fsb;
- xfs_trim_extent(&imap, offset_fsb,
- end_fsb - offset_fsb);
+
+ /*
+ * Look up dirty folios for unwritten mappings within EOF.
+ * Providing this bypasses the flush iomap uses to trigger
+ * extent conversion when unwritten mappings have dirty
+ * pagecache in need of zeroing.
+ *
+ * Trim the mapping to the end pos of the lookup, which in turn
+ * was trimmed to the end of the batch if it became full before
+ * the end of the mapping.
+ */
+ if (imap.br_state == XFS_EXT_UNWRITTEN &&
+ offset_fsb < eof_fsb) {
+ loff_t len = min(count,
+ XFS_FSB_TO_B(mp, imap.br_blockcount));
+
+ end = iomap_fill_dirty_folios(iter, offset, len);
+ end_fsb = min_t(xfs_fileoff_t, end_fsb,
+ XFS_B_TO_FSB(mp, end));
}
+
+ xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
}
/*
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 6d864b446b6e..65d123114883 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -9,6 +9,7 @@
#include <linux/types.h>
#include <linux/mm_types.h>
#include <linux/blkdev.h>
+#include <linux/pagevec.h>
struct address_space;
struct fiemap_extent_info;
@@ -242,6 +243,7 @@ struct iomap_iter {
unsigned flags;
struct iomap iomap;
struct iomap srcmap;
+ struct folio_batch *fbatch;
void *private;
};
@@ -350,6 +352,8 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
const struct iomap_ops *ops,
const struct iomap_write_ops *write_ops);
+loff_t iomap_fill_dirty_folios(struct iomap_iter *iter, loff_t offset,
+ loff_t length);
int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
bool *did_zero, const struct iomap_ops *ops,
const struct iomap_write_ops *write_ops, void *private);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 09b581c1d878..7274a86b4871 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -977,6 +977,8 @@ unsigned filemap_get_folios_contig(struct address_space *mapping,
pgoff_t *start, pgoff_t end, struct folio_batch *fbatch);
unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch);
+unsigned filemap_get_folios_dirty(struct address_space *mapping,
+ pgoff_t *start, pgoff_t end, struct folio_batch *fbatch);
struct folio *read_cache_folio(struct address_space *, pgoff_t index,
filler_t *filler, struct file *file);
diff --git a/mm/filemap.c b/mm/filemap.c
index 13f0259d993c..da1be27de10d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2366,6 +2366,64 @@ out:
}
EXPORT_SYMBOL(filemap_get_folios_tag);
+/**
+ * filemap_get_folios_dirty - Get a batch of dirty folios
+ * @mapping: The address_space to search
+ * @start: The starting folio index
+ * @end: The final folio index (inclusive)
+ * @fbatch: The batch to fill
+ *
+ * filemap_get_folios_dirty() works exactly like filemap_get_folios(), except
+ * the returned folios are presumed to be dirty or undergoing writeback. Dirty
+ * state is presumed because we don't block on folio lock nor want to miss
+ * folios. Callers that need to can recheck state upon locking the folio.
+ *
+ * This may not return all dirty folios if the batch gets filled up.
+ *
+ * Return: The number of folios found.
+ * Also update @start to be positioned for traversal of the next folio.
+ */
+unsigned filemap_get_folios_dirty(struct address_space *mapping, pgoff_t *start,
+ pgoff_t end, struct folio_batch *fbatch)
+{
+ XA_STATE(xas, &mapping->i_pages, *start);
+ struct folio *folio;
+
+ rcu_read_lock();
+ while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
+ if (xa_is_value(folio))
+ continue;
+ if (folio_trylock(folio)) {
+ bool clean = !folio_test_dirty(folio) &&
+ !folio_test_writeback(folio);
+ folio_unlock(folio);
+ if (clean) {
+ folio_put(folio);
+ continue;
+ }
+ }
+ if (!folio_batch_add(fbatch, folio)) {
+ unsigned long nr = folio_nr_pages(folio);
+ *start = folio->index + nr;
+ goto out;
+ }
+ }
+ /*
+ * We come here when there is no folio beyond @end. We take care to not
+ * overflow the index @start as it confuses some of the callers. This
+ * breaks the iteration when there is a folio at index -1 but that is
+ * already broke anyway.
+ */
+ if (end == (pgoff_t)-1)
+ *start = (pgoff_t)-1;
+ else
+ *start = end + 1;
+out:
+ rcu_read_unlock();
+
+ return folio_batch_count(fbatch);
+}
+
/*
* CD/DVDs are error prone. When a medium error occurs, the driver may fail
* a _large_ part of the i/o request. Imagine the worst scenario: