summaryrefslogtreecommitdiff
path: root/fs/xfs/xfs_iops.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/xfs_iops.c')
-rw-r--r--fs/xfs/xfs_iops.c161
1 files changed, 134 insertions, 27 deletions
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 207e0dadffc3..8cddbb7c149b 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -29,6 +29,7 @@
#include "xfs_xattr.h"
#include "xfs_file.h"
#include "xfs_bmap.h"
+#include "xfs_zone_alloc.h"
#include <linux/posix_acl.h>
#include <linux/security.h>
@@ -298,14 +299,14 @@ xfs_vn_create(
return xfs_generic_create(idmap, dir, dentry, mode, 0, NULL);
}
-STATIC int
+STATIC struct dentry *
xfs_vn_mkdir(
struct mnt_idmap *idmap,
struct inode *dir,
struct dentry *dentry,
umode_t mode)
{
- return xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL);
+ return ERR_PTR(xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL));
}
STATIC struct dentry *
@@ -573,17 +574,109 @@ xfs_stat_blksize(
}
static void
-xfs_get_atomic_write_attr(
+xfs_report_dioalign(
struct xfs_inode *ip,
- unsigned int *unit_min,
- unsigned int *unit_max)
+ struct kstat *stat)
+{
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ struct block_device *bdev = target->bt_bdev;
+
+ stat->result_mask |= STATX_DIOALIGN | STATX_DIO_READ_ALIGN;
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+
+ /*
+ * For COW inodes, we can only perform out of place writes of entire
+ * allocation units (blocks or RT extents).
+ * For writes smaller than the allocation unit, we must fall back to
+ * buffered I/O to perform read-modify-write cycles. At best this is
+ * highly inefficient; at worst it leads to page cache invalidation
+ * races. Tell applications to avoid this by reporting the larger write
+ * alignment in dio_offset_align, and the smaller read alignment in
+ * dio_read_offset_align.
+ */
+ stat->dio_read_offset_align = bdev_logical_block_size(bdev);
+ if (xfs_is_cow_inode(ip))
+ stat->dio_offset_align = xfs_inode_alloc_unitsize(ip);
+ else
+ stat->dio_offset_align = stat->dio_read_offset_align;
+}
+
+unsigned int
+xfs_get_atomic_write_min(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ /*
+ * If we can complete an atomic write via atomic out of place writes,
+ * then advertise a minimum size of one fsblock. Without this
+ * mechanism, we can only guarantee atomic writes up to a single LBA.
+ *
+ * If out of place writes are not available, we can guarantee an atomic
+ * write of exactly one single fsblock if the bdev will make that
+ * guarantee for us.
+ */
+ if (xfs_inode_can_hw_atomic_write(ip) || xfs_can_sw_atomic_write(mp))
+ return mp->m_sb.sb_blocksize;
+
+ return 0;
+}
+
+unsigned int
+xfs_get_atomic_write_max(
+ struct xfs_inode *ip)
{
- if (!xfs_inode_can_atomicwrite(ip)) {
- *unit_min = *unit_max = 0;
- return;
+ struct xfs_mount *mp = ip->i_mount;
+
+ /*
+ * If out of place writes are not available, we can guarantee an atomic
+ * write of exactly one single fsblock if the bdev will make that
+ * guarantee for us.
+ */
+ if (!xfs_can_sw_atomic_write(mp)) {
+ if (xfs_inode_can_hw_atomic_write(ip))
+ return mp->m_sb.sb_blocksize;
+ return 0;
}
- *unit_min = *unit_max = ip->i_mount->m_sb.sb_blocksize;
+ /*
+ * If we can complete an atomic write via atomic out of place writes,
+ * then advertise a maximum size of whatever we can complete through
+ * that means. Hardware support is reported via max_opt, not here.
+ */
+ if (XFS_IS_REALTIME_INODE(ip))
+ return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].awu_max);
+ return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_AG].awu_max);
+}
+
+unsigned int
+xfs_get_atomic_write_max_opt(
+ struct xfs_inode *ip)
+{
+ unsigned int awu_max = xfs_get_atomic_write_max(ip);
+
+ /* if the max is 1x block, then just keep behaviour that opt is 0 */
+ if (awu_max <= ip->i_mount->m_sb.sb_blocksize)
+ return 0;
+
+ /*
+ * Advertise the maximum size of an atomic write that we can tell the
+ * block device to perform for us. In general the bdev limit will be
+ * less than our out of place write limit, but we don't want to exceed
+ * the awu_max.
+ */
+ return min(awu_max, xfs_inode_buftarg(ip)->bt_bdev_awu_max);
+}
+
+static void
+xfs_report_atomic_write(
+ struct xfs_inode *ip,
+ struct kstat *stat)
+{
+ generic_fill_statx_atomic_writes(stat,
+ xfs_get_atomic_write_min(ip),
+ xfs_get_atomic_write_max(ip),
+ xfs_get_atomic_write_max_opt(ip));
}
STATIC int
@@ -647,22 +740,10 @@ xfs_vn_getattr(
stat->rdev = inode->i_rdev;
break;
case S_IFREG:
- if (request_mask & STATX_DIOALIGN) {
- struct xfs_buftarg *target = xfs_inode_buftarg(ip);
- struct block_device *bdev = target->bt_bdev;
-
- stat->result_mask |= STATX_DIOALIGN;
- stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
- stat->dio_offset_align = bdev_logical_block_size(bdev);
- }
- if (request_mask & STATX_WRITE_ATOMIC) {
- unsigned int unit_min, unit_max;
-
- xfs_get_atomic_write_attr(ip, &unit_min,
- &unit_max);
- generic_fill_statx_atomic_writes(stat,
- unit_min, unit_max);
- }
+ if (request_mask & (STATX_DIOALIGN | STATX_DIO_READ_ALIGN))
+ xfs_report_dioalign(ip, stat);
+ if (request_mask & STATX_WRITE_ATOMIC)
+ xfs_report_atomic_write(ip, stat);
fallthrough;
default:
stat->blksize = xfs_stat_blksize(ip);
@@ -840,6 +921,7 @@ xfs_setattr_size(
uint lock_flags = 0;
uint resblks = 0;
bool did_zeroing = false;
+ struct xfs_zone_alloc_ctx ac = { };
xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
ASSERT(S_ISREG(inode->i_mode));
@@ -876,6 +958,28 @@ xfs_setattr_size(
inode_dio_wait(inode);
/*
+ * Normally xfs_zoned_space_reserve is supposed to be called outside the
+ * IOLOCK. For truncate we can't do that since ->setattr is called with
+ * it already held by the VFS. So for now chicken out and try to
+ * allocate space under it.
+ *
+ * To avoid deadlocks this means we can't block waiting for space, which
+ * can lead to spurious -ENOSPC if there are no directly available
+ * blocks. We mitigate this a bit by allowing zeroing to dip into the
+ * reserved pool, but eventually the VFS calling convention needs to
+ * change.
+ */
+ if (xfs_is_zoned_inode(ip)) {
+ error = xfs_zoned_space_reserve(ip, 1,
+ XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac);
+ if (error) {
+ if (error == -EAGAIN)
+ return -ENOSPC;
+ return error;
+ }
+ }
+
+ /*
* File data changes must be complete before we start the transaction to
* modify the inode. This needs to be done before joining the inode to
* the transaction because the inode cannot be unlocked once it is a
@@ -888,11 +992,14 @@ xfs_setattr_size(
if (newsize > oldsize) {
trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
error = xfs_zero_range(ip, oldsize, newsize - oldsize,
- &did_zeroing);
+ &ac, &did_zeroing);
} else {
- error = xfs_truncate_page(ip, newsize, &did_zeroing);
+ error = xfs_truncate_page(ip, newsize, &ac, &did_zeroing);
}
+ if (xfs_is_zoned_inode(ip))
+ xfs_zoned_space_unreserve(ip, &ac);
+
if (error)
return error;