summary refs log tree commit diff
path: root/fs/xfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/Kconfig36
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c7
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c46
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c52
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c25
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c7
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c36
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h6
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c32
-rw-r--r--fs/xfs/libxfs/xfs_btree.c35
-rw-r--r--fs/xfs/libxfs/xfs_btree.h41
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c8
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c2
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h118
-rw-r--r--fs/xfs/libxfs/xfs_exchmaps.c4
-rw-r--r--fs/xfs/libxfs/xfs_format.h2
-rw-r--r--fs/xfs/libxfs/xfs_group.c17
-rw-r--r--fs/xfs/libxfs/xfs_group.h9
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c37
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c24
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c4
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c3
-rw-r--r--fs/xfs/libxfs/xfs_inode_util.c11
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h180
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h6
-rw-r--r--fs/xfs/libxfs/xfs_log_rlimit.c4
-rw-r--r--fs/xfs/libxfs/xfs_metafile.c2
-rw-r--r--fs/xfs/libxfs/xfs_ondisk.h4
-rw-r--r--fs/xfs/libxfs/xfs_quota_defs.h4
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c11
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c18
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c2
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c67
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c2
-rw-r--r--fs/xfs/libxfs/xfs_rtgroup.h20
-rw-r--r--fs/xfs/libxfs/xfs_rtrefcount_btree.c18
-rw-r--r--fs/xfs/libxfs/xfs_rtrmap_btree.c67
-rw-r--r--fs/xfs/libxfs/xfs_sb.c24
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c343
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.h25
-rw-r--r--fs/xfs/libxfs/xfs_zones.c1
-rw-r--r--fs/xfs/libxfs/xfs_zones.h7
-rw-r--r--fs/xfs/scrub/attr_repair.c2
-rw-r--r--fs/xfs/scrub/btree.c2
-rw-r--r--fs/xfs/scrub/common.c9
-rw-r--r--fs/xfs/scrub/common.h2
-rw-r--r--fs/xfs/scrub/cow_repair.c4
-rw-r--r--fs/xfs/scrub/dir_repair.c8
-rw-r--r--fs/xfs/scrub/fscounters.c7
-rw-r--r--fs/xfs/scrub/inode_repair.c2
-rw-r--r--fs/xfs/scrub/metapath.c16
-rw-r--r--fs/xfs/scrub/newbt.c9
-rw-r--r--fs/xfs/scrub/nlinks.c42
-rw-r--r--fs/xfs/scrub/nlinks_repair.c4
-rw-r--r--fs/xfs/scrub/orphanage.c18
-rw-r--r--fs/xfs/scrub/parent.c2
-rw-r--r--fs/xfs/scrub/parent_repair.c12
-rw-r--r--fs/xfs/scrub/quota.c8
-rw-r--r--fs/xfs/scrub/quota_repair.c18
-rw-r--r--fs/xfs/scrub/quotacheck.c15
-rw-r--r--fs/xfs/scrub/quotacheck_repair.c21
-rw-r--r--fs/xfs/scrub/rcbag_btree.c38
-rw-r--r--fs/xfs/scrub/reap.c620
-rw-r--r--fs/xfs/scrub/repair.c38
-rw-r--r--fs/xfs/scrub/repair.h12
-rw-r--r--fs/xfs/scrub/rmap_repair.c14
-rw-r--r--fs/xfs/scrub/rtrmap_repair.c14
-rw-r--r--fs/xfs/scrub/scrub.c7
-rw-r--r--fs/xfs/scrub/symlink_repair.c4
-rw-r--r--fs/xfs/scrub/trace.c1
-rw-r--r--fs/xfs/scrub/trace.h49
-rw-r--r--fs/xfs/scrub/xfarray.c2
-rw-r--r--fs/xfs/xfs_aops.c236
-rw-r--r--fs/xfs/xfs_attr_item.c152
-rw-r--r--fs/xfs/xfs_attr_item.h8
-rw-r--r--fs/xfs/xfs_bio_io.c30
-rw-r--r--fs/xfs/xfs_bmap_item.c28
-rw-r--r--fs/xfs/xfs_bmap_item.h3
-rw-r--r--fs/xfs/xfs_bmap_util.c2
-rw-r--r--fs/xfs/xfs_buf.c188
-rw-r--r--fs/xfs/xfs_buf.h14
-rw-r--r--fs/xfs/xfs_buf_item.c323
-rw-r--r--fs/xfs/xfs_buf_item.h8
-rw-r--r--fs/xfs/xfs_buf_item_recover.c48
-rw-r--r--fs/xfs/xfs_buf_mem.c2
-rw-r--r--fs/xfs/xfs_discard.c87
-rw-r--r--fs/xfs/xfs_discard.h2
-rw-r--r--fs/xfs/xfs_dquot.c150
-rw-r--r--fs/xfs/xfs_dquot.h22
-rw-r--r--fs/xfs/xfs_dquot_item.c6
-rw-r--r--fs/xfs/xfs_dquot_item_recover.c20
-rw-r--r--fs/xfs/xfs_error.c216
-rw-r--r--fs/xfs/xfs_error.h47
-rw-r--r--fs/xfs/xfs_exchmaps_item.c8
-rw-r--r--fs/xfs/xfs_extent_busy.h8
-rw-r--r--fs/xfs/xfs_extfree_item.c69
-rw-r--r--fs/xfs/xfs_extfree_item.h7
-rw-r--r--fs/xfs/xfs_file.c310
-rw-r--r--fs/xfs/xfs_filestream.c15
-rw-r--r--fs/xfs/xfs_fsmap.c55
-rw-r--r--fs/xfs/xfs_globals.c4
-rw-r--r--fs/xfs/xfs_handle.c56
-rw-r--r--fs/xfs/xfs_health.c4
-rw-r--r--fs/xfs/xfs_icache.c56
-rw-r--r--fs/xfs/xfs_icreate_item.c2
-rw-r--r--fs/xfs/xfs_inode.c134
-rw-r--r--fs/xfs/xfs_inode.h19
-rw-r--r--fs/xfs/xfs_inode_item.c146
-rw-r--r--fs/xfs/xfs_inode_item.h14
-rw-r--r--fs/xfs/xfs_inode_item_recover.c26
-rw-r--r--fs/xfs/xfs_ioctl.c53
-rw-r--r--fs/xfs/xfs_ioctl.h4
-rw-r--r--fs/xfs/xfs_iomap.c315
-rw-r--r--fs/xfs/xfs_iomap.h2
-rw-r--r--fs/xfs/xfs_iops.c97
-rw-r--r--fs/xfs/xfs_iops.h3
-rw-r--r--fs/xfs/xfs_itable.c26
-rw-r--r--fs/xfs/xfs_itable.h10
-rw-r--r--fs/xfs/xfs_iwalk.c11
-rw-r--r--fs/xfs/xfs_linux.h2
-rw-r--r--fs/xfs/xfs_log.c286
-rw-r--r--fs/xfs/xfs_log.h53
-rw-r--r--fs/xfs/xfs_log_cil.c85
-rw-r--r--fs/xfs/xfs_log_priv.h54
-rw-r--r--fs/xfs/xfs_log_recover.c95
-rw-r--r--fs/xfs/xfs_message.c16
-rw-r--r--fs/xfs/xfs_message.h4
-rw-r--r--fs/xfs/xfs_mount.c189
-rw-r--r--fs/xfs/xfs_mount.h56
-rw-r--r--fs/xfs/xfs_mru_cache.c35
-rw-r--r--fs/xfs/xfs_notify_failure.c17
-rw-r--r--fs/xfs/xfs_pnfs.c2
-rw-r--r--fs/xfs/xfs_qm.c246
-rw-r--r--fs/xfs/xfs_qm.h2
-rw-r--r--fs/xfs/xfs_qm_bhv.c4
-rw-r--r--fs/xfs/xfs_qm_syscalls.c10
-rw-r--r--fs/xfs/xfs_quotaops.c2
-rw-r--r--fs/xfs/xfs_refcount_item.c44
-rw-r--r--fs/xfs/xfs_refcount_item.h3
-rw-r--r--fs/xfs/xfs_reflink.c149
-rw-r--r--fs/xfs/xfs_reflink.h8
-rw-r--r--fs/xfs/xfs_rmap_item.c44
-rw-r--r--fs/xfs/xfs_rmap_item.h3
-rw-r--r--fs/xfs/xfs_rtalloc.c27
-rw-r--r--fs/xfs/xfs_super.c245
-rw-r--r--fs/xfs/xfs_sysctl.c29
-rw-r--r--fs/xfs/xfs_sysctl.h5
-rw-r--r--fs/xfs/xfs_sysfs.c32
-rw-r--r--fs/xfs/xfs_trace.h213
-rw-r--r--fs/xfs/xfs_trans.c234
-rw-r--r--fs/xfs/xfs_trans.h4
-rw-r--r--fs/xfs/xfs_trans_ail.c41
-rw-r--r--fs/xfs/xfs_trans_dquot.c18
-rw-r--r--fs/xfs/xfs_trans_priv.h28
-rw-r--r--fs/xfs/xfs_xattr.c2
-rw-r--r--fs/xfs/xfs_zone_alloc.c428
-rw-r--r--fs/xfs/xfs_zone_alloc.h4
-rw-r--r--fs/xfs/xfs_zone_gc.c169
-rw-r--r--fs/xfs/xfs_zone_info.c2
-rw-r--r--fs/xfs/xfs_zone_priv.h19
-rw-r--r--fs/xfs/xfs_zone_space_resv.c33
161 files changed, 4899 insertions, 3456 deletions
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index fffd6fffdce0..b99da294e9a3 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -3,7 +3,7 @@ config XFS_FS
tristate "XFS filesystem support"
depends on BLOCK
select EXPORTFS
- select LIBCRC32C
+ select CRC32
select FS_IOMAP
help
XFS is a high performance journaling filesystem which originated
@@ -25,7 +25,7 @@ config XFS_FS
config XFS_SUPPORT_V4
bool "Support deprecated V4 (crc=0) format"
depends on XFS_FS
- default y
+ default n
help
The V4 filesystem format lacks certain features that are supported
by the V5 format, such as metadata checksumming, strengthened
@@ -40,7 +40,7 @@ config XFS_SUPPORT_V4
filesystem is a V4 filesystem. If no such string is found, please
upgrade xfsprogs to the latest version and try again.
- This option will become default N in September 2025. Support for the
+ This option became default N in September 2025. Support for the
V4 format will be removed entirely in September 2030. Distributors
can say N here to withdraw support earlier.
@@ -50,7 +50,7 @@ config XFS_SUPPORT_V4
config XFS_SUPPORT_ASCII_CI
bool "Support deprecated case-insensitive ascii (ascii-ci=1) format"
depends on XFS_FS
- default y
+ default n
help
The ASCII case insensitivity filesystem feature only works correctly
on systems that have been coerced into using ISO 8859-1, and it does
@@ -67,7 +67,7 @@ config XFS_SUPPORT_ASCII_CI
filesystem is a case-insensitive filesystem. If no such string is
found, please upgrade xfsprogs to the latest version and try again.
- This option will become default N in September 2025. Support for the
+ This option became default N in September 2025. Support for the
feature will be removed entirely in September 2030. Distributors
can say N here to withdraw support earlier.
@@ -105,6 +105,7 @@ config XFS_POSIX_ACL
config XFS_RT
bool "XFS Realtime subvolume support"
depends on XFS_FS
+ default BLK_DEV_ZONED
help
If you say Y here you will be able to mount and use XFS filesystems
which contain a realtime subvolume. The realtime subvolume is a
@@ -118,6 +119,15 @@ config XFS_RT
See the xfs man page in section 5 for additional information.
+ This option is mandatory to support zoned block devices. For these
+ devices, the realtime subvolume must be backed by a zoned block
+ device, and a regular block device must be used as the main device
+ (for metadata). If the zoned block device is a host-managed SMR hard
+ disk with conventional zones at the beginning of its address space,
+ XFS will use the disk's conventional zones as the main device and the
+ remaining sequential-write-required zones as the backing storage for
+ the realtime subvolume.
+
If unsure, say N.
config XFS_DRAIN_INTENTS
@@ -136,7 +146,7 @@ config XFS_BTREE_IN_MEM
config XFS_ONLINE_SCRUB
bool "XFS online metadata check support"
- default n
+ default y
depends on XFS_FS
depends on TMPFS && SHMEM
select XFS_LIVE_HOOKS
@@ -149,17 +159,13 @@ config XFS_ONLINE_SCRUB
advantage here is to look for problems proactively so that
they can be dealt with in a controlled manner.
- This feature is considered EXPERIMENTAL. Use with caution!
-
See the xfs_scrub man page in section 8 for additional information.
- If unsure, say N.
-
config XFS_ONLINE_SCRUB_STATS
bool "XFS online metadata check usage data collection"
default y
depends on XFS_ONLINE_SCRUB
- select DEBUG_FS
+ depends on DEBUG_FS
help
If you say Y here, the kernel will gather usage data about
the online metadata check subsystem. This includes the number
@@ -170,11 +176,9 @@ config XFS_ONLINE_SCRUB_STATS
Usage data are collected in /sys/kernel/debug/xfs/scrub.
- If unsure, say N.
-
config XFS_ONLINE_REPAIR
bool "XFS online metadata repair support"
- default n
+ default y
depends on XFS_FS && XFS_ONLINE_SCRUB
select XFS_BTREE_IN_MEM
help
@@ -185,12 +189,8 @@ config XFS_ONLINE_REPAIR
formatted with secondary metadata, such as reverse mappings and inode
parent pointers.
- This feature is considered EXPERIMENTAL. Use with caution!
-
See the xfs_scrub man page in section 8 for additional information.
- If unsure, say N.
-
config XFS_WARN
bool "XFS Verbose Warnings"
depends on XFS_FS && !XFS_DEBUG
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index fb79215a509d..8ac8230c3d3c 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -92,9 +92,8 @@ xfs_ag_resv_critical(
trace_xfs_ag_resv_critical(pag, type, avail);
/* Critically low if less than 10% or max btree height remains. */
- return XFS_TEST_ERROR(avail < orig / 10 ||
- avail < mp->m_agbtree_maxlevels,
- mp, XFS_ERRTAG_AG_RESV_CRITICAL);
+ return avail < orig / 10 || avail < mp->m_agbtree_maxlevels ||
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_AG_RESV_CRITICAL);
}
/*
@@ -203,7 +202,7 @@ __xfs_ag_resv_init(
return -EINVAL;
}
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL))
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_AG_RESV_FAIL))
error = -ENOSPC;
else
error = xfs_dec_fdblocks(mp, hidden_space, true);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 7839efe050bf..ad381c73abc4 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -3321,7 +3321,7 @@ xfs_agf_read_verify(
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_agf_verify(bp);
- if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
+ if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_ALLOC_READ_AGF))
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}
}
@@ -3444,16 +3444,41 @@ xfs_alloc_read_agf(
set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
}
+
#ifdef DEBUG
- else if (!xfs_is_shutdown(mp)) {
- ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
- ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
- ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
- ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
- ASSERT(pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level));
- ASSERT(pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level));
+ /*
+ * It's possible for the AGF to be out of sync if the block device is
+ * silently dropping writes. This can happen in fstests with dmflakey
+ * enabled, which allows the buffer to be cleaned and reclaimed by
+ * memory pressure and then re-read from disk here. We will get a
+ * stale version of the AGF from disk, and nothing good can happen from
+ * here. Hence if we detect this situation, immediately shut down the
+ * filesystem.
+ *
+ * This can also happen if we are already in the middle of a forced
+ * shutdown, so don't bother checking if we are already shut down.
+ */
+ if (!xfs_is_shutdown(pag_mount(pag))) {
+ bool ok = true;
+
+ ok &= pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks);
+ ok &= pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks);
+ ok &= pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks);
+ ok &= pag->pagf_flcount == be32_to_cpu(agf->agf_flcount);
+ ok &= pag->pagf_longest == be32_to_cpu(agf->agf_longest);
+ ok &= pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level);
+ ok &= pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level);
+
+ if (XFS_IS_CORRUPT(pag_mount(pag), !ok)) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
+ xfs_trans_brelse(tp, agfbp);
+ xfs_force_shutdown(pag_mount(pag),
+ SHUTDOWN_CORRUPT_ONDISK);
+ return -EFSCORRUPTED;
+ }
}
-#endif
+#endif /* DEBUG */
+
if (agfbpp)
*agfbpp = agfbp;
else
@@ -3994,8 +4019,7 @@ __xfs_free_extent(
ASSERT(len != 0);
ASSERT(type != XFS_AG_RESV_AGFL);
- if (XFS_TEST_ERROR(false, mp,
- XFS_ERRTAG_FREE_EXTENT))
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_FREE_EXTENT))
return -EIO;
error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index a4ac37ba5d51..fa1f03c1331e 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -186,35 +186,32 @@ xfs_allocbt_init_ptr_from_cur(
ptr->s = agf->agf_cnt_root;
}
-STATIC int64_t
-xfs_bnobt_key_diff(
+STATIC int
+xfs_bnobt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a;
const struct xfs_alloc_rec *kp = &key->alloc;
- return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
+ return cmp_int(be32_to_cpu(kp->ar_startblock),
+ rec->ar_startblock);
}
-STATIC int64_t
-xfs_cntbt_key_diff(
+STATIC int
+xfs_cntbt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a;
const struct xfs_alloc_rec *kp = &key->alloc;
- int64_t diff;
- diff = (int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
- if (diff)
- return diff;
-
- return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
+ return cmp_int(be32_to_cpu(kp->ar_blockcount), rec->ar_blockcount) ?:
+ cmp_int(be32_to_cpu(kp->ar_startblock), rec->ar_startblock);
}
-STATIC int64_t
-xfs_bnobt_diff_two_keys(
+STATIC int
+xfs_bnobt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
@@ -222,29 +219,24 @@ xfs_bnobt_diff_two_keys(
{
ASSERT(!mask || mask->alloc.ar_startblock);
- return (int64_t)be32_to_cpu(k1->alloc.ar_startblock) -
- be32_to_cpu(k2->alloc.ar_startblock);
+ return cmp_int(be32_to_cpu(k1->alloc.ar_startblock),
+ be32_to_cpu(k2->alloc.ar_startblock));
}
-STATIC int64_t
-xfs_cntbt_diff_two_keys(
+STATIC int
+xfs_cntbt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
const union xfs_btree_key *mask)
{
- int64_t diff;
-
ASSERT(!mask || (mask->alloc.ar_blockcount &&
mask->alloc.ar_startblock));
- diff = be32_to_cpu(k1->alloc.ar_blockcount) -
- be32_to_cpu(k2->alloc.ar_blockcount);
- if (diff)
- return diff;
-
- return be32_to_cpu(k1->alloc.ar_startblock) -
- be32_to_cpu(k2->alloc.ar_startblock);
+ return cmp_int(be32_to_cpu(k1->alloc.ar_blockcount),
+ be32_to_cpu(k2->alloc.ar_blockcount)) ?:
+ cmp_int(be32_to_cpu(k1->alloc.ar_startblock),
+ be32_to_cpu(k2->alloc.ar_startblock));
}
static xfs_failaddr_t
@@ -438,9 +430,9 @@ const struct xfs_btree_ops xfs_bnobt_ops = {
.init_high_key_from_rec = xfs_bnobt_init_high_key_from_rec,
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
- .key_diff = xfs_bnobt_key_diff,
+ .cmp_key_with_cur = xfs_bnobt_cmp_key_with_cur,
.buf_ops = &xfs_bnobt_buf_ops,
- .diff_two_keys = xfs_bnobt_diff_two_keys,
+ .cmp_two_keys = xfs_bnobt_cmp_two_keys,
.keys_inorder = xfs_bnobt_keys_inorder,
.recs_inorder = xfs_bnobt_recs_inorder,
.keys_contiguous = xfs_allocbt_keys_contiguous,
@@ -468,9 +460,9 @@ const struct xfs_btree_ops xfs_cntbt_ops = {
.init_high_key_from_rec = xfs_cntbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
- .key_diff = xfs_cntbt_key_diff,
+ .cmp_key_with_cur = xfs_cntbt_cmp_key_with_cur,
.buf_ops = &xfs_cntbt_buf_ops,
- .diff_two_keys = xfs_cntbt_diff_two_keys,
+ .cmp_two_keys = xfs_cntbt_cmp_two_keys,
.keys_inorder = xfs_cntbt_keys_inorder,
.recs_inorder = xfs_cntbt_recs_inorder,
.keys_contiguous = NULL, /* not needed right now */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index fddb55605e0c..91c1b30ebaab 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -667,12 +667,8 @@ xfs_attr_shortform_bytesfit(
/*
* For attr2 we can try to move the forkoff if there is space in the
- * literal area, but for the old format we are done if there is no
- * space in the fixed attribute fork.
+ * literal area.
*/
- if (!xfs_has_attr2(mp))
- return 0;
-
dsize = dp->i_df.if_bytes;
switch (dp->i_df.if_format) {
@@ -723,22 +719,16 @@ xfs_attr_shortform_bytesfit(
}
/*
- * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless:
- * - noattr2 mount option is set,
- * - on-disk version bit says it is already set, or
- * - the attr2 mount option is not set to enable automatic upgrade from attr1.
+ * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless
+ * the on-disk version bit says it is already set.
*/
STATIC void
xfs_sbversion_add_attr2(
struct xfs_mount *mp,
struct xfs_trans *tp)
{
- if (xfs_has_noattr2(mp))
- return;
if (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)
return;
- if (!xfs_has_attr2(mp))
- return;
spin_lock(&mp->m_sb_lock);
xfs_add_attr2(mp);
@@ -889,7 +879,7 @@ xfs_attr_sf_removename(
/*
* Fix up the start offset of the attribute fork
*/
- if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) &&
+ if (totsize == sizeof(struct xfs_attr_sf_hdr) &&
(dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
!(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE)) &&
!xfs_has_parent(mp)) {
@@ -900,7 +890,6 @@ xfs_attr_sf_removename(
ASSERT(dp->i_forkoff);
ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) ||
(args->op_flags & XFS_DA_OP_ADDNAME) ||
- !xfs_has_attr2(mp) ||
dp->i_df.if_format == XFS_DINODE_FMT_BTREE ||
xfs_has_parent(mp));
xfs_trans_log_inode(args->trans, dp,
@@ -1040,8 +1029,7 @@ xfs_attr_shortform_allfit(
bytes += xfs_attr_sf_entsize_byname(name_loc->namelen,
be16_to_cpu(name_loc->valuelen));
}
- if (xfs_has_attr2(dp->i_mount) &&
- (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
+ if ((dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
(bytes == sizeof(struct xfs_attr_sf_hdr)))
return -1;
return xfs_attr_shortform_bytesfit(dp, bytes);
@@ -1161,7 +1149,6 @@ xfs_attr3_leaf_to_shortform(
* this case.
*/
if (!(args->op_flags & XFS_DA_OP_REPLACE)) {
- ASSERT(xfs_has_attr2(dp->i_mount));
ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE);
xfs_attr_fork_remove(dp, args->trans);
}
@@ -1225,7 +1212,7 @@ xfs_attr3_leaf_to_node(
trace_xfs_attr_leaf_to_node(args);
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) {
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) {
error = -EIO;
goto out;
}
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 4c44ce1c8a64..bff3dc226f81 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -435,6 +435,13 @@ xfs_attr_rmtval_get(
0, &bp, &xfs_attr3_rmt_buf_ops);
if (xfs_metadata_is_sick(error))
xfs_dirattr_mark_sick(args->dp, XFS_ATTR_FORK);
+ /*
+ * ENODATA from disk implies a disk medium failure;
+ * ENODATA for xattrs means attribute not found, so
+ * disambiguate that here.
+ */
+ if (error == -ENODATA)
+ error = -EIO;
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 63255820b58a..53ef4b7e504d 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -997,8 +997,7 @@ xfs_bmap_add_attrfork_local(
static int
xfs_bmap_set_attrforkoff(
struct xfs_inode *ip,
- int size,
- int *version)
+ int size)
{
int default_size = xfs_default_attroffset(ip) >> 3;
@@ -1012,8 +1011,6 @@ xfs_bmap_set_attrforkoff(
ip->i_forkoff = xfs_attr_shortform_bytesfit(ip, size);
if (!ip->i_forkoff)
ip->i_forkoff = default_size;
- else if (xfs_has_attr2(ip->i_mount) && version)
- *version = 2;
break;
default:
ASSERT(0);
@@ -1035,7 +1032,6 @@ xfs_bmap_add_attrfork(
int rsvd) /* xact may use reserved blks */
{
struct xfs_mount *mp = tp->t_mountp;
- int version = 1; /* superblock attr version */
int logflags; /* logging flags */
int error; /* error return value */
@@ -1045,7 +1041,7 @@ xfs_bmap_add_attrfork(
ASSERT(!xfs_inode_has_attr_fork(ip));
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = xfs_bmap_set_attrforkoff(ip, size, &version);
+ error = xfs_bmap_set_attrforkoff(ip, size);
if (error)
return error;
@@ -1069,16 +1065,12 @@ xfs_bmap_add_attrfork(
xfs_trans_log_inode(tp, ip, logflags);
if (error)
return error;
- if (!xfs_has_attr(mp) ||
- (!xfs_has_attr2(mp) && version == 2)) {
+ if (!xfs_has_attr(mp)) {
bool log_sb = false;
spin_lock(&mp->m_sb_lock);
if (!xfs_has_attr(mp)) {
xfs_add_attr(mp);
- log_sb = true;
- }
- if (!xfs_has_attr2(mp) && version == 2) {
xfs_add_attr2(mp);
log_sb = true;
}
@@ -3312,6 +3304,11 @@ xfs_bmap_compute_alignments(
align = xfs_get_cowextsz_hint(ap->ip);
else if (ap->datatype & XFS_ALLOC_USERDATA)
align = xfs_get_extsz_hint(ap->ip);
+
+ /* Try to align start block to any minimum allocation alignment */
+ if (align > 1 && (ap->flags & XFS_BMAPI_EXTSZALIGN))
+ args->alignment = align;
+
if (align) {
if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0,
ap->eof, 0, ap->conv, &ap->offset,
@@ -3657,8 +3654,7 @@ xfs_bmap_btalloc(
/* Trim the allocation back to the maximum an AG can fit. */
args.maxlen = min(ap->length, mp->m_ag_max_usable);
- if (unlikely(XFS_TEST_ERROR(false, mp,
- XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT)))
+ if (unlikely(XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT)))
error = xfs_bmap_exact_minlen_extent_alloc(ap, &args);
else if ((ap->datatype & XFS_ALLOC_USERDATA) &&
xfs_inode_is_filestream(ap->ip))
@@ -3844,7 +3840,7 @@ xfs_bmapi_read(
}
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -4195,7 +4191,7 @@ xfs_bmapi_write(
(XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO));
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -4540,7 +4536,7 @@ xfs_bmapi_remap(
(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC));
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -5674,7 +5670,7 @@ xfs_bmap_collapse_extents(
int logflags = 0;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -5790,7 +5786,7 @@ xfs_bmap_insert_extents(
int logflags = 0;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -5895,7 +5891,7 @@ xfs_bmap_split_extent(
int i = 0;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, whichfork);
return -EFSCORRUPTED;
}
@@ -6060,7 +6056,7 @@ xfs_bmap_finish_one(
trace_xfs_bmap_deferred(bi);
- if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE))
+ if (XFS_TEST_ERROR(tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE))
return -EIO;
switch (bi->bi_type) {
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index b4d9c6e0f3f9..d5f2729305fa 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -87,6 +87,9 @@ struct xfs_bmalloca {
/* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */
#define XFS_BMAPI_NORMAP (1u << 10)
+/* Try to align allocations to the extent size hint */
+#define XFS_BMAPI_EXTSZALIGN (1u << 11)
+
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
@@ -98,7 +101,8 @@ struct xfs_bmalloca {
{ XFS_BMAPI_REMAP, "REMAP" }, \
{ XFS_BMAPI_COWFORK, "COWFORK" }, \
{ XFS_BMAPI_NODISCARD, "NODISCARD" }, \
- { XFS_BMAPI_NORMAP, "NORMAP" }
+ { XFS_BMAPI_NORMAP, "NORMAP" },\
+ { XFS_BMAPI_EXTSZALIGN, "EXTSZALIGN" }
static inline int xfs_bmapi_aflag(int w)
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 908d7b050e9c..188feac04b60 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -369,38 +369,26 @@ xfs_bmbt_init_rec_from_cur(
xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
}
-STATIC int64_t
-xfs_bmbt_key_diff(
+STATIC int
+xfs_bmbt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
- return (int64_t)be64_to_cpu(key->bmbt.br_startoff) -
- cur->bc_rec.b.br_startoff;
+ return cmp_int(be64_to_cpu(key->bmbt.br_startoff),
+ cur->bc_rec.b.br_startoff);
}
-STATIC int64_t
-xfs_bmbt_diff_two_keys(
+STATIC int
+xfs_bmbt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
const union xfs_btree_key *mask)
{
- uint64_t a = be64_to_cpu(k1->bmbt.br_startoff);
- uint64_t b = be64_to_cpu(k2->bmbt.br_startoff);
-
ASSERT(!mask || mask->bmbt.br_startoff);
- /*
- * Note: This routine previously casted a and b to int64 and subtracted
- * them to generate a result. This lead to problems if b was the
- * "maximum" key value (all ones) being signed incorrectly, hence this
- * somewhat less efficient version.
- */
- if (a > b)
- return 1;
- if (b > a)
- return -1;
- return 0;
+ return cmp_int(be64_to_cpu(k1->bmbt.br_startoff),
+ be64_to_cpu(k2->bmbt.br_startoff));
}
static xfs_failaddr_t
@@ -647,8 +635,8 @@ const struct xfs_btree_ops xfs_bmbt_ops = {
.init_key_from_rec = xfs_bmbt_init_key_from_rec,
.init_high_key_from_rec = xfs_bmbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
- .key_diff = xfs_bmbt_key_diff,
- .diff_two_keys = xfs_bmbt_diff_two_keys,
+ .cmp_key_with_cur = xfs_bmbt_cmp_key_with_cur,
+ .cmp_two_keys = xfs_bmbt_cmp_two_keys,
.buf_ops = &xfs_bmbt_buf_ops,
.keys_inorder = xfs_bmbt_keys_inorder,
.recs_inorder = xfs_bmbt_recs_inorder,
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 299ce7fd11b0..dbe9df8c3300 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -306,7 +306,7 @@ xfs_btree_check_block(
fa = __xfs_btree_check_block(cur, block, level, bp);
if (XFS_IS_CORRUPT(mp, fa != NULL) ||
- XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) {
+ XFS_TEST_ERROR(mp, xfs_btree_block_errtag(cur))) {
if (bp)
trace_xfs_btree_corrupt(bp, _RET_IP_);
xfs_btree_mark_sick(cur);
@@ -1985,7 +1985,7 @@ xfs_btree_lookup(
int *stat) /* success/failure */
{
struct xfs_btree_block *block; /* current btree block */
- int64_t diff; /* difference for the current key */
+ int cmp_r; /* current key comparison result */
int error; /* error return value */
int keyno; /* current key number */
int level; /* level in the btree */
@@ -2013,13 +2013,13 @@ xfs_btree_lookup(
* on the lookup record, then follow the corresponding block
* pointer down to the next level.
*/
- for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
+ for (level = cur->bc_nlevels - 1, cmp_r = 1; level >= 0; level--) {
/* Get the block we need to do the lookup on. */
error = xfs_btree_lookup_get_block(cur, level, pp, &block);
if (error)
goto error0;
- if (diff == 0) {
+ if (cmp_r == 0) {
/*
* If we already had a key match at a higher level, we
* know we need to use the first entry in this block.
@@ -2065,15 +2065,16 @@ xfs_btree_lookup(
keyno, block, &key);
/*
- * Compute difference to get next direction:
+ * Compute comparison result to get next
+ * direction:
* - less than, move right
* - greater than, move left
* - equal, we're done
*/
- diff = cur->bc_ops->key_diff(cur, kp);
- if (diff < 0)
+ cmp_r = cur->bc_ops->cmp_key_with_cur(cur, kp);
+ if (cmp_r < 0)
low = keyno + 1;
- else if (diff > 0)
+ else if (cmp_r > 0)
high = keyno - 1;
else
break;
@@ -2089,7 +2090,7 @@ xfs_btree_lookup(
* If we moved left, need the previous key number,
* unless there isn't one.
*/
- if (diff > 0 && --keyno < 1)
+ if (cmp_r > 0 && --keyno < 1)
keyno = 1;
pp = xfs_btree_ptr_addr(cur, keyno, block);
@@ -2102,7 +2103,7 @@ xfs_btree_lookup(
}
/* Done with the search. See if we need to adjust the results. */
- if (dir != XFS_LOOKUP_LE && diff < 0) {
+ if (dir != XFS_LOOKUP_LE && cmp_r < 0) {
keyno++;
/*
* If ge search and we went off the end of the block, but it's
@@ -2125,14 +2126,14 @@ xfs_btree_lookup(
*stat = 1;
return 0;
}
- } else if (dir == XFS_LOOKUP_LE && diff > 0)
+ } else if (dir == XFS_LOOKUP_LE && cmp_r > 0)
keyno--;
cur->bc_levels[0].ptr = keyno;
/* Return if we succeeded or not. */
if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
*stat = 0;
- else if (dir != XFS_LOOKUP_EQ || diff == 0)
+ else if (dir != XFS_LOOKUP_EQ || cmp_r == 0)
*stat = 1;
else
*stat = 0;
@@ -5058,7 +5059,7 @@ xfs_btree_simple_query_range(
int error;
ASSERT(cur->bc_ops->init_high_key_from_rec);
- ASSERT(cur->bc_ops->diff_two_keys);
+ ASSERT(cur->bc_ops->cmp_two_keys);
/*
* Find the leftmost record. The btree cursor must be set
@@ -5352,15 +5353,15 @@ xfs_btree_count_blocks(
}
/* Compare two btree pointers. */
-int64_t
-xfs_btree_diff_two_ptrs(
+int
+xfs_btree_cmp_two_ptrs(
struct xfs_btree_cur *cur,
const union xfs_btree_ptr *a,
const union xfs_btree_ptr *b)
{
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
- return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
- return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
+ return cmp_int(be64_to_cpu(a->l), be64_to_cpu(b->l));
+ return cmp_int(be32_to_cpu(a->s), be32_to_cpu(b->s));
}
struct xfs_btree_has_records {
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 355b304696e6..60e78572e725 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -171,20 +171,23 @@ struct xfs_btree_ops {
void (*init_high_key_from_rec)(union xfs_btree_key *key,
const union xfs_btree_rec *rec);
- /* difference between key value and cursor value */
- int64_t (*key_diff)(struct xfs_btree_cur *cur,
- const union xfs_btree_key *key);
+ /*
+ * Compare key value and cursor value -- positive if key > cur,
+ * negative if key < cur, and zero if equal.
+ */
+ int (*cmp_key_with_cur)(struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key);
/*
- * Difference between key2 and key1 -- positive if key1 > key2,
- * negative if key1 < key2, and zero if equal. If the @mask parameter
- * is non NULL, each key field to be used in the comparison must
- * contain a nonzero value.
+ * Compare key1 and key2 -- positive if key1 > key2, negative if
+ * key1 < key2, and zero if equal. If the @mask parameter is non NULL,
+ * each key field to be used in the comparison must contain a nonzero
+ * value.
*/
- int64_t (*diff_two_keys)(struct xfs_btree_cur *cur,
- const union xfs_btree_key *key1,
- const union xfs_btree_key *key2,
- const union xfs_btree_key *mask);
+ int (*cmp_two_keys)(struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2,
+ const union xfs_btree_key *mask);
const struct xfs_buf_ops *buf_ops;
@@ -516,9 +519,9 @@ struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur,
int level, struct xfs_buf **bpp);
bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur,
const union xfs_btree_ptr *ptr);
-int64_t xfs_btree_diff_two_ptrs(struct xfs_btree_cur *cur,
- const union xfs_btree_ptr *a,
- const union xfs_btree_ptr *b);
+int xfs_btree_cmp_two_ptrs(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *a,
+ const union xfs_btree_ptr *b);
void xfs_btree_get_sibling(struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
union xfs_btree_ptr *ptr, int lr);
@@ -546,7 +549,7 @@ xfs_btree_keycmp_lt(
const union xfs_btree_key *key1,
const union xfs_btree_key *key2)
{
- return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) < 0;
+ return cur->bc_ops->cmp_two_keys(cur, key1, key2, NULL) < 0;
}
static inline bool
@@ -555,7 +558,7 @@ xfs_btree_keycmp_gt(
const union xfs_btree_key *key1,
const union xfs_btree_key *key2)
{
- return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) > 0;
+ return cur->bc_ops->cmp_two_keys(cur, key1, key2, NULL) > 0;
}
static inline bool
@@ -564,7 +567,7 @@ xfs_btree_keycmp_eq(
const union xfs_btree_key *key1,
const union xfs_btree_key *key2)
{
- return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) == 0;
+ return cur->bc_ops->cmp_two_keys(cur, key1, key2, NULL) == 0;
}
static inline bool
@@ -602,7 +605,7 @@ xfs_btree_masked_keycmp_lt(
const union xfs_btree_key *key2,
const union xfs_btree_key *mask)
{
- return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) < 0;
+ return cur->bc_ops->cmp_two_keys(cur, key1, key2, mask) < 0;
}
static inline bool
@@ -612,7 +615,7 @@ xfs_btree_masked_keycmp_gt(
const union xfs_btree_key *key2,
const union xfs_btree_key *mask)
{
- return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) > 0;
+ return cur->bc_ops->cmp_two_keys(cur, key1, key2, mask) > 0;
}
static inline bool
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 17d9e6154f19..90f7fc219fcc 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -565,7 +565,7 @@ xfs_da3_split(
trace_xfs_da_split(state->args);
- if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT))
+ if (XFS_TEST_ERROR(state->mp, XFS_ERRTAG_DA_LEAF_SPLIT))
return -EIO;
/*
@@ -2833,6 +2833,12 @@ xfs_da_read_buf(
&bp, ops);
if (xfs_metadata_is_sick(error))
xfs_dirattr_mark_sick(dp, whichfork);
+ /*
+ * ENODATA from disk implies a disk medium failure; ENODATA for
+ * xattrs means attribute not found, so disambiguate that here.
+ */
+ if (error == -ENODATA && whichfork == XFS_ATTR_FORK)
+ error = -EIO;
if (error)
goto out_free;
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 1775abcfa04d..82a338458a51 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -223,7 +223,7 @@ xfs_dir_ino_validate(
bool ino_ok = xfs_verify_dir_ino(mp, ino);
if (XFS_IS_CORRUPT(mp, !ino_ok) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DIR_INO_VALIDATE)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_DIR_INO_VALIDATE)) {
xfs_warn(mp, "Invalid inode number 0x%Lx",
(unsigned long long) ino);
return -EFSCORRUPTED;
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index a53c5d40e084..57e47077c75a 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -4,14 +4,22 @@
* Copyright (C) 2017 Oracle.
* All Rights Reserved.
*/
-#ifndef __XFS_ERRORTAG_H_
+#if !defined(__XFS_ERRORTAG_H_) || defined(XFS_ERRTAG)
#define __XFS_ERRORTAG_H_
/*
- * error injection tags - the labels can be anything you want
- * but each tag should have its own unique number
+ * There are two ways to use this header file. The first way is to #include it
+ * bare, which will define all the XFS_ERRTAG_* error injection knobs for use
+ * with the XFS_TEST_ERROR macro. The second way is to enclose the #include
+ * with a #define for an XFS_ERRTAG macro, in which case the header will define
+ * an XFS_ERRTAGS macro that expands to invoke that XFS_ERRTAG macro for each
+ * defined error injection knob.
*/
+/*
+ * These are the actual error injection tags. The numbers should be consecutive
+ * because arrays are sized based on the maximum.
+ */
#define XFS_ERRTAG_NOERROR 0
#define XFS_ERRTAG_IFLUSH_1 1
#define XFS_ERRTAG_IFLUSH_2 2
@@ -65,55 +73,69 @@
#define XFS_ERRTAG_WRITE_DELAY_MS 43
#define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44
#define XFS_ERRTAG_METAFILE_RESV_CRITICAL 45
-#define XFS_ERRTAG_MAX 46
+#define XFS_ERRTAG_FORCE_ZERO_RANGE 46
+#define XFS_ERRTAG_MAX 47
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
*/
#define XFS_RANDOM_DEFAULT 100
-#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4)
-#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10)
-#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10)
-#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10)
-#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10)
-#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_FREE_EXTENT 1
-#define XFS_RANDOM_RMAP_FINISH_ONE 1
-#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1
-#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1
-#define XFS_RANDOM_BMAP_FINISH_ONE 1
-#define XFS_RANDOM_AG_RESV_CRITICAL 4
-#define XFS_RANDOM_LOG_BAD_CRC 1
-#define XFS_RANDOM_LOG_ITEM_PIN 1
-#define XFS_RANDOM_BUF_LRU_REF 2
-#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1
-#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1
-#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10)
-#define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT
-#define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1
-#define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1
-#define XFS_RANDOM_AG_RESV_FAIL 1
-#define XFS_RANDOM_LARP 1
-#define XFS_RANDOM_DA_LEAF_SPLIT 1
-#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1
-#define XFS_RANDOM_WB_DELAY_MS 3000
-#define XFS_RANDOM_WRITE_DELAY_MS 3000
-#define XFS_RANDOM_EXCHMAPS_FINISH_ONE 1
-#define XFS_RANDOM_METAFILE_RESV_CRITICAL 4
+
+/*
+ * Table of error injection knobs. The parameters to the XFS_ERRTAG macro are:
+ * 1. The XFS_ERRTAG_ flag but without the prefix;
+ * 2. The name of the sysfs knob; and
+ * 3. The default value for the knob.
+ */
+#ifdef XFS_ERRTAG
+# undef XFS_ERRTAGS
+# define XFS_ERRTAGS \
+XFS_ERRTAG(NOERROR, noerror, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IFLUSH_1, iflush1, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IFLUSH_2, iflush2, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IFLUSH_3, iflush3, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IFLUSH_4, iflush4, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IFLUSH_5, iflush5, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IFLUSH_6, iflush6, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(DA_READ_BUF, dareadbuf, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(BTREE_CHECK_LBLOCK, btree_chk_lblk, XFS_RANDOM_DEFAULT/4) \
+XFS_ERRTAG(BTREE_CHECK_SBLOCK, btree_chk_sblk, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(ALLOC_READ_AGF, readagf, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IALLOC_READ_AGI, readagi, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(ITOBP_INOTOBP, itobp, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IUNLINK, iunlink, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IUNLINK_REMOVE, iunlinkrm, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(DIR_INO_VALIDATE, dirinovalid, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(BULKSTAT_READ_CHUNK, bulkstat, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(IODONE_IOERR, logiodone, XFS_RANDOM_DEFAULT/10) \
+XFS_ERRTAG(STRATREAD_IOERR, stratread, XFS_RANDOM_DEFAULT/10) \
+XFS_ERRTAG(STRATCMPL_IOERR, stratcmpl, XFS_RANDOM_DEFAULT/10) \
+XFS_ERRTAG(DIOWRITE_IOERR, diowrite, XFS_RANDOM_DEFAULT/10) \
+XFS_ERRTAG(BMAPIFORMAT, bmapifmt, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(FREE_EXTENT, free_extent, 1) \
+XFS_ERRTAG(RMAP_FINISH_ONE, rmap_finish_one, 1) \
+XFS_ERRTAG(REFCOUNT_CONTINUE_UPDATE, refcount_continue_update, 1) \
+XFS_ERRTAG(REFCOUNT_FINISH_ONE, refcount_finish_one, 1) \
+XFS_ERRTAG(BMAP_FINISH_ONE, bmap_finish_one, 1) \
+XFS_ERRTAG(AG_RESV_CRITICAL, ag_resv_critical, 4) \
+XFS_ERRTAG(LOG_BAD_CRC, log_bad_crc, 1) \
+XFS_ERRTAG(LOG_ITEM_PIN, log_item_pin, 1) \
+XFS_ERRTAG(BUF_LRU_REF, buf_lru_ref, 2) \
+XFS_ERRTAG(FORCE_SCRUB_REPAIR, force_repair, 1) \
+XFS_ERRTAG(FORCE_SUMMARY_RECALC, bad_summary, 1) \
+XFS_ERRTAG(IUNLINK_FALLBACK, iunlink_fallback, XFS_RANDOM_DEFAULT/10) \
+XFS_ERRTAG(BUF_IOERROR, buf_ioerror, XFS_RANDOM_DEFAULT) \
+XFS_ERRTAG(REDUCE_MAX_IEXTENTS, reduce_max_iextents, 1) \
+XFS_ERRTAG(BMAP_ALLOC_MINLEN_EXTENT, bmap_alloc_minlen_extent, 1) \
+XFS_ERRTAG(AG_RESV_FAIL, ag_resv_fail, 1) \
+XFS_ERRTAG(LARP, larp, 1) \
+XFS_ERRTAG(DA_LEAF_SPLIT, da_leaf_split, 1) \
+XFS_ERRTAG(ATTR_LEAF_TO_NODE, attr_leaf_to_node, 1) \
+XFS_ERRTAG(WB_DELAY_MS, wb_delay_ms, 3000) \
+XFS_ERRTAG(WRITE_DELAY_MS, write_delay_ms, 3000) \
+XFS_ERRTAG(EXCHMAPS_FINISH_ONE, exchmaps_finish_one, 1) \
+XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) \
+XFS_ERRTAG(FORCE_ZERO_RANGE, force_zero_range, 4)
+#endif /* XFS_ERRTAG */
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_exchmaps.c b/fs/xfs/libxfs/xfs_exchmaps.c
index 3f1d6a98c118..932ee4619e9e 100644
--- a/fs/xfs/libxfs/xfs_exchmaps.c
+++ b/fs/xfs/libxfs/xfs_exchmaps.c
@@ -616,7 +616,7 @@ xfs_exchmaps_finish_one(
return error;
}
- if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE))
+ if (XFS_TEST_ERROR(tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE))
return -EIO;
/* If we still have work to do, ask for a new transaction. */
@@ -882,7 +882,7 @@ xmi_ensure_delta_nextents(
&new_nextents))
return -EFBIG;
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) &&
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) &&
new_nextents > 10)
return -EFBIG;
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 9566a7623365..779dac59b1f3 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -112,7 +112,7 @@ typedef struct xfs_sb {
uint16_t sb_sectsize; /* volume sector size, bytes */
uint16_t sb_inodesize; /* inode size, bytes */
uint16_t sb_inopblock; /* inodes per block */
- char sb_fname[XFSLABEL_MAX]; /* file system name */
+ char sb_fname[XFSLABEL_MAX] __nonstring; /* file system name */
uint8_t sb_blocklog; /* log2 of sb_blocksize */
uint8_t sb_sectlog; /* log2 of sb_sectsize */
uint8_t sb_inodelog; /* log2 of sb_inodesize */
diff --git a/fs/xfs/libxfs/xfs_group.c b/fs/xfs/libxfs/xfs_group.c
index e9d76bcdc820..792f76d2e2a0 100644
--- a/fs/xfs/libxfs/xfs_group.c
+++ b/fs/xfs/libxfs/xfs_group.c
@@ -163,7 +163,8 @@ xfs_group_free(
xfs_defer_drain_free(&xg->xg_intents_drain);
#ifdef __KERNEL__
- kfree(xg->xg_busy_extents);
+ if (xfs_group_has_extent_busy(xg->xg_mount, xg->xg_type))
+ kfree(xg->xg_busy_extents);
#endif
if (uninit)
@@ -171,7 +172,8 @@ xfs_group_free(
/* drop the mount's active reference */
xfs_group_rele(xg);
- XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_active_ref) != 0);
+ XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_active_ref) > 0);
+ XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_active_ref) < 0);
kfree_rcu_mightsleep(xg);
}
@@ -189,9 +191,11 @@ xfs_group_insert(
xg->xg_type = type;
#ifdef __KERNEL__
- xg->xg_busy_extents = xfs_extent_busy_alloc();
- if (!xg->xg_busy_extents)
- return -ENOMEM;
+ if (xfs_group_has_extent_busy(mp, type)) {
+ xg->xg_busy_extents = xfs_extent_busy_alloc();
+ if (!xg->xg_busy_extents)
+ return -ENOMEM;
+ }
spin_lock_init(&xg->xg_state_lock);
xfs_hooks_init(&xg->xg_rmap_update_hooks);
#endif
@@ -210,7 +214,8 @@ xfs_group_insert(
out_drain:
xfs_defer_drain_free(&xg->xg_intents_drain);
#ifdef __KERNEL__
- kfree(xg->xg_busy_extents);
+ if (xfs_group_has_extent_busy(xg->xg_mount, xg->xg_type))
+ kfree(xg->xg_busy_extents);
#endif
return error;
}
diff --git a/fs/xfs/libxfs/xfs_group.h b/fs/xfs/libxfs/xfs_group.h
index 4423932a2313..4ae638f1c2c5 100644
--- a/fs/xfs/libxfs/xfs_group.h
+++ b/fs/xfs/libxfs/xfs_group.h
@@ -98,6 +98,15 @@ xfs_group_max_blocks(
return xg->xg_mount->m_groups[xg->xg_type].blocks;
}
+static inline xfs_rfsblock_t
+xfs_groups_to_rfsbs(
+ struct xfs_mount *mp,
+ uint32_t nr_groups,
+ enum xfs_group_type type)
+{
+ return (xfs_rfsblock_t)mp->m_groups[type].blocks * nr_groups;
+}
+
static inline xfs_fsblock_t
xfs_group_start_fsb(
struct xfs_group *xg)
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 0c47b5c6ca7d..d97295eaebe6 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2140,7 +2140,7 @@ xfs_difree_inobt(
* remove the chunk if the block size is large enough for multiple inode
* chunks (that might not be free).
*/
- if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
+ if (rec.ir_free == XFS_INOBT_ALL_FREE &&
mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
xic->deleted = true;
xic->first_ino = xfs_agino_to_ino(pag, rec.ir_startino);
@@ -2286,7 +2286,7 @@ xfs_difree_finobt(
* enough for multiple chunks. Leave the finobt record to remain in sync
* with the inobt.
*/
- if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE &&
+ if (rec.ir_free == XFS_INOBT_ALL_FREE &&
mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
error = xfs_btree_delete(cur, &i);
if (error)
@@ -2706,7 +2706,7 @@ xfs_agi_read_verify(
xfs_verifier_error(bp, -EFSBADCRC, __this_address);
else {
fa = xfs_agi_verify(bp);
- if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI))
+ if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_IALLOC_READ_AGI))
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}
}
@@ -2801,12 +2801,35 @@ xfs_ialloc_read_agi(
set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
}
+#ifdef DEBUG
/*
- * It's possible for these to be out of sync if
- * we are in the middle of a forced shutdown.
+ * It's possible for the AGI to be out of sync if the block device is
+ * silently dropping writes. This can happen in fstests with dmflakey
+ * enabled, which allows the buffer to be cleaned and reclaimed by
+ * memory pressure and then re-read from disk here. We will get a
+ * stale version of the AGI from disk, and nothing good can happen from
+ * here. Hence if we detect this situation, immediately shut down the
+ * filesystem.
+ *
+ * This can also happen if we are already in the middle of a forced
+ * shutdown, so don't bother checking if we are already shut down.
*/
- ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
- xfs_is_shutdown(pag_mount(pag)));
+ if (!xfs_is_shutdown(pag_mount(pag))) {
+ bool ok = true;
+
+ ok &= pag->pagi_freecount == be32_to_cpu(agi->agi_freecount);
+ ok &= pag->pagi_count == be32_to_cpu(agi->agi_count);
+
+ if (XFS_IS_CORRUPT(pag_mount(pag), !ok)) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
+ xfs_trans_brelse(tp, agibp);
+ xfs_force_shutdown(pag_mount(pag),
+ SHUTDOWN_CORRUPT_ONDISK);
+ return -EFSCORRUPTED;
+ }
+ }
+#endif /* DEBUG */
+
if (agibpp)
*agibpp = agibp;
else
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 6f270d8f4270..100afdd66cdd 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -265,17 +265,17 @@ xfs_finobt_init_ptr_from_cur(
ptr->s = agi->agi_free_root;
}
-STATIC int64_t
-xfs_inobt_key_diff(
+STATIC int
+xfs_inobt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
- return (int64_t)be32_to_cpu(key->inobt.ir_startino) -
- cur->bc_rec.i.ir_startino;
+ return cmp_int(be32_to_cpu(key->inobt.ir_startino),
+ cur->bc_rec.i.ir_startino);
}
-STATIC int64_t
-xfs_inobt_diff_two_keys(
+STATIC int
+xfs_inobt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
@@ -283,8 +283,8 @@ xfs_inobt_diff_two_keys(
{
ASSERT(!mask || mask->inobt.ir_startino);
- return (int64_t)be32_to_cpu(k1->inobt.ir_startino) -
- be32_to_cpu(k2->inobt.ir_startino);
+ return cmp_int(be32_to_cpu(k1->inobt.ir_startino),
+ be32_to_cpu(k2->inobt.ir_startino));
}
static xfs_failaddr_t
@@ -430,9 +430,9 @@ const struct xfs_btree_ops xfs_inobt_ops = {
.init_high_key_from_rec = xfs_inobt_init_high_key_from_rec,
.init_rec_from_cur = xfs_inobt_init_rec_from_cur,
.init_ptr_from_cur = xfs_inobt_init_ptr_from_cur,
- .key_diff = xfs_inobt_key_diff,
+ .cmp_key_with_cur = xfs_inobt_cmp_key_with_cur,
.buf_ops = &xfs_inobt_buf_ops,
- .diff_two_keys = xfs_inobt_diff_two_keys,
+ .cmp_two_keys = xfs_inobt_cmp_two_keys,
.keys_inorder = xfs_inobt_keys_inorder,
.recs_inorder = xfs_inobt_recs_inorder,
.keys_contiguous = xfs_inobt_keys_contiguous,
@@ -460,9 +460,9 @@ const struct xfs_btree_ops xfs_finobt_ops = {
.init_high_key_from_rec = xfs_inobt_init_high_key_from_rec,
.init_rec_from_cur = xfs_inobt_init_rec_from_cur,
.init_ptr_from_cur = xfs_finobt_init_ptr_from_cur,
- .key_diff = xfs_inobt_key_diff,
+ .cmp_key_with_cur = xfs_inobt_cmp_key_with_cur,
.buf_ops = &xfs_finobt_buf_ops,
- .diff_two_keys = xfs_inobt_diff_two_keys,
+ .cmp_two_keys = xfs_inobt_cmp_two_keys,
.keys_inorder = xfs_inobt_keys_inorder,
.recs_inorder = xfs_inobt_recs_inorder,
.keys_contiguous = xfs_inobt_keys_contiguous,
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index aa13fc00afd7..b1812b2c3cce 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -61,8 +61,8 @@ xfs_inode_buf_verify(
di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
xfs_dinode_good_version(mp, dip->di_version) &&
xfs_verify_agino_or_null(bp->b_pag, unlinked_ino);
- if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
- XFS_ERRTAG_ITOBP_INOTOBP))) {
+ if (unlikely(!di_ok ||
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_ITOBP_INOTOBP))) {
if (readahead) {
bp->b_flags &= ~XBF_DONE;
xfs_buf_ioerror(bp, -EIO);
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 4f99b90add55..1772d82f2d68 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -756,8 +756,7 @@ xfs_iext_count_extend(
if (nr_exts < ifp->if_nextents)
return -EFBIG;
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) &&
- nr_exts > 10)
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && nr_exts > 10)
return -EFBIG;
if (nr_exts > xfs_iext_max_nextents(has_large, whichfork)) {
diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
index 48fe49a5f050..309ce6dd5553 100644
--- a/fs/xfs/libxfs/xfs_inode_util.c
+++ b/fs/xfs/libxfs/xfs_inode_util.c
@@ -299,17 +299,6 @@ xfs_inode_init(
} else {
inode_init_owner(args->idmap, inode, dir, args->mode);
}
-
- /*
- * If the group ID of the new file does not match the effective
- * group ID or one of the supplementary group IDs, the S_ISGID
- * bit is cleared (and only if the irix_sgid_inherit
- * compatibility variable is set).
- */
- if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
- !vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode)))
- inode->i_mode &= ~S_ISGID;
-
ip->i_projid = xfs_get_initial_prid(pip);
}
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 0d637c276db0..908e7060428c 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -31,6 +31,7 @@ typedef uint32_t xlog_tid_t;
#define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */
#define XLOG_MAX_RECORD_BSIZE (256*1024)
#define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */
+#define XLOG_CYCLE_DATA_SIZE (XLOG_HEADER_CYCLE_SIZE / BBSIZE)
#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */
#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */
#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */
@@ -86,43 +87,6 @@ struct xfs_unmount_log_format {
uint32_t pad2; /* may as well make it 64 bits */
};
-/* Region types for iovec's i_type */
-#define XLOG_REG_TYPE_BFORMAT 1
-#define XLOG_REG_TYPE_BCHUNK 2
-#define XLOG_REG_TYPE_EFI_FORMAT 3
-#define XLOG_REG_TYPE_EFD_FORMAT 4
-#define XLOG_REG_TYPE_IFORMAT 5
-#define XLOG_REG_TYPE_ICORE 6
-#define XLOG_REG_TYPE_IEXT 7
-#define XLOG_REG_TYPE_IBROOT 8
-#define XLOG_REG_TYPE_ILOCAL 9
-#define XLOG_REG_TYPE_IATTR_EXT 10
-#define XLOG_REG_TYPE_IATTR_BROOT 11
-#define XLOG_REG_TYPE_IATTR_LOCAL 12
-#define XLOG_REG_TYPE_QFORMAT 13
-#define XLOG_REG_TYPE_DQUOT 14
-#define XLOG_REG_TYPE_QUOTAOFF 15
-#define XLOG_REG_TYPE_LRHEADER 16
-#define XLOG_REG_TYPE_UNMOUNT 17
-#define XLOG_REG_TYPE_COMMIT 18
-#define XLOG_REG_TYPE_TRANSHDR 19
-#define XLOG_REG_TYPE_ICREATE 20
-#define XLOG_REG_TYPE_RUI_FORMAT 21
-#define XLOG_REG_TYPE_RUD_FORMAT 22
-#define XLOG_REG_TYPE_CUI_FORMAT 23
-#define XLOG_REG_TYPE_CUD_FORMAT 24
-#define XLOG_REG_TYPE_BUI_FORMAT 25
-#define XLOG_REG_TYPE_BUD_FORMAT 26
-#define XLOG_REG_TYPE_ATTRI_FORMAT 27
-#define XLOG_REG_TYPE_ATTRD_FORMAT 28
-#define XLOG_REG_TYPE_ATTR_NAME 29
-#define XLOG_REG_TYPE_ATTR_VALUE 30
-#define XLOG_REG_TYPE_XMI_FORMAT 31
-#define XLOG_REG_TYPE_XMD_FORMAT 32
-#define XLOG_REG_TYPE_ATTR_NEWNAME 33
-#define XLOG_REG_TYPE_ATTR_NEWVALUE 34
-#define XLOG_REG_TYPE_MAX 34
-
/*
* Flags to log operation header
*
@@ -141,14 +105,13 @@ struct xfs_unmount_log_format {
#define XLOG_END_TRANS 0x10 /* End a continued transaction */
#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */
-
-typedef struct xlog_op_header {
+struct xlog_op_header {
__be32 oh_tid; /* transaction id of operation : 4 b */
__be32 oh_len; /* bytes in data region : 4 b */
__u8 oh_clientid; /* who sent me this : 1 b */
__u8 oh_flags; /* : 1 b */
__u16 oh_res2; /* 32 bit align : 2 b */
-} xlog_op_header_t;
+};
/* valid values for h_fmt */
#define XLOG_FMT_UNKNOWN 0
@@ -163,7 +126,17 @@ typedef struct xlog_op_header {
#define XLOG_FMT XLOG_FMT_LINUX_LE
#endif
-typedef struct xlog_rec_header {
+struct xlog_rec_ext_header {
+ __be32 xh_cycle; /* write cycle of log */
+ __be32 xh_cycle_data[XLOG_CYCLE_DATA_SIZE];
+ __u8 xh_reserved[252];
+};
+
+/* actual ext header payload size for checksumming */
+#define XLOG_REC_EXT_SIZE \
+ offsetofend(struct xlog_rec_ext_header, xh_cycle_data)
+
+struct xlog_rec_header {
__be32 h_magicno; /* log record (LR) identifier : 4 */
__be32 h_cycle; /* write cycle of log : 4 */
__be32 h_version; /* LR version : 4 */
@@ -173,34 +146,50 @@ typedef struct xlog_rec_header {
__le32 h_crc; /* crc of log record : 4 */
__be32 h_prev_block; /* block number to previous LR : 4 */
__be32 h_num_logops; /* number of log operations in this LR : 4 */
- __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
- /* new fields */
+ __be32 h_cycle_data[XLOG_CYCLE_DATA_SIZE];
+
+ /* fields added by the Linux port: */
__be32 h_fmt; /* format of log record : 4 */
uuid_t h_fs_uuid; /* uuid of FS : 16 */
+
+ /* fields added for log v2: */
__be32 h_size; /* iclog size : 4 */
-} xlog_rec_header_t;
-typedef struct xlog_rec_ext_header {
- __be32 xh_cycle; /* write cycle of log : 4 */
- __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */
-} xlog_rec_ext_header_t;
+ /*
+ * When h_size was added for log v2 support, it caused the structure to
+ * have a different size on i386 vs all other architectures because the
+ * sum of the sizes of the members is not aligned to that of the largest
+ * __be64-sized member, and i386 has really odd struct alignment rules.
+ *
+ * Due to the way the log headers are placed out on-disk that alone is
+ * not a problem because the xlog_rec_header always sits alone in a
+ * BBSIZEs area, and the rest of that area is padded with zeroes.
+ * But xlog_cksum used to calculate the checksum based on the structure
+ * size, and thus gives different checksums for i386 vs the rest.
+ * We now do two checksum validation passes for both sizes to allow
+ * moving v5 file systems with unclean logs between i386 and other
+ * (little-endian) architectures.
+ */
+ __u32 h_pad0;
-/*
- * Quite misnamed, because this union lays out the actual on-disk log buffer.
- */
-typedef union xlog_in_core2 {
- xlog_rec_header_t hic_header;
- xlog_rec_ext_header_t hic_xheader;
- char hic_sector[XLOG_HEADER_SIZE];
-} xlog_in_core_2_t;
+ __u8 h_reserved[184];
+ struct xlog_rec_ext_header h_ext[];
+};
+
+#ifdef __i386__
+#define XLOG_REC_SIZE offsetofend(struct xlog_rec_header, h_size)
+#define XLOG_REC_SIZE_OTHER offsetofend(struct xlog_rec_header, h_pad0)
+#else
+#define XLOG_REC_SIZE offsetofend(struct xlog_rec_header, h_pad0)
+#define XLOG_REC_SIZE_OTHER offsetofend(struct xlog_rec_header, h_size)
+#endif /* __i386__ */
/* not an on-disk structure, but needed by log recovery in userspace */
-typedef struct xfs_log_iovec {
+struct xfs_log_iovec {
void *i_addr; /* beginning address of region */
int i_len; /* length in bytes of region */
uint i_type; /* type of region */
-} xfs_log_iovec_t;
-
+};
/*
* Transaction Header definitions.
@@ -213,12 +202,12 @@ typedef struct xfs_log_iovec {
* Do not change the below structure without redoing the code in
* xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans().
*/
-typedef struct xfs_trans_header {
+struct xfs_trans_header {
uint th_magic; /* magic number */
uint th_type; /* transaction type */
int32_t th_tid; /* transaction id (unused) */
uint th_num_items; /* num items logged by trans */
-} xfs_trans_header_t;
+};
#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */
@@ -542,7 +531,7 @@ struct xfs_log_dinode {
#define __XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD)
#define XFS_BLF_DATAMAP_SIZE (__XFS_BLF_DATAMAP_SIZE + 1)
-typedef struct xfs_buf_log_format {
+struct xfs_buf_log_format {
unsigned short blf_type; /* buf log item type indicator */
unsigned short blf_size; /* size of this item */
unsigned short blf_flags; /* misc state */
@@ -550,7 +539,7 @@ typedef struct xfs_buf_log_format {
int64_t blf_blkno; /* starting blkno of this buf */
unsigned int blf_map_size; /* used size of data bitmap in words */
unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */
-} xfs_buf_log_format_t;
+};
/*
* All buffers now need to tell recovery where the magic number
@@ -606,40 +595,41 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
/*
* EFI/EFD log format definitions
*/
-typedef struct xfs_extent {
+struct xfs_extent {
xfs_fsblock_t ext_start;
xfs_extlen_t ext_len;
-} xfs_extent_t;
+};
/*
- * Since an xfs_extent_t has types (start:64, len: 32)
- * there are different alignments on 32 bit and 64 bit kernels.
- * So we provide the different variants for use by a
- * conversion routine.
+ * Since the fields in struct xfs_extent add up to 96 bits, it has
+ * different alignments on i386 vs all other architectures, because i386
+ * does not pad structures to their natural alignment.
+ *
+ * Provide the different variants for use by a conversion routine.
*/
-typedef struct xfs_extent_32 {
+struct xfs_extent_32 {
uint64_t ext_start;
uint32_t ext_len;
-} __attribute__((packed)) xfs_extent_32_t;
+} __attribute__((packed));
-typedef struct xfs_extent_64 {
+struct xfs_extent_64 {
uint64_t ext_start;
uint32_t ext_len;
uint32_t ext_pad;
-} xfs_extent_64_t;
+};
/*
* This is the structure used to lay out an efi log item in the
* log. The efi_extents field is a variable size array whose
* size is given by efi_nextents.
*/
-typedef struct xfs_efi_log_format {
+struct xfs_efi_log_format {
uint16_t efi_type; /* efi log item type */
uint16_t efi_size; /* size of this item */
uint32_t efi_nextents; /* # extents to free */
uint64_t efi_id; /* efi identifier */
- xfs_extent_t efi_extents[]; /* array of extents to free */
-} xfs_efi_log_format_t;
+ struct xfs_extent efi_extents[]; /* array of extents to free */
+};
static inline size_t
xfs_efi_log_format_sizeof(
@@ -649,13 +639,13 @@ xfs_efi_log_format_sizeof(
nr * sizeof(struct xfs_extent);
}
-typedef struct xfs_efi_log_format_32 {
+struct xfs_efi_log_format_32 {
uint16_t efi_type; /* efi log item type */
uint16_t efi_size; /* size of this item */
uint32_t efi_nextents; /* # extents to free */
uint64_t efi_id; /* efi identifier */
- xfs_extent_32_t efi_extents[]; /* array of extents to free */
-} __attribute__((packed)) xfs_efi_log_format_32_t;
+ struct xfs_extent_32 efi_extents[]; /* array of extents to free */
+} __attribute__((packed));
static inline size_t
xfs_efi_log_format32_sizeof(
@@ -665,13 +655,13 @@ xfs_efi_log_format32_sizeof(
nr * sizeof(struct xfs_extent_32);
}
-typedef struct xfs_efi_log_format_64 {
+struct xfs_efi_log_format_64 {
uint16_t efi_type; /* efi log item type */
uint16_t efi_size; /* size of this item */
uint32_t efi_nextents; /* # extents to free */
uint64_t efi_id; /* efi identifier */
- xfs_extent_64_t efi_extents[]; /* array of extents to free */
-} xfs_efi_log_format_64_t;
+ struct xfs_extent_64 efi_extents[]; /* array of extents to free */
+};
static inline size_t
xfs_efi_log_format64_sizeof(
@@ -686,13 +676,13 @@ xfs_efi_log_format64_sizeof(
* log. The efd_extents array is a variable size array whose
* size is given by efd_nextents;
*/
-typedef struct xfs_efd_log_format {
+struct xfs_efd_log_format {
uint16_t efd_type; /* efd log item type */
uint16_t efd_size; /* size of this item */
uint32_t efd_nextents; /* # of extents freed */
uint64_t efd_efi_id; /* id of corresponding efi */
- xfs_extent_t efd_extents[]; /* array of extents freed */
-} xfs_efd_log_format_t;
+ struct xfs_extent efd_extents[]; /* array of extents freed */
+};
static inline size_t
xfs_efd_log_format_sizeof(
@@ -702,13 +692,13 @@ xfs_efd_log_format_sizeof(
nr * sizeof(struct xfs_extent);
}
-typedef struct xfs_efd_log_format_32 {
+struct xfs_efd_log_format_32 {
uint16_t efd_type; /* efd log item type */
uint16_t efd_size; /* size of this item */
uint32_t efd_nextents; /* # of extents freed */
uint64_t efd_efi_id; /* id of corresponding efi */
- xfs_extent_32_t efd_extents[]; /* array of extents freed */
-} __attribute__((packed)) xfs_efd_log_format_32_t;
+ struct xfs_extent_32 efd_extents[]; /* array of extents freed */
+} __attribute__((packed));
static inline size_t
xfs_efd_log_format32_sizeof(
@@ -718,13 +708,13 @@ xfs_efd_log_format32_sizeof(
nr * sizeof(struct xfs_extent_32);
}
-typedef struct xfs_efd_log_format_64 {
+struct xfs_efd_log_format_64 {
uint16_t efd_type; /* efd log item type */
uint16_t efd_size; /* size of this item */
uint32_t efd_nextents; /* # of extents freed */
uint64_t efd_efi_id; /* id of corresponding efi */
- xfs_extent_64_t efd_extents[]; /* array of extents freed */
-} xfs_efd_log_format_64_t;
+ struct xfs_extent_64 efd_extents[]; /* array of extents freed */
+};
static inline size_t
xfs_efd_log_format64_sizeof(
@@ -957,14 +947,14 @@ struct xfs_xmd_log_format {
* The first two fields must be the type and size fitting into
* 32 bits : log_recovery code assumes that.
*/
-typedef struct xfs_dq_logformat {
+struct xfs_dq_logformat {
uint16_t qlf_type; /* dquot log item type */
uint16_t qlf_size; /* size of this item */
xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */
int64_t qlf_blkno; /* blkno of dquot buffer */
int32_t qlf_len; /* len of dquot buffer */
uint32_t qlf_boffset; /* off of dquot in buffer */
-} xfs_dq_logformat_t;
+};
/*
* log format struct for QUOTAOFF records.
@@ -974,12 +964,12 @@ typedef struct xfs_dq_logformat {
* to the first and ensures that the first logitem is taken out of the AIL
* only when the last one is securely committed.
*/
-typedef struct xfs_qoff_logformat {
+struct xfs_qoff_logformat {
unsigned short qf_type; /* quotaoff log item type */
unsigned short qf_size; /* size of this item */
unsigned int qf_flags; /* USR and/or GRP */
char qf_pad[12]; /* padding for future */
-} xfs_qoff_logformat_t;
+};
/*
* Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 66c7916fb5cd..9e712e62369c 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -104,20 +104,20 @@ struct xlog_recover_item {
struct list_head ri_list;
int ri_cnt; /* count of regions found */
int ri_total; /* total regions */
- struct xfs_log_iovec *ri_buf; /* ptr to regions buffer */
+ struct kvec *ri_buf; /* ptr to regions buffer */
const struct xlog_recover_item_ops *ri_ops;
};
struct xlog_recover {
struct hlist_node r_list;
xlog_tid_t r_log_tid; /* log's transaction id */
- xfs_trans_header_t r_theader; /* trans header for partial */
+ struct xfs_trans_header r_theader; /* trans header for partial */
int r_state; /* not needed */
xfs_lsn_t r_lsn; /* xact lsn */
struct list_head r_itemq; /* q for items */
};
-#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr)
+#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].iov_base)
#define XLOG_RECOVER_CRCPASS 0
#define XLOG_RECOVER_PASS1 1
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
index d3bd6a86c8fe..34bba96d30ca 100644
--- a/fs/xfs/libxfs/xfs_log_rlimit.c
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -91,6 +91,7 @@ xfs_log_calc_trans_resv_for_minlogblocks(
*/
if (xfs_want_minlogsize_fixes(&mp->m_sb)) {
xfs_trans_resv_calc(mp, resv);
+ resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend;
return;
}
@@ -107,6 +108,9 @@ xfs_log_calc_trans_resv_for_minlogblocks(
xfs_trans_resv_calc(mp, resv);
+ /* Copy the dynamic transaction reservation types from the running fs */
+ resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend;
+
if (xfs_has_reflink(mp)) {
/*
* In the early days of reflink, typical log operation counts
diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c
index 225923e463c4..b02e3d6c0868 100644
--- a/fs/xfs/libxfs/xfs_metafile.c
+++ b/fs/xfs/libxfs/xfs_metafile.c
@@ -121,7 +121,7 @@ xfs_metafile_resv_critical(
div_u64(mp->m_metafile_resv_target, 10)))
return true;
- return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL);
+ return XFS_TEST_ERROR(mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL);
}
/* Allocate a block from the metadata file's reservation. */
diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index 5ed44fdf7491..2e9715cc1641 100644
--- a/fs/xfs/libxfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -174,7 +174,11 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_rud_log_format, 16);
XFS_CHECK_STRUCT_SIZE(struct xfs_map_extent, 32);
XFS_CHECK_STRUCT_SIZE(struct xfs_phys_extent, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xlog_rec_header, 512);
+ XFS_CHECK_STRUCT_SIZE(struct xlog_rec_ext_header, 512);
+ XFS_CHECK_OFFSET(struct xlog_rec_header, h_reserved, 328);
+ XFS_CHECK_OFFSET(struct xlog_rec_ext_header, xh_reserved, 260);
XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16);
XFS_CHECK_OFFSET(struct xfs_cui_log_format, cui_extents, 16);
XFS_CHECK_OFFSET(struct xfs_rui_log_format, rui_extents, 16);
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index 763d941a8420..551d7ae46c5c 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -29,11 +29,9 @@ typedef uint8_t xfs_dqtype_t;
* flags for q_flags field in the dquot.
*/
#define XFS_DQFLAG_DIRTY (1u << 0) /* dquot is dirty */
-#define XFS_DQFLAG_FREEING (1u << 1) /* dquot is being torn down */
#define XFS_DQFLAG_STRINGS \
- { XFS_DQFLAG_DIRTY, "DIRTY" }, \
- { XFS_DQFLAG_FREEING, "FREEING" }
+ { XFS_DQFLAG_DIRTY, "DIRTY" }
/*
* We have the possibility of all three quota types being active at once, and
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index cebe83f7842a..2484dc9f6d7e 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -1113,8 +1113,7 @@ xfs_refcount_still_have_space(
* refcount continue update "error" has been injected.
*/
if (cur->bc_refc.nr_ops > 2 &&
- XFS_TEST_ERROR(false, cur->bc_mp,
- XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE))
+ XFS_TEST_ERROR(cur->bc_mp, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE))
return false;
if (cur->bc_refc.nr_ops == 0)
@@ -1398,7 +1397,7 @@ xfs_refcount_finish_one(
trace_xfs_refcount_deferred(mp, ri);
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
return -EIO;
/*
@@ -1511,7 +1510,7 @@ xfs_rtrefcount_finish_one(
trace_xfs_refcount_deferred(mp, ri);
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE))
return -EIO;
/*
@@ -2099,9 +2098,7 @@ xfs_refcount_recover_cow_leftovers(
* recording the CoW debris we cancel the (empty) transaction
* and everything goes away cleanly.
*/
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
+ tp = xfs_trans_alloc_empty(mp);
if (isrt) {
xfs_rtgroup_lock(to_rtg(xg), XFS_RTGLOCK_REFCOUNT);
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 54505fee1852..06da3ca14727 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -174,8 +174,8 @@ xfs_refcountbt_init_ptr_from_cur(
ptr->s = agf->agf_refcount_root;
}
-STATIC int64_t
-xfs_refcountbt_key_diff(
+STATIC int
+xfs_refcountbt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
@@ -185,11 +185,11 @@ xfs_refcountbt_key_diff(
start = xfs_refcount_encode_startblock(irec->rc_startblock,
irec->rc_domain);
- return (int64_t)be32_to_cpu(kp->rc_startblock) - start;
+ return cmp_int(be32_to_cpu(kp->rc_startblock), start);
}
-STATIC int64_t
-xfs_refcountbt_diff_two_keys(
+STATIC int
+xfs_refcountbt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
@@ -197,8 +197,8 @@ xfs_refcountbt_diff_two_keys(
{
ASSERT(!mask || mask->refc.rc_startblock);
- return (int64_t)be32_to_cpu(k1->refc.rc_startblock) -
- be32_to_cpu(k2->refc.rc_startblock);
+ return cmp_int(be32_to_cpu(k1->refc.rc_startblock),
+ be32_to_cpu(k2->refc.rc_startblock));
}
STATIC xfs_failaddr_t
@@ -339,9 +339,9 @@ const struct xfs_btree_ops xfs_refcountbt_ops = {
.init_high_key_from_rec = xfs_refcountbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_refcountbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_refcountbt_init_ptr_from_cur,
- .key_diff = xfs_refcountbt_key_diff,
+ .cmp_key_with_cur = xfs_refcountbt_cmp_key_with_cur,
.buf_ops = &xfs_refcountbt_buf_ops,
- .diff_two_keys = xfs_refcountbt_diff_two_keys,
+ .cmp_two_keys = xfs_refcountbt_cmp_two_keys,
.keys_inorder = xfs_refcountbt_keys_inorder,
.recs_inorder = xfs_refcountbt_recs_inorder,
.keys_contiguous = xfs_refcountbt_keys_contiguous,
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 3cdf50563fec..83e0488ff773 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -2690,7 +2690,7 @@ xfs_rmap_finish_one(
trace_xfs_rmap_deferred(mp, ri);
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE))
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_RMAP_FINISH_ONE))
return -EIO;
/*
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 2cab694ac58a..bf16aee50d73 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -243,38 +243,22 @@ static inline uint64_t offset_keymask(uint64_t offset)
return offset & ~XFS_RMAP_OFF_UNWRITTEN;
}
-STATIC int64_t
-xfs_rmapbt_key_diff(
+STATIC int
+xfs_rmapbt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
struct xfs_rmap_irec *rec = &cur->bc_rec.r;
const struct xfs_rmap_key *kp = &key->rmap;
- __u64 x, y;
- int64_t d;
- d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
- if (d)
- return d;
-
- x = be64_to_cpu(kp->rm_owner);
- y = rec->rm_owner;
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
-
- x = offset_keymask(be64_to_cpu(kp->rm_offset));
- y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
- return 0;
+ return cmp_int(be32_to_cpu(kp->rm_startblock), rec->rm_startblock) ?:
+ cmp_int(be64_to_cpu(kp->rm_owner), rec->rm_owner) ?:
+ cmp_int(offset_keymask(be64_to_cpu(kp->rm_offset)),
+ offset_keymask(xfs_rmap_irec_offset_pack(rec)));
}
-STATIC int64_t
-xfs_rmapbt_diff_two_keys(
+STATIC int
+xfs_rmapbt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
@@ -282,36 +266,31 @@ xfs_rmapbt_diff_two_keys(
{
const struct xfs_rmap_key *kp1 = &k1->rmap;
const struct xfs_rmap_key *kp2 = &k2->rmap;
- int64_t d;
- __u64 x, y;
+ int d;
/* Doesn't make sense to mask off the physical space part */
ASSERT(!mask || mask->rmap.rm_startblock);
- d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
- be32_to_cpu(kp2->rm_startblock);
+ d = cmp_int(be32_to_cpu(kp1->rm_startblock),
+ be32_to_cpu(kp2->rm_startblock));
if (d)
return d;
if (!mask || mask->rmap.rm_owner) {
- x = be64_to_cpu(kp1->rm_owner);
- y = be64_to_cpu(kp2->rm_owner);
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
+ d = cmp_int(be64_to_cpu(kp1->rm_owner),
+ be64_to_cpu(kp2->rm_owner));
+ if (d)
+ return d;
}
if (!mask || mask->rmap.rm_offset) {
/* Doesn't make sense to allow offset but not owner */
ASSERT(!mask || mask->rmap.rm_owner);
- x = offset_keymask(be64_to_cpu(kp1->rm_offset));
- y = offset_keymask(be64_to_cpu(kp2->rm_offset));
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
+ d = cmp_int(offset_keymask(be64_to_cpu(kp1->rm_offset)),
+ offset_keymask(be64_to_cpu(kp2->rm_offset)));
+ if (d)
+ return d;
}
return 0;
@@ -515,9 +494,9 @@ const struct xfs_btree_ops xfs_rmapbt_ops = {
.init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_rmapbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur,
- .key_diff = xfs_rmapbt_key_diff,
+ .cmp_key_with_cur = xfs_rmapbt_cmp_key_with_cur,
.buf_ops = &xfs_rmapbt_buf_ops,
- .diff_two_keys = xfs_rmapbt_diff_two_keys,
+ .cmp_two_keys = xfs_rmapbt_cmp_two_keys,
.keys_inorder = xfs_rmapbt_keys_inorder,
.recs_inorder = xfs_rmapbt_recs_inorder,
.keys_contiguous = xfs_rmapbt_keys_contiguous,
@@ -632,9 +611,9 @@ const struct xfs_btree_ops xfs_rmapbt_mem_ops = {
.init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_rmapbt_init_rec_from_cur,
.init_ptr_from_cur = xfbtree_init_ptr_from_cur,
- .key_diff = xfs_rmapbt_key_diff,
+ .cmp_key_with_cur = xfs_rmapbt_cmp_key_with_cur,
.buf_ops = &xfs_rmapbt_mem_buf_ops,
- .diff_two_keys = xfs_rmapbt_diff_two_keys,
+ .cmp_two_keys = xfs_rmapbt_cmp_two_keys,
.keys_inorder = xfs_rmapbt_keys_inorder,
.recs_inorder = xfs_rmapbt_recs_inorder,
.keys_contiguous = xfs_rmapbt_keys_contiguous,
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 5057536e586c..618061d898d4 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1067,7 +1067,7 @@ xfs_rtfree_extent(
ASSERT(rbmip->i_itemp != NULL);
xfs_assert_ilocked(rbmip, XFS_ILOCK_EXCL);
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT))
+ if (XFS_TEST_ERROR(mp, XFS_ERRTAG_FREE_EXTENT))
return -EIO;
error = xfs_rtcheck_alloc_range(&args, start, len);
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index d36a6ae0abe5..03f1e2493334 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -50,6 +50,12 @@ struct xfs_rtgroup {
uint8_t *rtg_rsum_cache;
struct xfs_open_zone *rtg_open_zone;
};
+
+ /*
+ * Count of outstanding GC operations for zoned XFS. Any RTG with a
+ * non-zero rtg_gccount will not be picked as a new GC victim.
+ */
+ atomic_t rtg_gccount;
};
/*
@@ -58,12 +64,6 @@ struct xfs_rtgroup {
*/
#define XFS_RTG_FREE XA_MARK_0
-/*
- * For zoned RT devices this is set on groups that are fully written and that
- * have unused blocks. Used by the garbage collection to pick targets.
- */
-#define XFS_RTG_RECLAIMABLE XA_MARK_1
-
static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg)
{
return container_of(xg, struct xfs_rtgroup, rtg_group);
@@ -365,4 +365,12 @@ static inline int xfs_initialize_rtgroups(struct xfs_mount *mp,
# define xfs_rtgroup_get_geometry(rtg, rgeo) (-EOPNOTSUPP)
#endif /* CONFIG_XFS_RT */
+static inline xfs_rfsblock_t
+xfs_rtgs_to_rfsbs(
+ struct xfs_mount *mp,
+ uint32_t nr_groups)
+{
+ return xfs_groups_to_rfsbs(mp, nr_groups, XG_TYPE_RTG);
+}
+
#endif /* __LIBXFS_RTGROUP_H */
diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
index 3db5e7a4a945..ac11e94b42ae 100644
--- a/fs/xfs/libxfs/xfs_rtrefcount_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
@@ -156,8 +156,8 @@ xfs_rtrefcountbt_init_ptr_from_cur(
ptr->l = 0;
}
-STATIC int64_t
-xfs_rtrefcountbt_key_diff(
+STATIC int
+xfs_rtrefcountbt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
@@ -167,11 +167,11 @@ xfs_rtrefcountbt_key_diff(
start = xfs_refcount_encode_startblock(irec->rc_startblock,
irec->rc_domain);
- return (int64_t)be32_to_cpu(kp->rc_startblock) - start;
+ return cmp_int(be32_to_cpu(kp->rc_startblock), start);
}
-STATIC int64_t
-xfs_rtrefcountbt_diff_two_keys(
+STATIC int
+xfs_rtrefcountbt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
@@ -179,8 +179,8 @@ xfs_rtrefcountbt_diff_two_keys(
{
ASSERT(!mask || mask->refc.rc_startblock);
- return (int64_t)be32_to_cpu(k1->refc.rc_startblock) -
- be32_to_cpu(k2->refc.rc_startblock);
+ return cmp_int(be32_to_cpu(k1->refc.rc_startblock),
+ be32_to_cpu(k2->refc.rc_startblock));
}
static xfs_failaddr_t
@@ -387,9 +387,9 @@ const struct xfs_btree_ops xfs_rtrefcountbt_ops = {
.init_high_key_from_rec = xfs_rtrefcountbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_rtrefcountbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_rtrefcountbt_init_ptr_from_cur,
- .key_diff = xfs_rtrefcountbt_key_diff,
+ .cmp_key_with_cur = xfs_rtrefcountbt_cmp_key_with_cur,
.buf_ops = &xfs_rtrefcountbt_buf_ops,
- .diff_two_keys = xfs_rtrefcountbt_diff_two_keys,
+ .cmp_two_keys = xfs_rtrefcountbt_cmp_two_keys,
.keys_inorder = xfs_rtrefcountbt_keys_inorder,
.recs_inorder = xfs_rtrefcountbt_recs_inorder,
.keys_contiguous = xfs_rtrefcountbt_keys_contiguous,
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c
index 9bdc2cbfc113..55f903165769 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c
@@ -185,38 +185,22 @@ static inline uint64_t offset_keymask(uint64_t offset)
return offset & ~XFS_RMAP_OFF_UNWRITTEN;
}
-STATIC int64_t
-xfs_rtrmapbt_key_diff(
+STATIC int
+xfs_rtrmapbt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
struct xfs_rmap_irec *rec = &cur->bc_rec.r;
const struct xfs_rmap_key *kp = &key->rmap;
- __u64 x, y;
- int64_t d;
- d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
- if (d)
- return d;
-
- x = be64_to_cpu(kp->rm_owner);
- y = rec->rm_owner;
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
-
- x = offset_keymask(be64_to_cpu(kp->rm_offset));
- y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
- return 0;
+ return cmp_int(be32_to_cpu(kp->rm_startblock), rec->rm_startblock) ?:
+ cmp_int(be64_to_cpu(kp->rm_owner), rec->rm_owner) ?:
+ cmp_int(offset_keymask(be64_to_cpu(kp->rm_offset)),
+ offset_keymask(xfs_rmap_irec_offset_pack(rec)));
}
-STATIC int64_t
-xfs_rtrmapbt_diff_two_keys(
+STATIC int
+xfs_rtrmapbt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
@@ -224,36 +208,31 @@ xfs_rtrmapbt_diff_two_keys(
{
const struct xfs_rmap_key *kp1 = &k1->rmap;
const struct xfs_rmap_key *kp2 = &k2->rmap;
- int64_t d;
- __u64 x, y;
+ int d;
/* Doesn't make sense to mask off the physical space part */
ASSERT(!mask || mask->rmap.rm_startblock);
- d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
- be32_to_cpu(kp2->rm_startblock);
+ d = cmp_int(be32_to_cpu(kp1->rm_startblock),
+ be32_to_cpu(kp2->rm_startblock));
if (d)
return d;
if (!mask || mask->rmap.rm_owner) {
- x = be64_to_cpu(kp1->rm_owner);
- y = be64_to_cpu(kp2->rm_owner);
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
+ d = cmp_int(be64_to_cpu(kp1->rm_owner),
+ be64_to_cpu(kp2->rm_owner));
+ if (d)
+ return d;
}
if (!mask || mask->rmap.rm_offset) {
/* Doesn't make sense to allow offset but not owner */
ASSERT(!mask || mask->rmap.rm_owner);
- x = offset_keymask(be64_to_cpu(kp1->rm_offset));
- y = offset_keymask(be64_to_cpu(kp2->rm_offset));
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
+ d = cmp_int(offset_keymask(be64_to_cpu(kp1->rm_offset)),
+ offset_keymask(be64_to_cpu(kp2->rm_offset)));
+ if (d)
+ return d;
}
return 0;
@@ -511,9 +490,9 @@ const struct xfs_btree_ops xfs_rtrmapbt_ops = {
.init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_rtrmapbt_init_ptr_from_cur,
- .key_diff = xfs_rtrmapbt_key_diff,
+ .cmp_key_with_cur = xfs_rtrmapbt_cmp_key_with_cur,
.buf_ops = &xfs_rtrmapbt_buf_ops,
- .diff_two_keys = xfs_rtrmapbt_diff_two_keys,
+ .cmp_two_keys = xfs_rtrmapbt_cmp_two_keys,
.keys_inorder = xfs_rtrmapbt_keys_inorder,
.recs_inorder = xfs_rtrmapbt_recs_inorder,
.keys_contiguous = xfs_rtrmapbt_keys_contiguous,
@@ -620,9 +599,9 @@ const struct xfs_btree_ops xfs_rtrmapbt_mem_ops = {
.init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur,
.init_ptr_from_cur = xfbtree_init_ptr_from_cur,
- .key_diff = xfs_rtrmapbt_key_diff,
+ .cmp_key_with_cur = xfs_rtrmapbt_cmp_key_with_cur,
.buf_ops = &xfs_rtrmapbt_mem_buf_ops,
- .diff_two_keys = xfs_rtrmapbt_diff_two_keys,
+ .cmp_two_keys = xfs_rtrmapbt_cmp_two_keys,
.keys_inorder = xfs_rtrmapbt_keys_inorder,
.recs_inorder = xfs_rtrmapbt_recs_inorder,
.keys_contiguous = xfs_rtrmapbt_keys_contiguous,
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 711e180f9ebb..94c272a2ae26 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -142,8 +142,6 @@ xfs_sb_version_to_features(
if (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) {
if (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)
features |= XFS_FEAT_LAZYSBCOUNT;
- if (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)
- features |= XFS_FEAT_ATTR2;
if (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)
features |= XFS_FEAT_PROJID32;
if (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)
@@ -155,7 +153,7 @@ xfs_sb_version_to_features(
/* Always on V5 features */
features |= XFS_FEAT_ALIGN | XFS_FEAT_LOGV2 | XFS_FEAT_EXTFLG |
- XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_ATTR2 | XFS_FEAT_PROJID32 |
+ XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_PROJID32 |
XFS_FEAT_V3INODES | XFS_FEAT_CRC | XFS_FEAT_PQUOTINO;
/* Optional V5 features */
@@ -303,6 +301,21 @@ xfs_validate_rt_geometry(
sbp->sb_rbmblocks != xfs_expected_rbmblocks(sbp))
return false;
+ if (xfs_sb_is_v5(sbp) &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) {
+ uint32_t mod;
+
+ /*
+ * Zoned RT devices must be aligned to the RT group size,
+ * because garbage collection assumes that all zones have the
+ * same size to avoid insane complexity if that weren't the
+ * case.
+ */
+ div_u64_rem(sbp->sb_rextents, sbp->sb_rgextents, &mod);
+ if (mod)
+ return false;
+ }
+
return true;
}
@@ -1524,7 +1537,8 @@ xfs_fs_geometry(
geo->version = XFS_FSOP_GEOM_VERSION;
geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK |
XFS_FSOP_GEOM_FLAGS_DIRV2 |
- XFS_FSOP_GEOM_FLAGS_EXTFLG;
+ XFS_FSOP_GEOM_FLAGS_EXTFLG |
+ XFS_FSOP_GEOM_FLAGS_ATTR2;
if (xfs_has_attr(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR;
if (xfs_has_quota(mp))
@@ -1537,8 +1551,6 @@ xfs_fs_geometry(
geo->flags |= XFS_FSOP_GEOM_FLAGS_DIRV2CI;
if (xfs_has_lazysbcount(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_LAZYSB;
- if (xfs_has_attr2(mp))
- geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR2;
if (xfs_has_projid32(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_PROJID32;
if (xfs_has_crc(mp))
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 13d00c7166e1..86a111d0f2fc 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -22,6 +22,12 @@
#include "xfs_rtbitmap.h"
#include "xfs_attr_item.h"
#include "xfs_log.h"
+#include "xfs_defer.h"
+#include "xfs_bmap_item.h"
+#include "xfs_extfree_item.h"
+#include "xfs_rmap_item.h"
+#include "xfs_refcount_item.h"
+#include "xfs_trace.h"
#define _ALLOC true
#define _FREE false
@@ -264,6 +270,42 @@ xfs_rtalloc_block_count(
*/
/*
+ * Finishing a data device refcount update (t1):
+ * the agfs of the ags containing the blocks: nr_ops * sector size
+ * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
+ */
+inline unsigned int
+xfs_calc_finish_cui_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr_ops)
+{
+ if (!xfs_has_reflink(mp))
+ return 0;
+
+ return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops),
+ mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Finishing a realtime device refcount update (t2):
+ * the rt refcount inode
+ * the rtrefcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
+ */
+inline unsigned int
+xfs_calc_finish_rt_cui_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr_ops)
+{
+ if (!xfs_has_rtreflink(mp))
+ return 0;
+
+ return xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops),
+ mp->m_sb.sb_blocksize);
+}
+
+/*
* Compute the log reservation required to handle the refcount update
* transaction. Refcount updates are always done via deferred log items.
*
@@ -280,19 +322,10 @@ xfs_calc_refcountbt_reservation(
struct xfs_mount *mp,
unsigned int nr_ops)
{
- unsigned int blksz = XFS_FSB_TO_B(mp, 1);
- unsigned int t1, t2 = 0;
+ unsigned int t1, t2;
- if (!xfs_has_reflink(mp))
- return 0;
-
- t1 = xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz);
-
- if (xfs_has_realtime(mp))
- t2 = xfs_calc_inode_res(mp, 1) +
- xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops),
- blksz);
+ t1 = xfs_calc_finish_cui_reservation(mp, nr_ops);
+ t2 = xfs_calc_finish_rt_cui_reservation(mp, nr_ops);
return max(t1, t2);
}
@@ -380,6 +413,96 @@ xfs_calc_write_reservation_minlogsize(
}
/*
+ * Finishing an EFI can free the blocks and bmap blocks (t2):
+ * the agf for each of the ags: nr * sector size
+ * the agfl for each of the ags: nr * sector size
+ * the super block to reflect the freed blocks: sector size
+ * worst case split in allocation btrees per extent assuming nr extents:
+ * nr exts * 2 trees * (2 * max depth - 1) * block size
+ */
+inline unsigned int
+xfs_calc_finish_efi_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr)
+{
+ return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr),
+ mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Or, if it's a realtime file (t3), assuming nr extents are being freed:
+ * the agf for each of the ags: nr * sector size
+ * the agfl for each of the ags: nr * sector size
+ * the super block to reflect the freed blocks: sector size
+ * the realtime bitmap:
+ * nr exts * ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes
+ * the realtime summary: nr exts * 1 block
+ * worst case split in allocation btrees per extent assuming nr extents:
+ * nr exts * 2 trees * (2 * max depth - 1) * block size
+ */
+inline unsigned int
+xfs_calc_finish_rt_efi_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr)
+{
+ if (!xfs_has_realtime(mp))
+ return 0;
+
+ return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_rtalloc_block_count(mp, nr),
+ mp->m_sb.sb_blocksize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr),
+ mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Finishing an RUI is the same as an EFI. We can split the rmap btree twice
+ * on each end of the record, and that can cause the AGFL to be refilled or
+ * emptied out.
+ */
+inline unsigned int
+xfs_calc_finish_rui_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr)
+{
+ if (!xfs_has_rmapbt(mp))
+ return 0;
+ return xfs_calc_finish_efi_reservation(mp, nr);
+}
+
+/*
+ * Finishing a realtime RUI is the same as a realtime EFI. We can split the
+ * realtime rmap btree twice on each end of the record, and that can cause
+ * the AGFL to be refilled or emptied out.
+ */
+inline unsigned int
+xfs_calc_finish_rt_rui_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr)
+{
+ if (!xfs_has_rtrmapbt(mp))
+ return 0;
+ return xfs_calc_finish_rt_efi_reservation(mp, nr);
+}
+
+/*
+ * In finishing a BUI, we can modify:
+ * the inode being truncated: inode size
+ * dquots
+ * the inode's bmap btree: (max depth + 1) * block size
+ */
+inline unsigned int
+xfs_calc_finish_bui_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr)
+{
+ return xfs_calc_inode_res(mp, 1) + XFS_DQUOT_LOGRES +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
+ mp->m_sb.sb_blocksize);
+}
+
+/*
* In truncating a file we free up to two extents at once. We can modify (t1):
* the inode being truncated: inode size
* the inode's bmap btree: (max depth + 1) * block size
@@ -411,16 +534,8 @@ xfs_calc_itruncate_reservation(
t1 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz);
- t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz);
-
- if (xfs_has_realtime(mp)) {
- t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
- } else {
- t3 = 0;
- }
+ t2 = xfs_calc_finish_efi_reservation(mp, 4);
+ t3 = xfs_calc_finish_rt_efi_reservation(mp, 2);
/*
* In the early days of reflink, we included enough reservation to log
@@ -501,9 +616,7 @@ xfs_calc_rename_reservation(
xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1));
- t2 = xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3),
- XFS_FSB_TO_B(mp, 1));
+ t2 = xfs_calc_finish_efi_reservation(mp, 3);
if (xfs_has_parent(mp)) {
unsigned int rename_overhead, exchange_overhead;
@@ -611,9 +724,7 @@ xfs_calc_link_reservation(
overhead += xfs_calc_iunlink_remove_reservation(mp);
t1 = xfs_calc_inode_res(mp, 2) +
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
- t2 = xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
- XFS_FSB_TO_B(mp, 1));
+ t2 = xfs_calc_finish_efi_reservation(mp, 1);
if (xfs_has_parent(mp)) {
t3 = resp->tr_attrsetm.tr_logres;
@@ -676,9 +787,7 @@ xfs_calc_remove_reservation(
t1 = xfs_calc_inode_res(mp, 2) +
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
- t2 = xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
- XFS_FSB_TO_B(mp, 1));
+ t2 = xfs_calc_finish_efi_reservation(mp, 2);
if (xfs_has_parent(mp)) {
t3 = resp->tr_attrrm.tr_logres;
@@ -1181,6 +1290,15 @@ xfs_calc_namespace_reservations(
resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
}
+STATIC void
+xfs_calc_default_atomic_ioend_reservation(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resp)
+{
+ /* Pick a default that will scale reasonably for the log size. */
+ resp->tr_atomic_ioend = resp->tr_itruncate;
+}
+
void
xfs_trans_resv_calc(
struct xfs_mount *mp,
@@ -1275,4 +1393,167 @@ xfs_trans_resv_calc(
resp->tr_itruncate.tr_logcount += logcount_adj;
resp->tr_write.tr_logcount += logcount_adj;
resp->tr_qm_dqalloc.tr_logcount += logcount_adj;
+
+ /*
+ * Now that we've finished computing the static reservations, we can
+ * compute the dynamic reservation for atomic writes.
+ */
+ xfs_calc_default_atomic_ioend_reservation(mp, resp);
+}
+
+/*
+ * Return the per-extent and fixed transaction reservation sizes needed to
+ * complete an atomic write.
+ */
+STATIC unsigned int
+xfs_calc_atomic_write_ioend_geometry(
+ struct xfs_mount *mp,
+ unsigned int *step_size)
+{
+ const unsigned int efi = xfs_efi_log_space(1);
+ const unsigned int efd = xfs_efd_log_space(1);
+ const unsigned int rui = xfs_rui_log_space(1);
+ const unsigned int rud = xfs_rud_log_space();
+ const unsigned int cui = xfs_cui_log_space(1);
+ const unsigned int cud = xfs_cud_log_space();
+ const unsigned int bui = xfs_bui_log_space(1);
+ const unsigned int bud = xfs_bud_log_space();
+
+ /*
+ * Maximum overhead to complete an atomic write ioend in software:
+ * remove data fork extent + remove cow fork extent + map extent into
+ * data fork.
+ *
+ * tx0: Creates a BUI and a CUI and that's all it needs.
+ *
+ * tx1: Roll to finish the BUI. Need space for the BUD, an RUI, and
+ * enough space to relog the CUI (== CUI + CUD).
+ *
+ * tx2: Roll again to finish the RUI. Need space for the RUD and space
+ * to relog the CUI.
+ *
+ * tx3: Roll again, need space for the CUD and possibly a new EFI.
+ *
+ * tx4: Roll again, need space for an EFD.
+ *
+ * If the extent referenced by the pair of BUI/CUI items is not the one
+ * being currently processed, then we need to reserve space to relog
+ * both items.
+ */
+ const unsigned int tx0 = bui + cui;
+ const unsigned int tx1 = bud + rui + cui + cud;
+ const unsigned int tx2 = rud + cui + cud;
+ const unsigned int tx3 = cud + efi;
+ const unsigned int tx4 = efd;
+ const unsigned int relog = bui + bud + cui + cud;
+
+ const unsigned int per_intent = max(max3(tx0, tx1, tx2),
+ max3(tx3, tx4, relog));
+
+ /* Overhead to finish one step of each intent item type */
+ const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1);
+ const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1);
+ const unsigned int f3 = xfs_calc_finish_cui_reservation(mp, 1);
+ const unsigned int f4 = xfs_calc_finish_bui_reservation(mp, 1);
+
+ /* We only finish one item per transaction in a chain */
+ *step_size = max(f4, max3(f1, f2, f3));
+
+ return per_intent;
+}
+
+/*
+ * Compute the maximum size (in fsblocks) of atomic writes that we can complete
+ * given the existing log reservations.
+ */
+xfs_extlen_t
+xfs_calc_max_atomic_write_fsblocks(
+ struct xfs_mount *mp)
+{
+ const struct xfs_trans_res *resv = &M_RES(mp)->tr_atomic_ioend;
+ unsigned int per_intent = 0;
+ unsigned int step_size = 0;
+ unsigned int ret = 0;
+
+ if (resv->tr_logres > 0) {
+ per_intent = xfs_calc_atomic_write_ioend_geometry(mp,
+ &step_size);
+
+ if (resv->tr_logres >= step_size)
+ ret = (resv->tr_logres - step_size) / per_intent;
+ }
+
+ trace_xfs_calc_max_atomic_write_fsblocks(mp, per_intent, step_size,
+ resv->tr_logres, ret);
+
+ return ret;
+}
+
+/*
+ * Compute the log blocks and transaction reservation needed to complete an
+ * atomic write of a given number of blocks. Worst case, each block requires
+ * separate handling. A return value of 0 means something went wrong.
+ */
+xfs_extlen_t
+xfs_calc_atomic_write_log_geometry(
+ struct xfs_mount *mp,
+ xfs_extlen_t blockcount,
+ unsigned int *new_logres)
+{
+ struct xfs_trans_res *curr_res = &M_RES(mp)->tr_atomic_ioend;
+ uint old_logres = curr_res->tr_logres;
+ unsigned int per_intent, step_size;
+ unsigned int logres;
+ xfs_extlen_t min_logblocks;
+
+ ASSERT(blockcount > 0);
+
+ xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp));
+
+ per_intent = xfs_calc_atomic_write_ioend_geometry(mp, &step_size);
+
+ /* Check for overflows */
+ if (check_mul_overflow(blockcount, per_intent, &logres) ||
+ check_add_overflow(logres, step_size, &logres))
+ return 0;
+
+ curr_res->tr_logres = logres;
+ min_logblocks = xfs_log_calc_minimum_size(mp);
+ curr_res->tr_logres = old_logres;
+
+ trace_xfs_calc_max_atomic_write_log_geometry(mp, per_intent, step_size,
+ blockcount, min_logblocks, logres);
+
+ *new_logres = logres;
+ return min_logblocks;
+}
+
+/*
+ * Compute the transaction reservation needed to complete an out of place
+ * atomic write of a given number of blocks.
+ */
+int
+xfs_calc_atomic_write_reservation(
+ struct xfs_mount *mp,
+ xfs_extlen_t blockcount)
+{
+ unsigned int new_logres;
+ xfs_extlen_t min_logblocks;
+
+ /*
+ * If the caller doesn't ask for a specific atomic write size, then
+ * use the defaults.
+ */
+ if (blockcount == 0) {
+ xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp));
+ return 0;
+ }
+
+ min_logblocks = xfs_calc_atomic_write_log_geometry(mp, blockcount,
+ &new_logres);
+ if (!min_logblocks || min_logblocks > mp->m_sb.sb_logblocks)
+ return -EINVAL;
+
+ M_RES(mp)->tr_atomic_ioend.tr_logres = new_logres;
+ return 0;
}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 0554b9d775d2..336279e0fc61 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -48,6 +48,7 @@ struct xfs_trans_resv {
struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */
struct xfs_trans_res tr_sb; /* modify superblock */
struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */
+ struct xfs_trans_res tr_atomic_ioend; /* untorn write completion */
};
/* shorthand way of accessing reservation structure */
@@ -98,8 +99,32 @@ struct xfs_trans_resv {
void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops);
+unsigned int xfs_calc_finish_bui_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+
+unsigned int xfs_calc_finish_efi_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+unsigned int xfs_calc_finish_rt_efi_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+
+unsigned int xfs_calc_finish_rui_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+unsigned int xfs_calc_finish_rt_rui_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+
+unsigned int xfs_calc_finish_cui_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+unsigned int xfs_calc_finish_rt_cui_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+
unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp);
unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp);
unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp);
+xfs_extlen_t xfs_calc_max_atomic_write_fsblocks(struct xfs_mount *mp);
+xfs_extlen_t xfs_calc_atomic_write_log_geometry(struct xfs_mount *mp,
+ xfs_extlen_t blockcount, unsigned int *new_logres);
+int xfs_calc_atomic_write_reservation(struct xfs_mount *mp,
+ xfs_extlen_t blockcount);
+
#endif /* __XFS_TRANS_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_zones.c b/fs/xfs/libxfs/xfs_zones.c
index b0791a71931c..b40f71f878b5 100644
--- a/fs/xfs/libxfs/xfs_zones.c
+++ b/fs/xfs/libxfs/xfs_zones.c
@@ -95,6 +95,7 @@ xfs_zone_validate_seq(
case BLK_ZONE_COND_IMP_OPEN:
case BLK_ZONE_COND_EXP_OPEN:
case BLK_ZONE_COND_CLOSED:
+ case BLK_ZONE_COND_ACTIVE:
return xfs_zone_validate_wp(zone, rtg, write_pointer);
case BLK_ZONE_COND_FULL:
return xfs_zone_validate_full(zone, rtg, write_pointer);
diff --git a/fs/xfs/libxfs/xfs_zones.h b/fs/xfs/libxfs/xfs_zones.h
index c4f1367b2cca..5fefd132e002 100644
--- a/fs/xfs/libxfs/xfs_zones.h
+++ b/fs/xfs/libxfs/xfs_zones.h
@@ -29,6 +29,13 @@ struct xfs_rtgroup;
#define XFS_OPEN_GC_ZONES 1U
#define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U)
+/*
+ * For zoned devices that do not have a limit on the number of open zones, and
+ * for regular devices using the zoned allocator, use the most common SMR disks
+ * limit (128) as the default limit on the number of open zones.
+ */
+#define XFS_DEFAULT_MAX_OPEN_ZONES 128
+
bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg,
xfs_rgblock_t *write_pointer);
diff --git a/fs/xfs/scrub/attr_repair.c b/fs/xfs/scrub/attr_repair.c
index c7eb94069caf..09d63aa10314 100644
--- a/fs/xfs/scrub/attr_repair.c
+++ b/fs/xfs/scrub/attr_repair.c
@@ -333,7 +333,6 @@ xrep_xattr_salvage_remote_attr(
.attr_filter = ent->flags & XFS_ATTR_NSP_ONDISK_MASK,
.namelen = rentry->namelen,
.name = rentry->name,
- .value = ab->value,
.valuelen = be32_to_cpu(rentry->valuelen),
};
unsigned int namesize;
@@ -363,6 +362,7 @@ xrep_xattr_salvage_remote_attr(
error = -EDEADLOCK;
if (error)
return error;
+ args.value = ab->value;
/* Look up the remote value and stash it for reconstruction. */
error = xfs_attr3_leaf_getvalue(leaf_bp, &args);
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index fe678a0438bc..cd6f0ff382a7 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -306,7 +306,7 @@ xchk_btree_block_check_sibling(
if (pbp)
xchk_buffer_recheck(bs->sc, pbp);
- if (xfs_btree_diff_two_ptrs(cur, pp, sibling))
+ if (xfs_btree_cmp_two_ptrs(cur, pp, sibling))
xchk_btree_set_corrupt(bs->sc, cur, level);
out:
xfs_btree_del_cursor(ncur, XFS_BTREE_ERROR);
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 28ad341df8ee..7bfa37c99480 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -866,11 +866,11 @@ xchk_trans_cancel(
sc->tp = NULL;
}
-int
+void
xchk_trans_alloc_empty(
struct xfs_scrub *sc)
{
- return xfs_trans_alloc_empty(sc->mp, &sc->tp);
+ sc->tp = xfs_trans_alloc_empty(sc->mp);
}
/*
@@ -892,7 +892,8 @@ xchk_trans_alloc(
return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
resblks, 0, 0, &sc->tp);
- return xchk_trans_alloc_empty(sc);
+ xchk_trans_alloc_empty(sc);
+ return 0;
}
/* Set us up with a transaction and an empty context. */
@@ -1248,7 +1249,7 @@ xchk_irele(
* hits do not clear DONTCACHE, so we must do it here.
*/
spin_lock(&VFS_I(ip)->i_lock);
- VFS_I(ip)->i_state &= ~I_DONTCACHE;
+ inode_state_clear(VFS_I(ip), I_DONTCACHE);
spin_unlock(&VFS_I(ip)->i_lock);
}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 19877d99f255..ddbc065c798c 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -7,7 +7,7 @@
#define __XFS_SCRUB_COMMON_H__
int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks);
-int xchk_trans_alloc_empty(struct xfs_scrub *sc);
+void xchk_trans_alloc_empty(struct xfs_scrub *sc);
void xchk_trans_cancel(struct xfs_scrub *sc);
bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno,
diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c
index 38a246b8bf11..b2a83801412e 100644
--- a/fs/xfs/scrub/cow_repair.c
+++ b/fs/xfs/scrub/cow_repair.c
@@ -300,7 +300,7 @@ xrep_cow_find_bad(
* on the debugging knob, replace everything in the CoW fork.
*/
if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
- XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
+ XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
xc->irec.br_blockcount);
if (error)
@@ -385,7 +385,7 @@ xrep_cow_find_bad_rt(
* CoW fork and then scan for staging extents in the refcountbt.
*/
if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
- XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
+ XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
xc->irec.br_blockcount);
if (error)
diff --git a/fs/xfs/scrub/dir_repair.c b/fs/xfs/scrub/dir_repair.c
index 249313882108..8d3b550990b5 100644
--- a/fs/xfs/scrub/dir_repair.c
+++ b/fs/xfs/scrub/dir_repair.c
@@ -1289,9 +1289,7 @@ xrep_dir_scan_dirtree(
if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))
xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED |
XFS_ILOCK_EXCL));
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
while ((error = xchk_iscan_iter(&rd->pscan.iscan, &ip)) == 1) {
bool flush;
@@ -1317,9 +1315,7 @@ xrep_dir_scan_dirtree(
if (error)
break;
- error = xchk_trans_alloc_empty(sc);
- if (error)
- break;
+ xchk_trans_alloc_empty(sc);
}
if (xchk_should_terminate(sc, &error))
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index e629663e460a..cebd0d526926 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -123,7 +123,7 @@ xchk_fsfreeze(
{
int error;
- error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
+ error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL, NULL);
trace_xchk_fsfreeze(sc, error);
return error;
}
@@ -135,7 +135,7 @@ xchk_fsthaw(
int error;
/* This should always succeed, we have a kernel freeze */
- error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
+ error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL, NULL);
trace_xchk_fsthaw(sc, error);
return error;
}
@@ -237,7 +237,8 @@ xchk_setup_fscounters(
return error;
}
- return xchk_trans_alloc_empty(sc);
+ xchk_trans_alloc_empty(sc);
+ return 0;
}
/*
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index a90a011c7e5f..4f7040c9ddf0 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -1933,7 +1933,7 @@ xrep_inode_pptr(
* Unlinked inodes that cannot be added to the directory tree will not
* have a parent pointer.
*/
- if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
+ if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE))
return 0;
/* Children of the superblock do not have parent pointers. */
diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c
index e21c16fbd15d..378ec7c8d38e 100644
--- a/fs/xfs/scrub/metapath.c
+++ b/fs/xfs/scrub/metapath.c
@@ -79,7 +79,7 @@ xchk_metapath_cleanup(
if (mpath->dp_ilock_flags)
xfs_iunlock(mpath->dp, mpath->dp_ilock_flags);
- kfree(mpath->path);
+ kfree_const(mpath->path);
}
/* Set up a metadir path scan. @path must be dynamically allocated. */
@@ -98,13 +98,13 @@ xchk_setup_metapath_scan(
error = xchk_install_live_inode(sc, ip);
if (error) {
- kfree(path);
+ kfree_const(path);
return error;
}
mpath = kzalloc(sizeof(struct xchk_metapath), XCHK_GFP_FLAGS);
if (!mpath) {
- kfree(path);
+ kfree_const(path);
return -ENOMEM;
}
@@ -132,7 +132,7 @@ xchk_setup_metapath_rtdir(
return -ENOENT;
return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip,
- kasprintf(GFP_KERNEL, "rtgroups"), sc->mp->m_rtdirip);
+ kstrdup_const("rtgroups", GFP_KERNEL), sc->mp->m_rtdirip);
}
/* Scan a rtgroup inode under the /rtgroups directory. */
@@ -179,7 +179,7 @@ xchk_setup_metapath_quotadir(
return -ENOENT;
return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip,
- kstrdup("quota", GFP_KERNEL), qi->qi_dirip);
+ kstrdup_const("quota", GFP_KERNEL), qi->qi_dirip);
}
/* Scan a quota inode under the /quota directory. */
@@ -212,7 +212,7 @@ xchk_setup_metapath_dqinode(
return -ENOENT;
return xchk_setup_metapath_scan(sc, qi->qi_dirip,
- kstrdup(xfs_dqinode_path(type), GFP_KERNEL), ip);
+ kstrdup_const(xfs_dqinode_path(type), GFP_KERNEL), ip);
}
#else
# define xchk_setup_metapath_quotadir(...) (-ENOENT)
@@ -318,9 +318,7 @@ xchk_metapath(
return 0;
}
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
error = xchk_metapath_ilock_both(mpath);
if (error)
diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
index 1588ce971cb8..951ae8b71566 100644
--- a/fs/xfs/scrub/newbt.c
+++ b/fs/xfs/scrub/newbt.c
@@ -28,6 +28,15 @@
#include "scrub/newbt.h"
/*
+ * This is the maximum number of deferred extent freeing item extents (EFIs)
+ * that we'll attach to a transaction without rolling the transaction to avoid
+ * overrunning a tr_itruncate reservation. The newbt code should reserve
+ * exactly the correct number of blocks to rebuild the btree, so there should
+ * not be any excess blocks to free when committing a new btree.
+ */
+#define XREP_MAX_ITRUNCATE_EFIS (128)
+
+/*
* Estimate proper slack values for a btree that's being reloaded.
*
* Under most circumstances, we'll take whatever default loading value the
diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c
index 4a47d0aabf73..091c79e432e5 100644
--- a/fs/xfs/scrub/nlinks.c
+++ b/fs/xfs/scrub/nlinks.c
@@ -376,6 +376,36 @@ out_incomplete:
return error;
}
+static uint
+xchk_nlinks_ilock_dir(
+ struct xfs_inode *ip)
+{
+ uint lock_mode = XFS_ILOCK_SHARED;
+
+ /*
+ * We're going to scan the directory entries, so we must be ready to
+ * pull the data fork mappings into memory if they aren't already.
+ */
+ if (xfs_need_iread_extents(&ip->i_df))
+ lock_mode = XFS_ILOCK_EXCL;
+
+ /*
+ * We're going to scan the parent pointers, so we must be ready to
+ * pull the attr fork mappings into memory if they aren't already.
+ */
+ if (xfs_has_parent(ip->i_mount) && xfs_inode_has_attr_fork(ip) &&
+ xfs_need_iread_extents(&ip->i_af))
+ lock_mode = XFS_ILOCK_EXCL;
+
+ /*
+ * Take the IOLOCK so that other threads cannot start a directory
+ * update while we're scanning.
+ */
+ lock_mode |= XFS_IOLOCK_SHARED;
+ xfs_ilock(ip, lock_mode);
+ return lock_mode;
+}
+
/* Walk a directory to bump the observed link counts of the children. */
STATIC int
xchk_nlinks_collect_dir(
@@ -394,8 +424,7 @@ xchk_nlinks_collect_dir(
return 0;
/* Prevent anyone from changing this directory while we walk it. */
- xfs_ilock(dp, XFS_IOLOCK_SHARED);
- lock_mode = xfs_ilock_data_map_shared(dp);
+ lock_mode = xchk_nlinks_ilock_dir(dp);
/*
* The dotdot entry of an unlinked directory still points to the last
@@ -452,7 +481,6 @@ out_abort:
xchk_iscan_abort(&xnc->collect_iscan);
out_unlock:
xfs_iunlock(dp, lock_mode);
- xfs_iunlock(dp, XFS_IOLOCK_SHARED);
return error;
}
@@ -555,9 +583,7 @@ xchk_nlinks_collect(
* do not take sb_internal.
*/
xchk_trans_cancel(sc);
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
while ((error = xchk_iscan_iter(&xnc->collect_iscan, &ip)) == 1) {
if (S_ISDIR(VFS_I(ip)->i_mode))
@@ -880,9 +906,7 @@ xchk_nlinks_compare(
* inactivation workqueue.
*/
xchk_trans_cancel(sc);
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
/*
* Use the inobt to walk all allocated inodes to compare the link
diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c
index 4ebdee095428..6ef2ee9c3814 100644
--- a/fs/xfs/scrub/nlinks_repair.c
+++ b/fs/xfs/scrub/nlinks_repair.c
@@ -340,9 +340,7 @@ xrep_nlinks(
* We can only push the inactivation workqueues with an empty
* transaction.
*/
- error = xchk_trans_alloc_empty(sc);
- if (error)
- break;
+ xchk_trans_alloc_empty(sc);
}
xchk_iscan_iter_finish(&xnc->compare_iscan);
xchk_iscan_teardown(&xnc->compare_iscan);
diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c
index 3537f3cca6d5..4e550a1d5353 100644
--- a/fs/xfs/scrub/orphanage.c
+++ b/fs/xfs/scrub/orphanage.c
@@ -152,12 +152,10 @@ xrep_orphanage_create(
}
/* Try to find the orphanage directory. */
- inode_lock_nested(root_inode, I_MUTEX_PARENT);
- orphanage_dentry = lookup_one_len(ORPHANAGE, root_dentry,
- strlen(ORPHANAGE));
+ orphanage_dentry = start_creating_noperm(root_dentry, &QSTR(ORPHANAGE));
if (IS_ERR(orphanage_dentry)) {
error = PTR_ERR(orphanage_dentry);
- goto out_unlock_root;
+ goto out_dput_root;
}
/*
@@ -168,10 +166,10 @@ xrep_orphanage_create(
*/
if (d_really_is_negative(orphanage_dentry)) {
orphanage_dentry = vfs_mkdir(&nop_mnt_idmap, root_inode,
- orphanage_dentry, 0750);
+ orphanage_dentry, 0750, NULL);
error = PTR_ERR(orphanage_dentry);
if (IS_ERR(orphanage_dentry))
- goto out_unlock_root;
+ goto out_dput_orphanage;
}
/* Not a directory? Bail out. */
@@ -201,9 +199,7 @@ xrep_orphanage_create(
sc->orphanage_ilock_flags = 0;
out_dput_orphanage:
- dput(orphanage_dentry);
-out_unlock_root:
- inode_unlock(VFS_I(sc->mp->m_rootip));
+ end_creating(orphanage_dentry);
out_dput_root:
dput(root_dentry);
out:
@@ -445,7 +441,7 @@ xrep_adoption_check_dcache(
if (!d_orphanage)
return 0;
- d_child = d_hash_and_lookup(d_orphanage, &qname);
+ d_child = try_lookup_noperm(&qname, d_orphanage);
if (d_child) {
trace_xrep_adoption_check_child(sc->mp, d_child);
@@ -482,7 +478,7 @@ xrep_adoption_zap_dcache(
if (!d_orphanage)
return;
- d_child = d_hash_and_lookup(d_orphanage, &qname);
+ d_child = try_lookup_noperm(&qname, d_orphanage);
while (d_child != NULL) {
trace_xrep_adoption_invalidate_child(sc->mp, d_child);
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index 3b692c4acc1e..11d5de10fd56 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -915,7 +915,7 @@ xchk_pptr_looks_zapped(
* Temporary files that cannot be linked into the directory tree do not
* have attr forks because they cannot ever have parents.
*/
- if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
+ if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE))
return false;
/*
diff --git a/fs/xfs/scrub/parent_repair.c b/fs/xfs/scrub/parent_repair.c
index 31bfe10be22a..2949feda6271 100644
--- a/fs/xfs/scrub/parent_repair.c
+++ b/fs/xfs/scrub/parent_repair.c
@@ -569,9 +569,7 @@ xrep_parent_scan_dirtree(
if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))
xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED |
XFS_ILOCK_EXCL));
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
while ((error = xchk_iscan_iter(&rp->pscan.iscan, &ip)) == 1) {
bool flush;
@@ -597,9 +595,7 @@ xrep_parent_scan_dirtree(
if (error)
break;
- error = xchk_trans_alloc_empty(sc);
- if (error)
- break;
+ xchk_trans_alloc_empty(sc);
}
if (xchk_should_terminate(sc, &error))
@@ -1099,9 +1095,7 @@ xrep_parent_flush_xattrs(
xrep_tempfile_iounlock(rp->sc);
/* Recreate the empty transaction and relock the inode. */
- error = xchk_trans_alloc_empty(rp->sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(rp->sc);
xchk_ilock(rp->sc, XFS_ILOCK_EXCL);
return 0;
}
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 58d6d4ed2853..5c5374c44c5a 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -155,12 +155,9 @@ xchk_quota_item(
* We want to validate the bmap record for the storage backing this
* dquot, so we need to lock the dquot and the quota file. For quota
* operations, the locking order is first the ILOCK and then the dquot.
- * However, dqiterate gave us a locked dquot, so drop the dquot lock to
- * get the ILOCK.
*/
- xfs_dqunlock(dq);
xchk_ilock(sc, XFS_ILOCK_SHARED);
- xfs_dqlock(dq);
+ mutex_lock(&dq->q_qlock);
/*
* Except for the root dquot, the actual dquot we got must either have
@@ -251,6 +248,7 @@ xchk_quota_item(
xchk_quota_item_timer(sc, offset, &dq->q_rtb);
out:
+ mutex_unlock(&dq->q_qlock);
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return -ECANCELED;
@@ -330,7 +328,7 @@ xchk_quota(
xchk_dqiter_init(&cursor, sc, dqtype);
while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
error = xchk_quota_item(&sqi, dq);
- xfs_qm_dqput(dq);
+ xfs_qm_dqrele(dq);
if (error)
break;
}
diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c
index 8f4c8d41f308..b1d661aa5f06 100644
--- a/fs/xfs/scrub/quota_repair.c
+++ b/fs/xfs/scrub/quota_repair.c
@@ -184,17 +184,13 @@ xrep_quota_item(
/*
* We might need to fix holes in the bmap record for the storage
* backing this dquot, so we need to lock the dquot and the quota file.
- * dqiterate gave us a locked dquot, so drop the dquot lock to get the
- * ILOCK_EXCL.
*/
- xfs_dqunlock(dq);
xchk_ilock(sc, XFS_ILOCK_EXCL);
- xfs_dqlock(dq);
-
+ mutex_lock(&dq->q_qlock);
error = xrep_quota_item_bmap(sc, dq, &dirty);
xchk_iunlock(sc, XFS_ILOCK_EXCL);
if (error)
- return error;
+ goto out_unlock_dquot;
/* Check the limits. */
if (dq->q_blk.softlimit > dq->q_blk.hardlimit) {
@@ -246,7 +242,7 @@ xrep_quota_item(
xrep_quota_item_timer(sc, &dq->q_rtb, &dirty);
if (!dirty)
- return 0;
+ goto out_unlock_dquot;
trace_xrep_dquot_item(sc->mp, dq->q_type, dq->q_id);
@@ -257,8 +253,10 @@ xrep_quota_item(
xfs_qm_adjust_dqtimers(dq);
}
xfs_trans_log_dquot(sc->tp, dq);
- error = xfs_trans_roll(&sc->tp);
- xfs_dqlock(dq);
+ return xfs_trans_roll(&sc->tp);
+
+out_unlock_dquot:
+ mutex_unlock(&dq->q_qlock);
return error;
}
@@ -513,7 +511,7 @@ xrep_quota_problems(
xchk_dqiter_init(&cursor, sc, dqtype);
while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
error = xrep_quota_item(&rqi, dq);
- xfs_qm_dqput(dq);
+ xfs_qm_dqrele(dq);
if (error)
break;
}
diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c
index dc4033b91e44..d412a8359784 100644
--- a/fs/xfs/scrub/quotacheck.c
+++ b/fs/xfs/scrub/quotacheck.c
@@ -505,9 +505,7 @@ xqcheck_collect_counts(
* transactions do not take sb_internal.
*/
xchk_trans_cancel(sc);
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
while ((error = xchk_iscan_iter(&xqc->iscan, &ip)) == 1) {
error = xqcheck_collect_inode(xqc, ip);
@@ -565,6 +563,7 @@ xqcheck_compare_dquot(
return -ECANCELED;
}
+ mutex_lock(&dq->q_qlock);
mutex_lock(&xqc->lock);
error = xfarray_load_sparse(counts, dq->q_id, &xcdq);
if (error)
@@ -591,7 +590,9 @@ xqcheck_compare_dquot(
xchk_set_incomplete(xqc->sc);
error = -ECANCELED;
}
+out_unlock:
mutex_unlock(&xqc->lock);
+ mutex_unlock(&dq->q_qlock);
if (error)
return error;
@@ -599,10 +600,6 @@ xqcheck_compare_dquot(
return -ECANCELED;
return 0;
-
-out_unlock:
- mutex_unlock(&xqc->lock);
- return error;
}
/*
@@ -638,7 +635,7 @@ xqcheck_walk_observations(
return error;
error = xqcheck_compare_dquot(xqc, dqtype, dq);
- xfs_qm_dqput(dq);
+ xfs_qm_dqrele(dq);
if (error)
return error;
@@ -676,7 +673,7 @@ xqcheck_compare_dqtype(
xchk_dqiter_init(&cursor, sc, dqtype);
while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
error = xqcheck_compare_dquot(xqc, dqtype, dq);
- xfs_qm_dqput(dq);
+ xfs_qm_dqrele(dq);
if (error)
break;
}
diff --git a/fs/xfs/scrub/quotacheck_repair.c b/fs/xfs/scrub/quotacheck_repair.c
index dd8554c755b5..51be8d8d261b 100644
--- a/fs/xfs/scrub/quotacheck_repair.c
+++ b/fs/xfs/scrub/quotacheck_repair.c
@@ -52,13 +52,11 @@ xqcheck_commit_dquot(
bool dirty = false;
int error = 0;
- /* Unlock the dquot just long enough to allocate a transaction. */
- xfs_dqunlock(dq);
error = xchk_trans_alloc(xqc->sc, 0);
- xfs_dqlock(dq);
if (error)
return error;
+ mutex_lock(&dq->q_qlock);
xfs_trans_dqjoin(xqc->sc->tp, dq);
if (xchk_iscan_aborted(&xqc->iscan)) {
@@ -115,23 +113,12 @@ xqcheck_commit_dquot(
if (dq->q_id)
xfs_qm_adjust_dqtimers(dq);
xfs_trans_log_dquot(xqc->sc->tp, dq);
-
- /*
- * Transaction commit unlocks the dquot, so we must re-lock it so that
- * the caller can put the reference (which apparently requires a locked
- * dquot).
- */
- error = xrep_trans_commit(xqc->sc);
- xfs_dqlock(dq);
- return error;
+ return xrep_trans_commit(xqc->sc);
out_unlock:
mutex_unlock(&xqc->lock);
out_cancel:
xchk_trans_cancel(xqc->sc);
-
- /* Re-lock the dquot so the caller can put the reference. */
- xfs_dqlock(dq);
return error;
}
@@ -156,7 +143,7 @@ xqcheck_commit_dqtype(
xchk_dqiter_init(&cursor, sc, dqtype);
while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
error = xqcheck_commit_dquot(xqc, dqtype, dq);
- xfs_qm_dqput(dq);
+ xfs_qm_dqrele(dq);
if (error)
break;
}
@@ -187,7 +174,7 @@ xqcheck_commit_dqtype(
return error;
error = xqcheck_commit_dquot(xqc, dqtype, dq);
- xfs_qm_dqput(dq);
+ xfs_qm_dqrele(dq);
if (error)
return error;
diff --git a/fs/xfs/scrub/rcbag_btree.c b/fs/xfs/scrub/rcbag_btree.c
index 709356dc6256..9a4ef823c5a7 100644
--- a/fs/xfs/scrub/rcbag_btree.c
+++ b/fs/xfs/scrub/rcbag_btree.c
@@ -47,29 +47,20 @@ rcbagbt_init_rec_from_cur(
bag_rec->rbg_refcount = bag_irec->rbg_refcount;
}
-STATIC int64_t
-rcbagbt_key_diff(
+STATIC int
+rcbagbt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
struct rcbag_rec *rec = (struct rcbag_rec *)&cur->bc_rec;
const struct rcbag_key *kp = (const struct rcbag_key *)key;
- if (kp->rbg_startblock > rec->rbg_startblock)
- return 1;
- if (kp->rbg_startblock < rec->rbg_startblock)
- return -1;
-
- if (kp->rbg_blockcount > rec->rbg_blockcount)
- return 1;
- if (kp->rbg_blockcount < rec->rbg_blockcount)
- return -1;
-
- return 0;
+ return cmp_int(kp->rbg_startblock, rec->rbg_startblock) ?:
+ cmp_int(kp->rbg_blockcount, rec->rbg_blockcount);
}
-STATIC int64_t
-rcbagbt_diff_two_keys(
+STATIC int
+rcbagbt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
@@ -80,17 +71,8 @@ rcbagbt_diff_two_keys(
ASSERT(mask == NULL);
- if (kp1->rbg_startblock > kp2->rbg_startblock)
- return 1;
- if (kp1->rbg_startblock < kp2->rbg_startblock)
- return -1;
-
- if (kp1->rbg_blockcount > kp2->rbg_blockcount)
- return 1;
- if (kp1->rbg_blockcount < kp2->rbg_blockcount)
- return -1;
-
- return 0;
+ return cmp_int(kp1->rbg_startblock, kp2->rbg_startblock) ?:
+ cmp_int(kp1->rbg_blockcount, kp2->rbg_blockcount);
}
STATIC int
@@ -201,9 +183,9 @@ static const struct xfs_btree_ops rcbagbt_mem_ops = {
.init_key_from_rec = rcbagbt_init_key_from_rec,
.init_rec_from_cur = rcbagbt_init_rec_from_cur,
.init_ptr_from_cur = xfbtree_init_ptr_from_cur,
- .key_diff = rcbagbt_key_diff,
+ .cmp_key_with_cur = rcbagbt_cmp_key_with_cur,
.buf_ops = &rcbagbt_mem_buf_ops,
- .diff_two_keys = rcbagbt_diff_two_keys,
+ .cmp_two_keys = rcbagbt_cmp_two_keys,
.keys_inorder = rcbagbt_keys_inorder,
.recs_inorder = rcbagbt_recs_inorder,
};
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index 8703897c0a9c..07f5bb8a6421 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -36,6 +36,12 @@
#include "xfs_metafile.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
+#include "xfs_extfree_item.h"
+#include "xfs_rmap_item.h"
+#include "xfs_refcount_item.h"
+#include "xfs_buf_item.h"
+#include "xfs_bmap_item.h"
+#include "xfs_bmap_btree.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -91,21 +97,33 @@
struct xreap_state {
struct xfs_scrub *sc;
- /* Reverse mapping owner and metadata reservation type. */
- const struct xfs_owner_info *oinfo;
- enum xfs_ag_resv_type resv;
+ union {
+ struct {
+ /*
+ * For AG blocks, this is reverse mapping owner and
+ * metadata reservation type.
+ */
+ const struct xfs_owner_info *oinfo;
+ enum xfs_ag_resv_type resv;
+ };
+ struct {
+ /* For file blocks, this is the inode and fork. */
+ struct xfs_inode *ip;
+ int whichfork;
+ };
+ };
- /* If true, roll the transaction before reaping the next extent. */
- bool force_roll;
+ /* Number of invalidated buffers logged to the current transaction. */
+ unsigned int nr_binval;
- /* Number of deferred reaps attached to the current transaction. */
- unsigned int deferred;
+ /* Maximum number of buffers we can invalidate in a single tx. */
+ unsigned int max_binval;
- /* Number of invalidated buffers logged to the current transaction. */
- unsigned int invalidated;
+ /* Number of deferred reaps attached to the current transaction. */
+ unsigned int nr_deferred;
- /* Number of deferred reaps queued during the whole reap sequence. */
- unsigned long long total_deferred;
+ /* Maximum number of intents we can reap in a single transaction. */
+ unsigned int max_deferred;
};
/* Put a block back on the AGFL. */
@@ -148,71 +166,79 @@ xreap_put_freelist(
}
/* Are there any uncommitted reap operations? */
-static inline bool xreap_dirty(const struct xreap_state *rs)
+static inline bool xreap_is_dirty(const struct xreap_state *rs)
{
- if (rs->force_roll)
- return true;
- if (rs->deferred)
- return true;
- if (rs->invalidated)
- return true;
- if (rs->total_deferred)
- return true;
- return false;
+ return rs->nr_binval > 0 || rs->nr_deferred > 0;
}
-#define XREAP_MAX_BINVAL (2048)
-
/*
- * Decide if we want to roll the transaction after reaping an extent. We don't
- * want to overrun the transaction reservation, so we prohibit more than
- * 128 EFIs per transaction. For the same reason, we limit the number
- * of buffer invalidations to 2048.
+ * Decide if we need to roll the transaction to clear out the log
+ * reservation that we allocated to buffer invalidations.
*/
-static inline bool xreap_want_roll(const struct xreap_state *rs)
+static inline bool xreap_want_binval_roll(const struct xreap_state *rs)
{
- if (rs->force_roll)
- return true;
- if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS)
- return true;
- if (rs->invalidated > XREAP_MAX_BINVAL)
- return true;
- return false;
+ return rs->nr_binval >= rs->max_binval;
}
-static inline void xreap_reset(struct xreap_state *rs)
+/* Reset the buffer invalidation count after rolling. */
+static inline void xreap_binval_reset(struct xreap_state *rs)
{
- rs->total_deferred += rs->deferred;
- rs->deferred = 0;
- rs->invalidated = 0;
- rs->force_roll = false;
+ rs->nr_binval = 0;
}
-#define XREAP_MAX_DEFER_CHAIN (2048)
+/*
+ * Bump the number of invalidated buffers, and return true if we can continue,
+ * or false if we need to roll the transaction.
+ */
+static inline bool xreap_inc_binval(struct xreap_state *rs)
+{
+ rs->nr_binval++;
+ return rs->nr_binval < rs->max_binval;
+}
/*
* Decide if we want to finish the deferred ops that are attached to the scrub
* transaction. We don't want to queue huge chains of deferred ops because
* that can consume a lot of log space and kernel memory. Hence we trigger a
- * xfs_defer_finish if there are more than 2048 deferred reap operations or the
- * caller did some real work.
+ * xfs_defer_finish if there are too many deferred reap operations or we've run
+ * out of space for invalidations.
*/
-static inline bool
-xreap_want_defer_finish(const struct xreap_state *rs)
+static inline bool xreap_want_defer_finish(const struct xreap_state *rs)
{
- if (rs->force_roll)
- return true;
- if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN)
- return true;
- return false;
+ return rs->nr_deferred >= rs->max_deferred;
}
+/*
+ * Reset the defer chain length and buffer invalidation count after finishing
+ * items.
+ */
static inline void xreap_defer_finish_reset(struct xreap_state *rs)
{
- rs->total_deferred = 0;
- rs->deferred = 0;
- rs->invalidated = 0;
- rs->force_roll = false;
+ rs->nr_deferred = 0;
+ rs->nr_binval = 0;
+}
+
+/*
+ * Bump the number of deferred extent reaps.
+ */
+static inline void xreap_inc_defer(struct xreap_state *rs)
+{
+ rs->nr_deferred++;
+}
+
+/* Force the caller to finish a deferred item chain. */
+static inline void xreap_force_defer_finish(struct xreap_state *rs)
+{
+ rs->nr_deferred = rs->max_deferred;
+}
+
+/* Maximum number of fsblocks that we might find in a buffer to invalidate. */
+static inline unsigned int
+xrep_binval_max_fsblocks(
+ struct xfs_mount *mp)
+{
+ /* Remote xattr values are the largest buffers that we support. */
+ return xfs_attr3_max_rmt_blocks(mp);
}
/*
@@ -224,12 +250,8 @@ xrep_bufscan_max_sectors(
struct xfs_mount *mp,
xfs_extlen_t fsblocks)
{
- int max_fsbs;
-
- /* Remote xattr values are the largest buffers that we support. */
- max_fsbs = xfs_attr3_max_rmt_blocks(mp);
-
- return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, max_fsbs));
+ return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks,
+ xrep_binval_max_fsblocks(mp)));
}
/*
@@ -297,14 +319,13 @@ xreap_agextent_binval(
while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
xfs_trans_bjoin(sc->tp, bp);
xfs_trans_binval(sc->tp, bp);
- rs->invalidated++;
/*
* Stop invalidating if we've hit the limit; we should
* still have enough reservation left to free however
* far we've gotten.
*/
- if (rs->invalidated > XREAP_MAX_BINVAL) {
+ if (!xreap_inc_binval(rs)) {
*aglenp -= agbno_next - bno;
goto out;
}
@@ -416,21 +437,23 @@ xreap_agextent_iter(
trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), agbno,
*aglenp);
- rs->force_roll = true;
-
if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
/*
- * If we're unmapping CoW staging extents, remove the
+ * t0: Unmapping CoW staging extents, remove the
* records from the refcountbt, which will remove the
* rmap record as well.
*/
xfs_refcount_free_cow_extent(sc->tp, false, fsbno,
*aglenp);
+ xreap_inc_defer(rs);
return 0;
}
- return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno,
- *aglenp, rs->oinfo);
+ /* t1: unmap crosslinked metadata blocks */
+ xfs_rmap_free_extent(sc->tp, false, fsbno, *aglenp,
+ rs->oinfo->oi_owner);
+ xreap_inc_defer(rs);
+ return 0;
}
trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), agbno, *aglenp);
@@ -443,12 +466,12 @@ xreap_agextent_iter(
*/
xreap_agextent_binval(rs, agbno, aglenp);
if (*aglenp == 0) {
- ASSERT(xreap_want_roll(rs));
+ ASSERT(xreap_want_binval_roll(rs));
return 0;
}
/*
- * If we're getting rid of CoW staging extents, use deferred work items
+ * t2: To get rid of CoW staging extents, use deferred work items
* to remove the refcountbt records (which removes the rmap records)
* and free the extent. We're not worried about the system going down
* here because log recovery walks the refcount btree to clean out the
@@ -463,23 +486,23 @@ xreap_agextent_iter(
if (error)
return error;
- rs->force_roll = true;
+ xreap_inc_defer(rs);
return 0;
}
- /* Put blocks back on the AGFL one at a time. */
+ /* t3: Put blocks back on the AGFL one at a time. */
if (rs->resv == XFS_AG_RESV_AGFL) {
ASSERT(*aglenp == 1);
error = xreap_put_freelist(sc, agbno);
if (error)
return error;
- rs->force_roll = true;
+ xreap_force_defer_finish(rs);
return 0;
}
/*
- * Use deferred frees to get rid of the old btree blocks to try to
+ * t4: Use deferred frees to get rid of the old btree blocks to try to
* minimize the window in which we could crash and lose the old blocks.
* Add a defer ops barrier every other extent to avoid stressing the
* system with large EFIs.
@@ -489,12 +512,194 @@ xreap_agextent_iter(
if (error)
return error;
- rs->deferred++;
- if (rs->deferred % 2 == 0)
+ xreap_inc_defer(rs);
+ if (rs->nr_deferred % 2 == 0)
xfs_defer_add_barrier(sc->tp);
return 0;
}
+/* Configure the deferral and invalidation limits */
+static inline void
+xreap_configure_limits(
+ struct xreap_state *rs,
+ unsigned int fixed_overhead,
+ unsigned int variable_overhead,
+ unsigned int per_intent,
+ unsigned int per_binval)
+{
+ struct xfs_scrub *sc = rs->sc;
+ unsigned int res = sc->tp->t_log_res - fixed_overhead;
+
+ /* Don't underflow the reservation */
+ if (sc->tp->t_log_res < (fixed_overhead + variable_overhead)) {
+ ASSERT(sc->tp->t_log_res >=
+ (fixed_overhead + variable_overhead));
+ xfs_force_shutdown(sc->mp, SHUTDOWN_CORRUPT_INCORE);
+ return;
+ }
+
+ rs->max_deferred = per_intent ? res / variable_overhead : 0;
+ res -= rs->max_deferred * per_intent;
+ rs->max_binval = per_binval ? res / per_binval : 0;
+}
+
+/*
+ * Compute the maximum number of intent items that reaping can attach to the
+ * scrub transaction given the worst case log overhead of the intent items
+ * needed to reap a single per-AG space extent. This is not for freeing CoW
+ * staging extents.
+ */
+STATIC void
+xreap_configure_agextent_limits(
+ struct xreap_state *rs)
+{
+ struct xfs_scrub *sc = rs->sc;
+ struct xfs_mount *mp = sc->mp;
+
+ /*
+ * In the worst case, relogging an intent item causes both an intent
+ * item and a done item to be attached to a transaction for each extent
+ * that we'd like to process.
+ */
+ const unsigned int efi = xfs_efi_log_space(1) +
+ xfs_efd_log_space(1);
+ const unsigned int rui = xfs_rui_log_space(1) +
+ xfs_rud_log_space();
+
+ /*
+ * Various things can happen when reaping non-CoW metadata blocks:
+ *
+ * t1: Unmapping crosslinked metadata blocks: deferred removal of rmap
+ * record.
+ *
+ * t3: Freeing to AGFL: roll and finish deferred items for every block.
+ * Limits here do not matter.
+ *
+ * t4: Freeing metadata blocks: deferred freeing of the space, which
+ * also removes the rmap record.
+ *
+ * For simplicity, we'll use the worst-case intents size to determine
+ * the maximum number of deferred extents before we have to finish the
+ * whole chain. If we're trying to reap a btree larger than this size,
+ * a crash midway through reaping can result in leaked blocks.
+ */
+ const unsigned int t1 = rui;
+ const unsigned int t4 = rui + efi;
+ const unsigned int per_intent = max(t1, t4);
+
+ /*
+ * For each transaction in a reap chain, we must be able to take one
+ * step in the defer item chain, which should only consist of EFI or
+ * RUI items.
+ */
+ const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1);
+ const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1);
+ const unsigned int step_size = max(f1, f2);
+
+ /* Largest buffer size (in fsblocks) that can be invalidated. */
+ const unsigned int max_binval = xrep_binval_max_fsblocks(mp);
+
+ /* Maximum overhead of invalidating one buffer. */
+ const unsigned int per_binval =
+ xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval));
+
+ /*
+ * For each transaction in a reap chain, we can delete some number of
+ * extents and invalidate some number of blocks. We assume that btree
+ * blocks aren't usually contiguous; and that scrub likely pulled all
+ * the buffers into memory. From these assumptions, set the maximum
+ * number of deferrals we can queue before flushing the defer chain,
+ * and the number of invalidations we can queue before rolling to a
+ * clean transaction (and possibly relogging some of the deferrals) to
+ * the same quantity.
+ */
+ const unsigned int variable_overhead = per_intent + per_binval;
+
+ xreap_configure_limits(rs, step_size, variable_overhead, per_intent,
+ per_binval);
+
+ trace_xreap_agextent_limits(sc->tp, per_binval, rs->max_binval,
+ step_size, per_intent, rs->max_deferred);
+}
+
+/*
+ * Compute the maximum number of intent items that reaping can attach to the
+ * scrub transaction given the worst case log overhead of the intent items
+ * needed to reap a single CoW staging extent. This is not for freeing
+ * metadata blocks.
+ */
+STATIC void
+xreap_configure_agcow_limits(
+ struct xreap_state *rs)
+{
+ struct xfs_scrub *sc = rs->sc;
+ struct xfs_mount *mp = sc->mp;
+
+ /*
+ * In the worst case, relogging an intent item causes both an intent
+ * item and a done item to be attached to a transaction for each extent
+ * that we'd like to process.
+ */
+ const unsigned int efi = xfs_efi_log_space(1) +
+ xfs_efd_log_space(1);
+ const unsigned int rui = xfs_rui_log_space(1) +
+ xfs_rud_log_space();
+ const unsigned int cui = xfs_cui_log_space(1) +
+ xfs_cud_log_space();
+
+ /*
+ * Various things can happen when reaping CoW staging extents:
+ *
+ * t0: Unmapping crosslinked CoW blocks: deferred removal of refcount
+ * record, which defers removal of rmap record
+ *
+ * t2: Freeing CoW blocks: deferred removal of refcount record, which
+ * defers removal of rmap record; and deferred removal of the space
+ *
+ * For simplicity, we'll use the worst-case intents size to determine
+ * the maximum number of deferred extents before we have to finish the
+ * whole chain. If we're trying to reap a btree larger than this size,
+ * a crash midway through reaping can result in leaked blocks.
+ */
+ const unsigned int t0 = cui + rui;
+ const unsigned int t2 = cui + rui + efi;
+ const unsigned int per_intent = max(t0, t2);
+
+ /*
+ * For each transaction in a reap chain, we must be able to take one
+ * step in the defer item chain, which should only consist of CUI, EFI,
+ * or RUI items.
+ */
+ const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1);
+ const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1);
+ const unsigned int f3 = xfs_calc_finish_cui_reservation(mp, 1);
+ const unsigned int step_size = max3(f1, f2, f3);
+
+ /* Largest buffer size (in fsblocks) that can be invalidated. */
+ const unsigned int max_binval = xrep_binval_max_fsblocks(mp);
+
+ /* Overhead of invalidating one buffer */
+ const unsigned int per_binval =
+ xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval));
+
+ /*
+ * For each transaction in a reap chain, we can delete some number of
+ * extents and invalidate some number of blocks. We assume that CoW
+ * staging extents are usually more than 1 fsblock, and that there
+ * shouldn't be any buffers for those blocks. From the assumptions,
+ * set the number of deferrals to use as much of the reservation as
+ * it can, but leave space to invalidate 1/8th that number of buffers.
+ */
+ const unsigned int variable_overhead = per_intent +
+ (per_binval / 8);
+
+ xreap_configure_limits(rs, step_size, variable_overhead, per_intent,
+ per_binval);
+
+ trace_xreap_agcow_limits(sc->tp, per_binval, rs->max_binval, step_size,
+ per_intent, rs->max_deferred);
+}
+
/*
* Break an AG metadata extent into sub-extents by fate (crosslinked, not
* crosslinked), and dispose of each sub-extent separately.
@@ -531,11 +736,11 @@ xreap_agmeta_extent(
if (error)
return error;
xreap_defer_finish_reset(rs);
- } else if (xreap_want_roll(rs)) {
+ } else if (xreap_want_binval_roll(rs)) {
error = xrep_roll_ag_trans(sc);
if (error)
return error;
- xreap_reset(rs);
+ xreap_binval_reset(rs);
}
agbno += aglen;
@@ -562,11 +767,12 @@ xrep_reap_agblocks(
ASSERT(xfs_has_rmapbt(sc->mp));
ASSERT(sc->ip == NULL);
+ xreap_configure_agextent_limits(&rs);
error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs);
if (error)
return error;
- if (xreap_dirty(&rs))
+ if (xreap_is_dirty(&rs))
return xrep_defer_finish(sc);
return 0;
@@ -628,7 +834,7 @@ xreap_fsmeta_extent(
if (error)
goto out_agf;
xreap_defer_finish_reset(rs);
- } else if (xreap_want_roll(rs)) {
+ } else if (xreap_want_binval_roll(rs)) {
/*
* Hold the AGF buffer across the transaction roll so
* that we don't have to reattach it to the scrub
@@ -639,7 +845,7 @@ xreap_fsmeta_extent(
xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
if (error)
goto out_agf;
- xreap_reset(rs);
+ xreap_binval_reset(rs);
}
agbno += aglen;
@@ -674,11 +880,15 @@ xrep_reap_fsblocks(
ASSERT(xfs_has_rmapbt(sc->mp));
ASSERT(sc->ip != NULL);
+ if (oinfo == &XFS_RMAP_OINFO_COW)
+ xreap_configure_agcow_limits(&rs);
+ else
+ xreap_configure_agextent_limits(&rs);
error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
if (error)
return error;
- if (xreap_dirty(&rs))
+ if (xreap_is_dirty(&rs))
return xrep_defer_finish(sc);
return 0;
@@ -770,7 +980,7 @@ xreap_rgextent_iter(
rtbno = xfs_rgbno_to_rtb(sc->sr.rtg, rgbno);
/*
- * If there are other rmappings, this block is cross linked and must
+ * t1: There are other rmappings; this block is cross linked and must
* not be freed. Remove the forward and reverse mapping and move on.
*/
if (crosslinked) {
@@ -778,14 +988,14 @@ xreap_rgextent_iter(
*rglenp);
xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp);
- rs->deferred++;
+ xreap_inc_defer(rs);
return 0;
}
trace_xreap_dispose_free_extent(rtg_group(sc->sr.rtg), rgbno, *rglenp);
/*
- * The CoW staging extent is not crosslinked. Use deferred work items
+ * t2: The CoW staging extent is not crosslinked. Use deferred work
* to remove the refcountbt records (which removes the rmap records)
* and free the extent. We're not worried about the system going down
* here because log recovery walks the refcount btree to clean out the
@@ -799,10 +1009,73 @@ xreap_rgextent_iter(
if (error)
return error;
- rs->deferred++;
+ xreap_inc_defer(rs);
return 0;
}
+/*
+ * Compute the maximum number of intent items that reaping can attach to the
+ * scrub transaction given the worst case log overhead of the intent items
+ * needed to reap a single CoW staging extent. This is not for freeing
+ * metadata blocks.
+ */
+STATIC void
+xreap_configure_rgcow_limits(
+ struct xreap_state *rs)
+{
+ struct xfs_scrub *sc = rs->sc;
+ struct xfs_mount *mp = sc->mp;
+
+ /*
+ * In the worst case, relogging an intent item causes both an intent
+ * item and a done item to be attached to a transaction for each extent
+ * that we'd like to process.
+ */
+ const unsigned int efi = xfs_efi_log_space(1) +
+ xfs_efd_log_space(1);
+ const unsigned int rui = xfs_rui_log_space(1) +
+ xfs_rud_log_space();
+ const unsigned int cui = xfs_cui_log_space(1) +
+ xfs_cud_log_space();
+
+ /*
+ * Various things can happen when reaping CoW staging extents:
+ *
+ * t1: Unmapping crosslinked CoW blocks: deferred removal of refcount
+ * record, which defers removal of rmap record
+ *
+ * t2: Freeing CoW blocks: deferred removal of refcount record, which
+ * defers removal of rmap record; and deferred removal of the space
+ *
+ * For simplicity, we'll use the worst-case intents size to determine
+ * the maximum number of deferred extents before we have to finish the
+ * whole chain. If we're trying to reap a btree larger than this size,
+ * a crash midway through reaping can result in leaked blocks.
+ */
+ const unsigned int t1 = cui + rui;
+ const unsigned int t2 = cui + rui + efi;
+ const unsigned int per_intent = max(t1, t2);
+
+ /*
+ * For each transaction in a reap chain, we must be able to take one
+ * step in the defer item chain, which should only consist of CUI, EFI,
+ * or RUI items.
+ */
+ const unsigned int f1 = xfs_calc_finish_rt_efi_reservation(mp, 1);
+ const unsigned int f2 = xfs_calc_finish_rt_rui_reservation(mp, 1);
+ const unsigned int f3 = xfs_calc_finish_rt_cui_reservation(mp, 1);
+ const unsigned int step_size = max3(f1, f2, f3);
+
+ /*
+ * The only buffer for the rt device is the rtgroup super, so we don't
+ * need to save space for buffer invalidations.
+ */
+ xreap_configure_limits(rs, step_size, per_intent, per_intent, 0);
+
+ trace_xreap_rgcow_limits(sc->tp, 0, 0, step_size, per_intent,
+ rs->max_deferred);
+}
+
#define XREAP_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \
XFS_RTGLOCK_RMAP | \
XFS_RTGLOCK_REFCOUNT)
@@ -855,11 +1128,11 @@ xreap_rtmeta_extent(
if (error)
goto out_unlock;
xreap_defer_finish_reset(rs);
- } else if (xreap_want_roll(rs)) {
+ } else if (xreap_want_binval_roll(rs)) {
error = xfs_trans_roll_inode(&sc->tp, sc->ip);
if (error)
goto out_unlock;
- xreap_reset(rs);
+ xreap_binval_reset(rs);
}
rgbno += rglen;
@@ -891,12 +1164,14 @@ xrep_reap_rtblocks(
ASSERT(xfs_has_rmapbt(sc->mp));
ASSERT(sc->ip != NULL);
+ ASSERT(oinfo == &XFS_RMAP_OINFO_COW);
+ xreap_configure_rgcow_limits(&rs);
error = xrtb_bitmap_walk(bitmap, xreap_rtmeta_extent, &rs);
if (error)
return error;
- if (xreap_dirty(&rs))
+ if (xreap_is_dirty(&rs))
return xrep_defer_finish(sc);
return 0;
@@ -929,13 +1204,13 @@ xrep_reap_metadir_fsblocks(
ASSERT(sc->ip != NULL);
ASSERT(xfs_is_metadir_inode(sc->ip));
+ xreap_configure_agextent_limits(&rs);
xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK);
-
error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
if (error)
return error;
- if (xreap_dirty(&rs)) {
+ if (xreap_is_dirty(&rs)) {
error = xrep_defer_finish(sc);
if (error)
return error;
@@ -955,13 +1230,12 @@ xrep_reap_metadir_fsblocks(
*/
STATIC int
xreap_bmapi_select(
- struct xfs_scrub *sc,
- struct xfs_inode *ip,
- int whichfork,
+ struct xreap_state *rs,
struct xfs_bmbt_irec *imap,
bool *crosslinked)
{
struct xfs_owner_info oinfo;
+ struct xfs_scrub *sc = rs->sc;
struct xfs_btree_cur *cur;
xfs_filblks_t len = 1;
xfs_agblock_t bno;
@@ -975,7 +1249,8 @@ xreap_bmapi_select(
cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
sc->sa.pag);
- xfs_rmap_ino_owner(&oinfo, ip->i_ino, whichfork, imap->br_startoff);
+ xfs_rmap_ino_owner(&oinfo, rs->ip->i_ino, rs->whichfork,
+ imap->br_startoff);
error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked);
if (error)
goto out_cur;
@@ -1038,21 +1313,19 @@ xreap_buf_loggable(
*/
STATIC int
xreap_bmapi_binval(
- struct xfs_scrub *sc,
- struct xfs_inode *ip,
- int whichfork,
+ struct xreap_state *rs,
struct xfs_bmbt_irec *imap)
{
+ struct xfs_scrub *sc = rs->sc;
struct xfs_mount *mp = sc->mp;
struct xfs_perag *pag = sc->sa.pag;
- int bmap_flags = xfs_bmapi_aflag(whichfork);
+ int bmap_flags = xfs_bmapi_aflag(rs->whichfork);
xfs_fileoff_t off;
xfs_fileoff_t max_off;
xfs_extlen_t scan_blocks;
xfs_agblock_t bno;
xfs_agblock_t agbno;
xfs_agblock_t agbno_next;
- unsigned int invalidated = 0;
int error;
/*
@@ -1079,7 +1352,7 @@ xreap_bmapi_binval(
struct xfs_bmbt_irec hmap;
int nhmaps = 1;
- error = xfs_bmapi_read(ip, off, max_off - off, &hmap,
+ error = xfs_bmapi_read(rs->ip, off, max_off - off, &hmap,
&nhmaps, bmap_flags);
if (error)
return error;
@@ -1120,14 +1393,13 @@ xreap_bmapi_binval(
xfs_buf_stale(bp);
xfs_buf_relse(bp);
}
- invalidated++;
/*
* Stop invalidating if we've hit the limit; we should
* still have enough reservation left to free however
- * much of the mapping we've seen so far.
+ * far we've gotten.
*/
- if (invalidated > XREAP_MAX_BINVAL) {
+ if (!xreap_inc_binval(rs)) {
imap->br_blockcount = agbno_next - bno;
goto out;
}
@@ -1149,12 +1421,11 @@ out:
*/
STATIC int
xrep_reap_bmapi_iter(
- struct xfs_scrub *sc,
- struct xfs_inode *ip,
- int whichfork,
+ struct xreap_state *rs,
struct xfs_bmbt_irec *imap,
bool crosslinked)
{
+ struct xfs_scrub *sc = rs->sc;
int error;
if (crosslinked) {
@@ -1171,14 +1442,14 @@ xrep_reap_bmapi_iter(
imap->br_blockcount);
/*
- * Schedule removal of the mapping from the fork. We use
+ * t0: Schedule removal of the mapping from the fork. We use
* deferred log intents in this function to control the exact
* sequence of metadata updates.
*/
- xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
- xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
+ xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap);
+ xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT,
-(int64_t)imap->br_blockcount);
- xfs_rmap_unmap_extent(sc->tp, ip, whichfork, imap);
+ xfs_rmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap);
return 0;
}
@@ -1199,41 +1470,139 @@ xrep_reap_bmapi_iter(
* transaction is full of logged buffer invalidations, so we need to
* return early so that we can roll and retry.
*/
- error = xreap_bmapi_binval(sc, ip, whichfork, imap);
+ error = xreap_bmapi_binval(rs, imap);
if (error || imap->br_blockcount == 0)
return error;
/*
- * Schedule removal of the mapping from the fork. We use deferred log
- * intents in this function to control the exact sequence of metadata
+ * t1: Schedule removal of the mapping from the fork. We use deferred
+ * work in this function to control the exact sequence of metadata
* updates.
*/
- xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
- xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
+ xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap);
+ xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT,
-(int64_t)imap->br_blockcount);
return xfs_free_extent_later(sc->tp, imap->br_startblock,
imap->br_blockcount, NULL, XFS_AG_RESV_NONE,
XFS_FREE_EXTENT_SKIP_DISCARD);
}
+/* Compute the maximum mapcount of a file buffer. */
+static unsigned int
+xreap_bmapi_binval_mapcount(
+ struct xfs_scrub *sc)
+{
+ /* directory blocks can span multiple fsblocks and be discontiguous */
+ if (sc->sm->sm_type == XFS_SCRUB_TYPE_DIR)
+ return sc->mp->m_dir_geo->fsbcount;
+
+ /* all other file xattr/symlink blocks must be contiguous */
+ return 1;
+}
+
+/* Compute the maximum block size of a file buffer. */
+static unsigned int
+xreap_bmapi_binval_blocksize(
+ struct xfs_scrub *sc)
+{
+ switch (sc->sm->sm_type) {
+ case XFS_SCRUB_TYPE_DIR:
+ return sc->mp->m_dir_geo->blksize;
+ case XFS_SCRUB_TYPE_XATTR:
+ case XFS_SCRUB_TYPE_PARENT:
+ /*
+ * The xattr structure itself consists of single fsblocks, but
+ * there could be remote xattr blocks to invalidate.
+ */
+ return XFS_XATTR_SIZE_MAX;
+ }
+
+ /* everything else is a single block */
+ return sc->mp->m_sb.sb_blocksize;
+}
+
+/*
+ * Compute the maximum number of buffer invalidations that we can do while
+ * reaping a single extent from a file fork.
+ */
+STATIC void
+xreap_configure_bmapi_limits(
+ struct xreap_state *rs)
+{
+ struct xfs_scrub *sc = rs->sc;
+ struct xfs_mount *mp = sc->mp;
+
+ /* overhead of invalidating a buffer */
+ const unsigned int per_binval =
+ xfs_buf_inval_log_space(xreap_bmapi_binval_mapcount(sc),
+ xreap_bmapi_binval_blocksize(sc));
+
+ /*
+ * In the worst case, relogging an intent item causes both an intent
+ * item and a done item to be attached to a transaction for each extent
+ * that we'd like to process.
+ */
+ const unsigned int efi = xfs_efi_log_space(1) +
+ xfs_efd_log_space(1);
+ const unsigned int rui = xfs_rui_log_space(1) +
+ xfs_rud_log_space();
+ const unsigned int bui = xfs_bui_log_space(1) +
+ xfs_bud_log_space();
+
+ /*
+ * t1: Unmapping crosslinked file data blocks: one bmap deletion,
+ * possibly an EFI for underfilled bmbt blocks, and an rmap deletion.
+ *
+ * t2: Freeing file data blocks: one bmap deletion, possibly an
+ * EFI for underfilled bmbt blocks, and another EFI for the space
+ * itself.
+ */
+ const unsigned int t1 = (bui + efi) + rui;
+ const unsigned int t2 = (bui + efi) + efi;
+ const unsigned int per_intent = max(t1, t2);
+
+ /*
+ * For each transaction in a reap chain, we must be able to take one
+ * step in the defer item chain, which should only consist of BUI, EFI,
+ * or RUI items.
+ */
+ const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1);
+ const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1);
+ const unsigned int f3 = xfs_calc_finish_bui_reservation(mp, 1);
+ const unsigned int step_size = max3(f1, f2, f3);
+
+ /*
+ * Each call to xreap_ifork_extent starts with a clean transaction and
+ * operates on a single mapping by creating a chain of log intent items
+ * for that mapping. We need to leave enough reservation in the
+ * transaction to log btree buffer and inode updates for each step in
+ * the chain, and to relog the log intents.
+ */
+ const unsigned int per_extent_res = per_intent + step_size;
+
+ xreap_configure_limits(rs, per_extent_res, per_binval, 0, per_binval);
+
+ trace_xreap_bmapi_limits(sc->tp, per_binval, rs->max_binval,
+ step_size, per_intent, 1);
+}
+
/*
* Dispose of as much of this file extent as we can. Upon successful return,
* the imap will reflect the mapping that was removed from the fork.
*/
STATIC int
xreap_ifork_extent(
- struct xfs_scrub *sc,
- struct xfs_inode *ip,
- int whichfork,
+ struct xreap_state *rs,
struct xfs_bmbt_irec *imap)
{
+ struct xfs_scrub *sc = rs->sc;
xfs_agnumber_t agno;
bool crosslinked;
int error;
ASSERT(sc->sa.pag == NULL);
- trace_xreap_ifork_extent(sc, ip, whichfork, imap);
+ trace_xreap_ifork_extent(sc, rs->ip, rs->whichfork, imap);
agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock);
sc->sa.pag = xfs_perag_get(sc->mp, agno);
@@ -1248,11 +1617,11 @@ xreap_ifork_extent(
* Decide the fate of the blocks at the beginning of the mapping, then
* update the mapping to use it with the unmap calls.
*/
- error = xreap_bmapi_select(sc, ip, whichfork, imap, &crosslinked);
+ error = xreap_bmapi_select(rs, imap, &crosslinked);
if (error)
goto out_agf;
- error = xrep_reap_bmapi_iter(sc, ip, whichfork, imap, crosslinked);
+ error = xrep_reap_bmapi_iter(rs, imap, crosslinked);
if (error)
goto out_agf;
@@ -1276,6 +1645,11 @@ xrep_reap_ifork(
struct xfs_inode *ip,
int whichfork)
{
+ struct xreap_state rs = {
+ .sc = sc,
+ .ip = ip,
+ .whichfork = whichfork,
+ };
xfs_fileoff_t off = 0;
int bmap_flags = xfs_bmapi_aflag(whichfork);
int error;
@@ -1284,6 +1658,7 @@ xrep_reap_ifork(
ASSERT(ip == sc->ip || ip == sc->tempip);
ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip));
+ xreap_configure_bmapi_limits(&rs);
while (off < XFS_MAX_FILEOFF) {
struct xfs_bmbt_irec imap;
int nimaps = 1;
@@ -1303,13 +1678,14 @@ xrep_reap_ifork(
* can in a single transaction.
*/
if (xfs_bmap_is_real_extent(&imap)) {
- error = xreap_ifork_extent(sc, ip, whichfork, &imap);
+ error = xreap_ifork_extent(&rs, &imap);
if (error)
return error;
error = xfs_defer_finish(&sc->tp);
if (error)
return error;
+ xreap_defer_finish_reset(&rs);
}
off = imap.br_startoff + imap.br_blockcount;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index f8f9ed30f56b..efd5a7ccdf62 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -1110,7 +1110,7 @@ xrep_will_attempt(
return true;
/* Let debug users force us into the repair routines. */
- if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
+ if (XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
return true;
/* Metadata is corrupt or failed cross-referencing. */
@@ -1269,42 +1269,6 @@ xrep_setup_xfbtree(
}
/*
- * Create a dummy transaction for use in a live update hook function. This
- * function MUST NOT be called from regular repair code because the current
- * process' transaction is saved via the cookie.
- */
-int
-xrep_trans_alloc_hook_dummy(
- struct xfs_mount *mp,
- void **cookiep,
- struct xfs_trans **tpp)
-{
- int error;
-
- *cookiep = current->journal_info;
- current->journal_info = NULL;
-
- error = xfs_trans_alloc_empty(mp, tpp);
- if (!error)
- return 0;
-
- current->journal_info = *cookiep;
- *cookiep = NULL;
- return error;
-}
-
-/* Cancel a dummy transaction used by a live update hook function. */
-void
-xrep_trans_cancel_hook_dummy(
- void **cookiep,
- struct xfs_trans *tp)
-{
- xfs_trans_cancel(tp);
- current->journal_info = *cookiep;
- *cookiep = NULL;
-}
-
-/*
* See if this buffer can pass the given ->verify_struct() function.
*
* If the buffer already has ops attached and they're not the ones that were
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index af0a3a9e5ed9..2bb125c4f9bf 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -18,14 +18,6 @@ static inline int xrep_notsupported(struct xfs_scrub *sc)
#ifdef CONFIG_XFS_ONLINE_REPAIR
-/*
- * This is the maximum number of deferred extent freeing item extents (EFIs)
- * that we'll attach to a transaction without rolling the transaction to avoid
- * overrunning a tr_itruncate reservation.
- */
-#define XREP_MAX_ITRUNCATE_EFIS (128)
-
-
/* Repair helpers */
int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run);
@@ -180,10 +172,6 @@ int xrep_quotacheck(struct xfs_scrub *sc);
int xrep_reinit_pagf(struct xfs_scrub *sc);
int xrep_reinit_pagi(struct xfs_scrub *sc);
-int xrep_trans_alloc_hook_dummy(struct xfs_mount *mp, void **cookiep,
- struct xfs_trans **tpp);
-void xrep_trans_cancel_hook_dummy(void **cookiep, struct xfs_trans *tp);
-
bool xrep_buf_verify_struct(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
void xrep_inode_set_nblocks(struct xfs_scrub *sc, int64_t new_blocks);
int xrep_reset_metafile_resv(struct xfs_scrub *sc);
diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c
index f5f73078ffe2..17d4a38d735c 100644
--- a/fs/xfs/scrub/rmap_repair.c
+++ b/fs/xfs/scrub/rmap_repair.c
@@ -951,9 +951,7 @@ end_agscan:
sa->agf_bp = NULL;
sa->agi_bp = NULL;
xchk_trans_cancel(sc);
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
/* Iterate all AGs for inodes rmaps. */
while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
@@ -1612,7 +1610,6 @@ xrep_rmapbt_live_update(
struct xfs_mount *mp;
struct xfs_btree_cur *mcur;
struct xfs_trans *tp;
- void *txcookie;
int error;
rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb);
@@ -1623,9 +1620,7 @@ xrep_rmapbt_live_update(
trace_xrep_rmap_live_update(pag_group(rr->sc->sa.pag), action, p);
- error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp);
- if (error)
- goto out_abort;
+ tp = xfs_trans_alloc_empty(mp);
mutex_lock(&rr->lock);
mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree);
@@ -1639,14 +1634,13 @@ xrep_rmapbt_live_update(
if (error)
goto out_cancel;
- xrep_trans_cancel_hook_dummy(&txcookie, tp);
+ xfs_trans_cancel(tp);
mutex_unlock(&rr->lock);
return NOTIFY_DONE;
out_cancel:
xfbtree_trans_cancel(&rr->rmap_btree, tp);
- xrep_trans_cancel_hook_dummy(&txcookie, tp);
-out_abort:
+ xfs_trans_cancel(tp);
mutex_unlock(&rr->lock);
xchk_iscan_abort(&rr->iscan);
out_unlock:
diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c
index fc2592c53af5..7561941a337a 100644
--- a/fs/xfs/scrub/rtrmap_repair.c
+++ b/fs/xfs/scrub/rtrmap_repair.c
@@ -580,9 +580,7 @@ xrep_rtrmap_find_rmaps(
*/
xchk_trans_cancel(sc);
xchk_rtgroup_unlock(&sc->sr);
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
error = xrep_rtrmap_scan_inode(rr, ip);
@@ -846,7 +844,6 @@ xrep_rtrmapbt_live_update(
struct xfs_mount *mp;
struct xfs_btree_cur *mcur;
struct xfs_trans *tp;
- void *txcookie;
int error;
rr = container_of(nb, struct xrep_rtrmap, rhook.rmap_hook.nb);
@@ -857,9 +854,7 @@ xrep_rtrmapbt_live_update(
trace_xrep_rmap_live_update(rtg_group(rr->sc->sr.rtg), action, p);
- error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp);
- if (error)
- goto out_abort;
+ tp = xfs_trans_alloc_empty(mp);
mutex_lock(&rr->lock);
mcur = xfs_rtrmapbt_mem_cursor(rr->sc->sr.rtg, tp, &rr->rtrmap_btree);
@@ -873,14 +868,13 @@ xrep_rtrmapbt_live_update(
if (error)
goto out_cancel;
- xrep_trans_cancel_hook_dummy(&txcookie, tp);
+ xfs_trans_cancel(tp);
mutex_unlock(&rr->lock);
return NOTIFY_DONE;
out_cancel:
xfbtree_trans_cancel(&rr->rtrmap_btree, tp);
- xrep_trans_cancel_hook_dummy(&txcookie, tp);
-out_abort:
+ xfs_trans_cancel(tp);
xchk_iscan_abort(&rr->iscan);
mutex_unlock(&rr->lock);
out_unlock:
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 9908850bf76f..3c3b0d25006f 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -680,8 +680,6 @@ xfs_scrub_metadata(
if (error)
goto out;
- xfs_warn_experimental(mp, XFS_EXPERIMENTAL_SCRUB);
-
sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS);
if (!sc) {
error = -ENOMEM;
@@ -878,10 +876,7 @@ xchk_scrubv_open_by_handle(
struct xfs_inode *ip;
int error;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return NULL;
-
+ tp = xfs_trans_alloc_empty(mp);
error = xfs_iget(mp, tp, head->svh_ino, XCHK_IGET_FLAGS, 0, &ip);
xfs_trans_cancel(tp);
if (error)
diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c
index 953ce7be78dc..df629892462f 100644
--- a/fs/xfs/scrub/symlink_repair.c
+++ b/fs/xfs/scrub/symlink_repair.c
@@ -184,8 +184,8 @@ xrep_symlink_salvage_inline(
sc->ip->i_disk_size == 1 && old_target[0] == '?')
return 0;
- nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip));
- strncpy(target_buf, ifp->if_data, nr);
+ nr = min(XFS_SYMLINK_MAXLEN, ifp->if_bytes);
+ memcpy(target_buf, ifp->if_data, nr);
return nr;
}
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 2450e214103f..987313a52e64 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -22,6 +22,7 @@
#include "xfs_parent.h"
#include "xfs_metafile.h"
#include "xfs_rtgroup.h"
+#include "xfs_trans.h"
#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index d7c4ced47c15..39ea651cbb75 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -479,7 +479,7 @@ DECLARE_EVENT_CLASS(xchk_dqiter_class,
__field(xfs_exntst_t, state)
),
TP_fast_assign(
- __entry->dev = cursor->sc->ip->i_mount->m_super->s_dev;
+ __entry->dev = cursor->sc->mp->m_super->s_dev;
__entry->dqtype = cursor->dqtype;
__entry->ino = cursor->quota_ip->i_ino;
__entry->cur_id = cursor->id;
@@ -2000,6 +2000,51 @@ DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval);
DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval);
DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert);
+DECLARE_EVENT_CLASS(xrep_reap_limits_class,
+ TP_PROTO(const struct xfs_trans *tp, unsigned int per_binval,
+ unsigned int max_binval, unsigned int step_size,
+ unsigned int per_intent,
+ unsigned int max_deferred),
+ TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, log_res)
+ __field(unsigned int, per_binval)
+ __field(unsigned int, max_binval)
+ __field(unsigned int, step_size)
+ __field(unsigned int, per_intent)
+ __field(unsigned int, max_deferred)
+ ),
+ TP_fast_assign(
+ __entry->dev = tp->t_mountp->m_super->s_dev;
+ __entry->log_res = tp->t_log_res;
+ __entry->per_binval = per_binval;
+ __entry->max_binval = max_binval;
+ __entry->step_size = step_size;
+ __entry->per_intent = per_intent;
+ __entry->max_deferred = max_deferred;
+ ),
+ TP_printk("dev %d:%d logres %u per_binval %u max_binval %u step_size %u per_intent %u max_deferred %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->log_res,
+ __entry->per_binval,
+ __entry->max_binval,
+ __entry->step_size,
+ __entry->per_intent,
+ __entry->max_deferred)
+);
+#define DEFINE_REPAIR_REAP_LIMITS_EVENT(name) \
+DEFINE_EVENT(xrep_reap_limits_class, name, \
+ TP_PROTO(const struct xfs_trans *tp, unsigned int per_binval, \
+ unsigned int max_binval, unsigned int step_size, \
+ unsigned int per_intent, \
+ unsigned int max_deferred), \
+ TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred))
+DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agextent_limits);
+DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agcow_limits);
+DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_rgcow_limits);
+DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_bmapi_limits);
+
DECLARE_EVENT_CLASS(xrep_reap_find_class,
TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno,
xfs_extlen_t len, bool crosslinked),
@@ -2996,7 +3041,7 @@ DEFINE_EVENT(xrep_pptr_salvage_class, name, \
DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_salvage_pptr);
DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_insert_pptr);
-TRACE_EVENT(xrep_xattr_class,
+DECLARE_EVENT_CLASS(xrep_xattr_class,
TP_PROTO(struct xfs_inode *ip, struct xfs_inode *arg_ip),
TP_ARGS(ip, arg_ip),
TP_STRUCT__entry(
diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c
index cdd13ed9c569..ed2e8c64b1a8 100644
--- a/fs/xfs/scrub/xfarray.c
+++ b/fs/xfs/scrub/xfarray.c
@@ -834,7 +834,7 @@ xfarray_sort_scan(
si->first_folio_idx = xfarray_idx(si->array,
folio_pos(si->folio) + si->array->obj_size - 1);
- next_pos = folio_pos(si->folio) + folio_size(si->folio);
+ next_pos = folio_next_pos(si->folio);
si->last_folio_idx = xfarray_idx(si->array, next_pos - 1);
if (xfarray_pos(si->array, si->last_folio_idx + 1) > next_pos)
si->last_folio_idx--;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 26a04a783489..56a544638491 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -234,6 +234,47 @@ xfs_end_bio(
}
/*
+ * We cannot cancel the ioend directly on error. We may have already set other
+ * pages under writeback and hence we have to run I/O completion to mark the
+ * error state of the pages under writeback appropriately.
+ *
+ * If the folio has delalloc blocks on it, the caller is asking us to punch them
+ * out. If we don't, we can leave a stale delalloc mapping covered by a clean
+ * page that needs to be dirtied again before the delalloc mapping can be
+ * converted. This stale delalloc mapping can trip up a later direct I/O read
+ * operation on the same region.
+ *
+ * We prevent this by truncating away the delalloc regions on the folio. Because
+ * they are delalloc, we can do this without needing a transaction. Indeed - if
+ * we get ENOSPC errors, we have to be able to do this truncation without a
+ * transaction as there is no space left for block reservation (typically why
+ * we see an ENOSPC in writeback).
+ */
+static void
+xfs_discard_folio(
+ struct folio *folio,
+ loff_t pos)
+{
+ struct xfs_inode *ip = XFS_I(folio->mapping->host);
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (xfs_is_shutdown(mp))
+ return;
+
+ xfs_alert_ratelimited(mp,
+ "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
+ folio, ip->i_ino, pos);
+
+ /*
+ * The end of the punch range is always the offset of the first
+ * byte of the next folio. Hence the end offset is only dependent on the
+ * folio itself and not the start offset that is passed in.
+ */
+ xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
+ folio_next_pos(folio), NULL);
+}
+
+/*
* Fast revalidation of the cached writeback mapping. Return true if the current
* mapping is valid, false otherwise.
*/
@@ -278,13 +319,12 @@ xfs_imap_valid(
static int
xfs_map_blocks(
struct iomap_writepage_ctx *wpc,
- struct inode *inode,
loff_t offset,
unsigned int len)
{
- struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_inode *ip = XFS_I(wpc->inode);
struct xfs_mount *mp = ip->i_mount;
- ssize_t count = i_blocksize(inode);
+ ssize_t count = i_blocksize(wpc->inode);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
xfs_fileoff_t cow_fsb;
@@ -436,81 +476,78 @@ allocate_blocks:
return 0;
}
-static int
-xfs_submit_ioend(
+static ssize_t
+xfs_writeback_range(
struct iomap_writepage_ctx *wpc,
- int status)
+ struct folio *folio,
+ u64 offset,
+ unsigned int len,
+ u64 end_pos)
{
- struct iomap_ioend *ioend = wpc->ioend;
- unsigned int nofs_flag;
-
- /*
- * We can allocate memory here while doing writeback on behalf of
- * memory reclaim. To avoid memory allocation deadlocks set the
- * task-wide nofs context for the following operations.
- */
- nofs_flag = memalloc_nofs_save();
+ ssize_t ret;
+
+ ret = xfs_map_blocks(wpc, offset, len);
+ if (!ret)
+ ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
+ if (ret < 0)
+ xfs_discard_folio(folio, offset);
+ return ret;
+}
- /* Convert CoW extents to regular */
- if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
- status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
- ioend->io_offset, ioend->io_size);
- }
+static bool
+xfs_ioend_needs_wq_completion(
+ struct iomap_ioend *ioend)
+{
+ /* Changing inode size requires a transaction. */
+ if (xfs_ioend_is_append(ioend))
+ return true;
- memalloc_nofs_restore(nofs_flag);
+ /* Extent manipulation requires a transaction. */
+ if (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))
+ return true;
- /* send ioends that might require a transaction to the completion wq */
- if (xfs_ioend_is_append(ioend) ||
- (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED)))
- ioend->io_bio.bi_end_io = xfs_end_bio;
+ /* Page cache invalidation cannot be done in irq context. */
+ if (ioend->io_flags & IOMAP_IOEND_DONTCACHE)
+ return true;
- if (status)
- return status;
- submit_bio(&ioend->io_bio);
- return 0;
+ return false;
}
-/*
- * If the folio has delalloc blocks on it, the caller is asking us to punch them
- * out. If we don't, we can leave a stale delalloc mapping covered by a clean
- * page that needs to be dirtied again before the delalloc mapping can be
- * converted. This stale delalloc mapping can trip up a later direct I/O read
- * operation on the same region.
- *
- * We prevent this by truncating away the delalloc regions on the folio. Because
- * they are delalloc, we can do this without needing a transaction. Indeed - if
- * we get ENOSPC errors, we have to be able to do this truncation without a
- * transaction as there is no space left for block reservation (typically why
- * we see a ENOSPC in writeback).
- */
-static void
-xfs_discard_folio(
- struct folio *folio,
- loff_t pos)
+static int
+xfs_writeback_submit(
+ struct iomap_writepage_ctx *wpc,
+ int error)
{
- struct xfs_inode *ip = XFS_I(folio->mapping->host);
- struct xfs_mount *mp = ip->i_mount;
+ struct iomap_ioend *ioend = wpc->wb_ctx;
- if (xfs_is_shutdown(mp))
- return;
+ /*
+ * Convert CoW extents to regular.
+ *
+ * We can allocate memory here while doing writeback on behalf of memory
+ * reclaim. To avoid memory allocation deadlocks, set the task-wide
+ * nofs context.
+ */
+ if (!error && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
+ unsigned int nofs_flag;
- xfs_alert_ratelimited(mp,
- "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
- folio, ip->i_ino, pos);
+ nofs_flag = memalloc_nofs_save();
+ error = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
+ ioend->io_offset, ioend->io_size);
+ memalloc_nofs_restore(nofs_flag);
+ }
/*
- * The end of the punch range is always the offset of the first
- * byte of the next folio. Hence the end offset is only dependent on the
- * folio itself and not the start offset that is passed in.
+ * Send ioends that might require a transaction to the completion wq.
*/
- xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
- folio_pos(folio) + folio_size(folio), NULL);
+ if (xfs_ioend_needs_wq_completion(ioend))
+ ioend->io_bio.bi_end_io = xfs_end_bio;
+
+ return iomap_ioend_writeback_submit(wpc, error);
}
static const struct iomap_writeback_ops xfs_writeback_ops = {
- .map_blocks = xfs_map_blocks,
- .submit_ioend = xfs_submit_ioend,
- .discard_folio = xfs_discard_folio,
+ .writeback_range = xfs_writeback_range,
+ .writeback_submit = xfs_writeback_submit,
};
struct xfs_zoned_writepage_ctx {
@@ -527,11 +564,10 @@ XFS_ZWPC(struct iomap_writepage_ctx *ctx)
static int
xfs_zoned_map_blocks(
struct iomap_writepage_ctx *wpc,
- struct inode *inode,
loff_t offset,
unsigned int len)
{
- struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_inode *ip = XFS_I(wpc->inode);
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + len);
@@ -590,22 +626,44 @@ xfs_zoned_map_blocks(
return 0;
}
-static int
-xfs_zoned_submit_ioend(
+static ssize_t
+xfs_zoned_writeback_range(
struct iomap_writepage_ctx *wpc,
- int status)
+ struct folio *folio,
+ u64 offset,
+ unsigned int len,
+ u64 end_pos)
{
- wpc->ioend->io_bio.bi_end_io = xfs_end_bio;
- if (status)
- return status;
- xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->open_zone);
+ ssize_t ret;
+
+ ret = xfs_zoned_map_blocks(wpc, offset, len);
+ if (!ret)
+ ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
+ if (ret < 0)
+ xfs_discard_folio(folio, offset);
+ return ret;
+}
+
+static int
+xfs_zoned_writeback_submit(
+ struct iomap_writepage_ctx *wpc,
+ int error)
+{
+ struct iomap_ioend *ioend = wpc->wb_ctx;
+
+ ioend->io_bio.bi_end_io = xfs_end_bio;
+ if (error) {
+ ioend->io_bio.bi_status = errno_to_blk_status(error);
+ bio_endio(&ioend->io_bio);
+ return error;
+ }
+ xfs_zone_alloc_and_submit(ioend, &XFS_ZWPC(wpc)->open_zone);
return 0;
}
static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
- .map_blocks = xfs_zoned_map_blocks,
- .submit_ioend = xfs_zoned_submit_ioend,
- .discard_folio = xfs_discard_folio,
+ .writeback_range = xfs_zoned_writeback_range,
+ .writeback_submit = xfs_zoned_writeback_submit,
};
STATIC int
@@ -618,19 +676,29 @@ xfs_vm_writepages(
xfs_iflags_clear(ip, XFS_ITRUNCATED);
if (xfs_is_zoned_inode(ip)) {
- struct xfs_zoned_writepage_ctx xc = { };
+ struct xfs_zoned_writepage_ctx xc = {
+ .ctx = {
+ .inode = mapping->host,
+ .wbc = wbc,
+ .ops = &xfs_zoned_writeback_ops
+ },
+ };
int error;
- error = iomap_writepages(mapping, wbc, &xc.ctx,
- &xfs_zoned_writeback_ops);
+ error = iomap_writepages(&xc.ctx);
if (xc.open_zone)
xfs_open_zone_put(xc.open_zone);
return error;
} else {
- struct xfs_writepage_ctx wpc = { };
-
- return iomap_writepages(mapping, wbc, &wpc.ctx,
- &xfs_writeback_ops);
+ struct xfs_writepage_ctx wpc = {
+ .ctx = {
+ .inode = mapping->host,
+ .wbc = wbc,
+ .ops = &xfs_writeback_ops
+ },
+ };
+
+ return iomap_writepages(&wpc.ctx);
}
}
@@ -674,14 +742,15 @@ xfs_vm_read_folio(
struct file *unused,
struct folio *folio)
{
- return iomap_read_folio(folio, &xfs_read_iomap_ops);
+ iomap_bio_read_folio(folio, &xfs_read_iomap_ops);
+ return 0;
}
STATIC void
xfs_vm_readahead(
struct readahead_control *rac)
{
- iomap_readahead(rac, &xfs_read_iomap_ops);
+ iomap_bio_readahead(rac, &xfs_read_iomap_ops);
}
static int
@@ -692,6 +761,9 @@ xfs_vm_swap_activate(
{
struct xfs_inode *ip = XFS_I(file_inode(swap_file));
+ if (xfs_is_zoned_inode(ip))
+ return -EINVAL;
+
/*
* Swap file activation can race against concurrent shared extent
* removal in files that have been cloned. If this happens,
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index f683b7a9323f..e8fa326ac995 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -91,41 +91,37 @@ xfs_attri_log_nameval_alloc(
name_len + new_name_len + value_len +
new_value_len);
- nv->name.i_addr = nv + 1;
- nv->name.i_len = name_len;
- nv->name.i_type = XLOG_REG_TYPE_ATTR_NAME;
- memcpy(nv->name.i_addr, name, name_len);
+ nv->name.iov_base = nv + 1;
+ nv->name.iov_len = name_len;
+ memcpy(nv->name.iov_base, name, name_len);
if (new_name_len) {
- nv->new_name.i_addr = nv->name.i_addr + name_len;
- nv->new_name.i_len = new_name_len;
- memcpy(nv->new_name.i_addr, new_name, new_name_len);
+ nv->new_name.iov_base = nv->name.iov_base + name_len;
+ nv->new_name.iov_len = new_name_len;
+ memcpy(nv->new_name.iov_base, new_name, new_name_len);
} else {
- nv->new_name.i_addr = NULL;
- nv->new_name.i_len = 0;
+ nv->new_name.iov_base = NULL;
+ nv->new_name.iov_len = 0;
}
- nv->new_name.i_type = XLOG_REG_TYPE_ATTR_NEWNAME;
if (value_len) {
- nv->value.i_addr = nv->name.i_addr + name_len + new_name_len;
- nv->value.i_len = value_len;
- memcpy(nv->value.i_addr, value, value_len);
+ nv->value.iov_base = nv->name.iov_base + name_len + new_name_len;
+ nv->value.iov_len = value_len;
+ memcpy(nv->value.iov_base, value, value_len);
} else {
- nv->value.i_addr = NULL;
- nv->value.i_len = 0;
+ nv->value.iov_base = NULL;
+ nv->value.iov_len = 0;
}
- nv->value.i_type = XLOG_REG_TYPE_ATTR_VALUE;
if (new_value_len) {
- nv->new_value.i_addr = nv->name.i_addr + name_len +
+ nv->new_value.iov_base = nv->name.iov_base + name_len +
new_name_len + value_len;
- nv->new_value.i_len = new_value_len;
- memcpy(nv->new_value.i_addr, new_value, new_value_len);
+ nv->new_value.iov_len = new_value_len;
+ memcpy(nv->new_value.iov_base, new_value, new_value_len);
} else {
- nv->new_value.i_addr = NULL;
- nv->new_value.i_len = 0;
+ nv->new_value.iov_base = NULL;
+ nv->new_value.iov_len = 0;
}
- nv->new_value.i_type = XLOG_REG_TYPE_ATTR_NEWVALUE;
refcount_set(&nv->refcount, 1);
return nv;
@@ -170,21 +166,21 @@ xfs_attri_item_size(
*nvecs += 2;
*nbytes += sizeof(struct xfs_attri_log_format) +
- xlog_calc_iovec_len(nv->name.i_len);
+ xlog_calc_iovec_len(nv->name.iov_len);
- if (nv->new_name.i_len) {
+ if (nv->new_name.iov_len) {
*nvecs += 1;
- *nbytes += xlog_calc_iovec_len(nv->new_name.i_len);
+ *nbytes += xlog_calc_iovec_len(nv->new_name.iov_len);
}
- if (nv->value.i_len) {
+ if (nv->value.iov_len) {
*nvecs += 1;
- *nbytes += xlog_calc_iovec_len(nv->value.i_len);
+ *nbytes += xlog_calc_iovec_len(nv->value.iov_len);
}
- if (nv->new_value.i_len) {
+ if (nv->new_value.iov_len) {
*nvecs += 1;
- *nbytes += xlog_calc_iovec_len(nv->new_value.i_len);
+ *nbytes += xlog_calc_iovec_len(nv->new_value.iov_len);
}
}
@@ -212,31 +208,36 @@ xfs_attri_item_format(
* the log recovery.
*/
- ASSERT(nv->name.i_len > 0);
+ ASSERT(nv->name.iov_len > 0);
attrip->attri_format.alfi_size++;
- if (nv->new_name.i_len > 0)
+ if (nv->new_name.iov_len > 0)
attrip->attri_format.alfi_size++;
- if (nv->value.i_len > 0)
+ if (nv->value.iov_len > 0)
attrip->attri_format.alfi_size++;
- if (nv->new_value.i_len > 0)
+ if (nv->new_value.iov_len > 0)
attrip->attri_format.alfi_size++;
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT,
&attrip->attri_format,
sizeof(struct xfs_attri_log_format));
- xlog_copy_from_iovec(lv, &vecp, &nv->name);
- if (nv->new_name.i_len > 0)
- xlog_copy_from_iovec(lv, &vecp, &nv->new_name);
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NAME, nv->name.iov_base,
+ nv->name.iov_len);
- if (nv->value.i_len > 0)
- xlog_copy_from_iovec(lv, &vecp, &nv->value);
+ if (nv->new_name.iov_len > 0)
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NEWNAME,
+ nv->new_name.iov_base, nv->new_name.iov_len);
- if (nv->new_value.i_len > 0)
- xlog_copy_from_iovec(lv, &vecp, &nv->new_value);
+ if (nv->value.iov_len > 0)
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_VALUE,
+ nv->value.iov_base, nv->value.iov_len);
+
+ if (nv->new_value.iov_len > 0)
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NEWVALUE,
+ nv->new_value.iov_base, nv->new_value.iov_len);
}
/*
@@ -383,22 +384,22 @@ xfs_attr_log_item(
attrp->alfi_ino = args->dp->i_ino;
ASSERT(!(attr->xattri_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK));
attrp->alfi_op_flags = attr->xattri_op_flags;
- attrp->alfi_value_len = nv->value.i_len;
+ attrp->alfi_value_len = nv->value.iov_len;
switch (xfs_attr_log_item_op(attrp)) {
case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE:
- ASSERT(nv->value.i_len == nv->new_value.i_len);
+ ASSERT(nv->value.iov_len == nv->new_value.iov_len);
attrp->alfi_igen = VFS_I(args->dp)->i_generation;
- attrp->alfi_old_name_len = nv->name.i_len;
- attrp->alfi_new_name_len = nv->new_name.i_len;
+ attrp->alfi_old_name_len = nv->name.iov_len;
+ attrp->alfi_new_name_len = nv->new_name.iov_len;
break;
case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE:
case XFS_ATTRI_OP_FLAGS_PPTR_SET:
attrp->alfi_igen = VFS_I(args->dp)->i_generation;
fallthrough;
default:
- attrp->alfi_name_len = nv->name.i_len;
+ attrp->alfi_name_len = nv->name.iov_len;
break;
}
@@ -490,7 +491,7 @@ xfs_attr_finish_item(
/* Reset trans after EAGAIN cycle since the transaction is new */
args->trans = tp;
- if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) {
+ if (XFS_TEST_ERROR(args->dp->i_mount, XFS_ERRTAG_LARP)) {
error = -EIO;
goto out;
}
@@ -616,10 +617,7 @@ xfs_attri_iread_extents(
struct xfs_trans *tp;
int error;
- error = xfs_trans_alloc_empty(ip->i_mount, &tp);
- if (error)
- return error;
-
+ tp = xfs_trans_alloc_empty(ip->i_mount);
xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_iread_extents(tp, ip, XFS_ATTR_FORK);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -690,14 +688,14 @@ xfs_attri_recover_work(
args->dp = ip;
args->geo = mp->m_attr_geo;
args->whichfork = XFS_ATTR_FORK;
- args->name = nv->name.i_addr;
- args->namelen = nv->name.i_len;
- args->new_name = nv->new_name.i_addr;
- args->new_namelen = nv->new_name.i_len;
- args->value = nv->value.i_addr;
- args->valuelen = nv->value.i_len;
- args->new_value = nv->new_value.i_addr;
- args->new_valuelen = nv->new_value.i_len;
+ args->name = nv->name.iov_base;
+ args->namelen = nv->name.iov_len;
+ args->new_name = nv->new_name.iov_base;
+ args->new_namelen = nv->new_name.iov_len;
+ args->value = nv->value.iov_base;
+ args->valuelen = nv->value.iov_len;
+ args->new_value = nv->new_value.iov_base;
+ args->new_valuelen = nv->new_value.iov_len;
args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK;
args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT |
XFS_DA_OP_LOGGED;
@@ -739,7 +737,7 @@ xfs_attr_recover_work(
struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
struct xfs_attr_intent *attr;
struct xfs_mount *mp = lip->li_log->l_mp;
- struct xfs_inode *ip;
+ struct xfs_inode *ip = NULL;
struct xfs_da_args *args;
struct xfs_trans *tp;
struct xfs_trans_res resv;
@@ -754,8 +752,8 @@ xfs_attr_recover_work(
*/
attrp = &attrip->attri_format;
if (!xfs_attri_validate(mp, attrp) ||
- !xfs_attr_namecheck(attrp->alfi_attr_filter, nv->name.i_addr,
- nv->name.i_len))
+ !xfs_attr_namecheck(attrp->alfi_attr_filter, nv->name.iov_base,
+ nv->name.iov_len))
return -EFSCORRUPTED;
attr = xfs_attri_recover_work(mp, dfp, attrp, &ip, nv);
@@ -953,50 +951,50 @@ static inline void *
xfs_attri_validate_name_iovec(
struct xfs_mount *mp,
struct xfs_attri_log_format *attri_formatp,
- const struct xfs_log_iovec *iovec,
+ const struct kvec *iovec,
unsigned int name_len)
{
- if (iovec->i_len != xlog_calc_iovec_len(name_len)) {
+ if (iovec->iov_len != xlog_calc_iovec_len(name_len)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
attri_formatp, sizeof(*attri_formatp));
return NULL;
}
- if (!xfs_attr_namecheck(attri_formatp->alfi_attr_filter, iovec->i_addr,
+ if (!xfs_attr_namecheck(attri_formatp->alfi_attr_filter, iovec->iov_base,
name_len)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
attri_formatp, sizeof(*attri_formatp));
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- iovec->i_addr, iovec->i_len);
+ iovec->iov_base, iovec->iov_len);
return NULL;
}
- return iovec->i_addr;
+ return iovec->iov_base;
}
static inline void *
xfs_attri_validate_value_iovec(
struct xfs_mount *mp,
struct xfs_attri_log_format *attri_formatp,
- const struct xfs_log_iovec *iovec,
+ const struct kvec *iovec,
unsigned int value_len)
{
- if (iovec->i_len != xlog_calc_iovec_len(value_len)) {
+ if (iovec->iov_len != xlog_calc_iovec_len(value_len)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
attri_formatp, sizeof(*attri_formatp));
return NULL;
}
if ((attri_formatp->alfi_attr_filter & XFS_ATTR_PARENT) &&
- !xfs_parent_valuecheck(mp, iovec->i_addr, value_len)) {
+ !xfs_parent_valuecheck(mp, iovec->iov_base, value_len)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
attri_formatp, sizeof(*attri_formatp));
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- iovec->i_addr, iovec->i_len);
+ iovec->iov_base, iovec->iov_len);
return NULL;
}
- return iovec->i_addr;
+ return iovec->iov_base;
}
STATIC int
@@ -1023,13 +1021,13 @@ xlog_recover_attri_commit_pass2(
/* Validate xfs_attri_log_format before the large memory allocation */
len = sizeof(struct xfs_attri_log_format);
- if (item->ri_buf[i].i_len != len) {
+ if (item->ri_buf[i].iov_len != len) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
- attri_formatp = item->ri_buf[i].i_addr;
+ attri_formatp = item->ri_buf[i].iov_base;
if (!xfs_attri_validate(mp, attri_formatp)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
attri_formatp, len);
@@ -1218,10 +1216,10 @@ xlog_recover_attrd_commit_pass2(
{
struct xfs_attrd_log_format *attrd_formatp;
- attrd_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_attrd_log_format)) {
+ attrd_formatp = item->ri_buf[0].iov_base;
+ if (item->ri_buf[0].iov_len != sizeof(struct xfs_attrd_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h
index e74128cbb722..d108a11b55ae 100644
--- a/fs/xfs/xfs_attr_item.h
+++ b/fs/xfs/xfs_attr_item.h
@@ -12,10 +12,10 @@ struct xfs_mount;
struct kmem_zone;
struct xfs_attri_log_nameval {
- struct xfs_log_iovec name;
- struct xfs_log_iovec new_name; /* PPTR_REPLACE only */
- struct xfs_log_iovec value;
- struct xfs_log_iovec new_value; /* PPTR_REPLACE only */
+ struct kvec name;
+ struct kvec new_name; /* PPTR_REPLACE only */
+ struct kvec value;
+ struct kvec new_value; /* PPTR_REPLACE only */
refcount_t refcount;
/* name and value follow the end of this struct */
diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c
index fe21c76f75b8..2a736d10eafb 100644
--- a/fs/xfs/xfs_bio_io.c
+++ b/fs/xfs/xfs_bio_io.c
@@ -18,42 +18,36 @@ xfs_rw_bdev(
enum req_op op)
{
- unsigned int is_vmalloc = is_vmalloc_addr(data);
- unsigned int left = count;
+ unsigned int done = 0, added;
int error;
struct bio *bio;
- if (is_vmalloc && op == REQ_OP_WRITE)
- flush_kernel_vmap_range(data, count);
+ op |= REQ_META | REQ_SYNC;
+ if (!is_vmalloc_addr(data))
+ return bdev_rw_virt(bdev, sector, data, count, op);
- bio = bio_alloc(bdev, bio_max_vecs(left), op | REQ_META | REQ_SYNC,
- GFP_KERNEL);
+ bio = bio_alloc(bdev, bio_max_vecs(count), op, GFP_KERNEL);
bio->bi_iter.bi_sector = sector;
do {
- struct page *page = kmem_to_page(data);
- unsigned int off = offset_in_page(data);
- unsigned int len = min_t(unsigned, left, PAGE_SIZE - off);
-
- while (bio_add_page(bio, page, len, off) != len) {
+ added = bio_add_vmalloc_chunk(bio, data + done, count - done);
+ if (!added) {
struct bio *prev = bio;
- bio = bio_alloc(prev->bi_bdev, bio_max_vecs(left),
+ bio = bio_alloc(prev->bi_bdev,
+ bio_max_vecs(count - done),
prev->bi_opf, GFP_KERNEL);
bio->bi_iter.bi_sector = bio_end_sector(prev);
bio_chain(prev, bio);
-
submit_bio(prev);
}
-
- data += len;
- left -= len;
- } while (left > 0);
+ done += added;
+ } while (done < count);
error = submit_bio_wait(bio);
bio_put(bio);
- if (is_vmalloc && op == REQ_OP_READ)
+ if (op == REQ_OP_READ)
invalidate_kernel_vmap_range(data, count);
return error;
}
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 3d52e9d7ad57..80f0c4bcc483 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -77,6 +77,11 @@ xfs_bui_item_size(
*nbytes += xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents);
}
+unsigned int xfs_bui_log_space(unsigned int nr)
+{
+ return xlog_item_space(1, xfs_bui_log_format_sizeof(nr));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given bui log item. We use only 1 iovec, and we point that
@@ -168,6 +173,11 @@ xfs_bud_item_size(
*nbytes += sizeof(struct xfs_bud_log_format);
}
+unsigned int xfs_bud_log_space(void)
+{
+ return xlog_item_space(1, sizeof(struct xfs_bud_log_format));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given bud log item. We use only 1 iovec, and we point that
@@ -644,24 +654,24 @@ xlog_recover_bui_commit_pass2(
struct xfs_bui_log_format *bui_formatp;
size_t len;
- bui_formatp = item->ri_buf[0].i_addr;
+ bui_formatp = item->ri_buf[0].iov_base;
- if (item->ri_buf[0].i_len < xfs_bui_log_format_sizeof(0)) {
+ if (item->ri_buf[0].iov_len < xfs_bui_log_format_sizeof(0)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
len = xfs_bui_log_format_sizeof(bui_formatp->bui_nextents);
- if (item->ri_buf[0].i_len != len) {
+ if (item->ri_buf[0].iov_len != len) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -695,10 +705,10 @@ xlog_recover_bud_commit_pass2(
{
struct xfs_bud_log_format *bud_formatp;
- bud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) {
+ bud_formatp = item->ri_buf[0].iov_base;
+ if (item->ri_buf[0].iov_len != sizeof(struct xfs_bud_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
index 6fee6a508343..b42fee06899d 100644
--- a/fs/xfs/xfs_bmap_item.h
+++ b/fs/xfs/xfs_bmap_item.h
@@ -72,4 +72,7 @@ struct xfs_bmap_intent;
void xfs_bmap_defer_add(struct xfs_trans *tp, struct xfs_bmap_intent *bi);
+unsigned int xfs_bui_log_space(unsigned int nr);
+unsigned int xfs_bud_log_space(void);
+
#endif /* __XFS_BMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 06ca11731e43..2208a720ec3f 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -514,7 +514,7 @@ xfs_can_free_eofblocks(
* Caller must either hold the exclusive io lock; or be inactivating
* the inode, which guarantees there are no other users of the inode.
*/
- if (!(VFS_I(ip)->i_state & I_FREEING))
+ if (!(inode_state_read_once(VFS_I(ip)) & I_FREEING))
xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
/* prealloc/delalloc exists only on regular files */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 8e7f1b324b3b..47edf3041631 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -105,6 +105,7 @@ xfs_buf_free(
{
unsigned int size = BBTOB(bp->b_length);
+ might_sleep();
trace_xfs_buf_free(bp, _RET_IP_);
ASSERT(list_empty(&bp->b_lru));
@@ -386,8 +387,6 @@ xfs_buf_map_verify(
struct xfs_buftarg *btp,
struct xfs_buf_map *map)
{
- xfs_daddr_t eofs;
-
/* Check for IOs smaller than the sector size / not sector aligned */
ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize));
ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
@@ -396,11 +395,10 @@ xfs_buf_map_verify(
* Corrupted block numbers can get through to here, unfortunately, so we
* have to check that the buffer falls within the filesystem bounds.
*/
- eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
- if (map->bm_bn < 0 || map->bm_bn >= eofs) {
+ if (map->bm_bn < 0 || map->bm_bn >= btp->bt_nr_sectors) {
xfs_alert(btp->bt_mount,
"%s: daddr 0x%llx out of range, EOFS 0x%llx",
- __func__, map->bm_bn, eofs);
+ __func__, map->bm_bn, btp->bt_nr_sectors);
WARN_ON(1);
return -EFSCORRUPTED;
}
@@ -1298,7 +1296,7 @@ xfs_buf_bio_end_io(
if (bio->bi_status)
xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status));
else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
- XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
+ XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
xfs_buf_ioerror(bp, -EIO);
if (bp->b_flags & XBF_ASYNC) {
@@ -1332,45 +1330,18 @@ static void
xfs_buf_submit_bio(
struct xfs_buf *bp)
{
+ unsigned int len = BBTOB(bp->b_length);
+ unsigned int nr_vecs = bio_add_max_vecs(bp->b_addr, len);
unsigned int map = 0;
struct blk_plug plug;
struct bio *bio;
- if (is_vmalloc_addr(bp->b_addr)) {
- unsigned int size = BBTOB(bp->b_length);
- unsigned int alloc_size = roundup(size, PAGE_SIZE);
- void *data = bp->b_addr;
-
- bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT,
- xfs_buf_bio_op(bp), GFP_NOIO);
-
- do {
- unsigned int len = min(size, PAGE_SIZE);
-
- ASSERT(offset_in_page(data) == 0);
- __bio_add_page(bio, vmalloc_to_page(data), len, 0);
- data += len;
- size -= len;
- } while (size);
-
- flush_kernel_vmap_range(bp->b_addr, alloc_size);
- } else {
- /*
- * Single folio or slab allocation. Must be contiguous and thus
- * only a single bvec is needed.
- *
- * This uses the page based bio add helper for now as that is
- * the lowest common denominator between folios and slab
- * allocations. To be replaced with a better block layer
- * helper soon (hopefully).
- */
- bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp),
- GFP_NOIO);
- __bio_add_page(bio, virt_to_page(bp->b_addr),
- BBTOB(bp->b_length),
- offset_in_page(bp->b_addr));
- }
-
+ bio = bio_alloc(bp->b_target->bt_bdev, nr_vecs, xfs_buf_bio_op(bp),
+ GFP_NOIO);
+ if (is_vmalloc_addr(bp->b_addr))
+ bio_add_vmalloc(bio, bp->b_addr, len);
+ else
+ bio_add_virt_nofail(bio, bp->b_addr, len);
bio->bi_private = bp;
bio->bi_end_io = xfs_buf_bio_end_io;
@@ -1709,26 +1680,67 @@ xfs_free_buftarg(
fs_put_dax(btp->bt_daxdev, btp->bt_mount);
/* the main block device is closed by kill_block_super */
if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev)
- bdev_fput(btp->bt_bdev_file);
+ bdev_fput(btp->bt_file);
kfree(btp);
}
+/*
+ * Configure this buffer target for hardware-assisted atomic writes if the
+ * underlying block device's support is congruent with the filesystem geometry.
+ */
+static inline void
+xfs_configure_buftarg_atomic_writes(
+ struct xfs_buftarg *btp)
+{
+ struct xfs_mount *mp = btp->bt_mount;
+ unsigned int min_bytes, max_bytes;
+
+ min_bytes = bdev_atomic_write_unit_min_bytes(btp->bt_bdev);
+ max_bytes = bdev_atomic_write_unit_max_bytes(btp->bt_bdev);
+
+ /*
+ * Ignore atomic write geometry that is nonsense or doesn't even cover
+ * a single fsblock.
+ */
+ if (min_bytes > max_bytes ||
+ min_bytes > mp->m_sb.sb_blocksize ||
+ max_bytes < mp->m_sb.sb_blocksize) {
+ min_bytes = 0;
+ max_bytes = 0;
+ }
+
+ btp->bt_awu_min = min_bytes;
+ btp->bt_awu_max = max_bytes;
+}
+
+/* Configure a buffer target that abstracts a block device. */
int
-xfs_setsize_buftarg(
+xfs_configure_buftarg(
struct xfs_buftarg *btp,
- unsigned int sectorsize)
+ unsigned int sectorsize,
+ xfs_rfsblock_t nr_blocks)
{
- /* Set up metadata sector size info */
- btp->bt_meta_sectorsize = sectorsize;
- btp->bt_meta_sectormask = sectorsize - 1;
+ struct xfs_mount *mp = btp->bt_mount;
+
+ if (btp->bt_bdev) {
+ int error;
+
+ error = bdev_validate_blocksize(btp->bt_bdev, sectorsize);
+ if (error) {
+ xfs_warn(mp,
+ "Cannot use blocksize %u on device %pg, err %d",
+ sectorsize, btp->bt_bdev, error);
+ return -EINVAL;
+ }
- if (set_blocksize(btp->bt_bdev_file, sectorsize)) {
- xfs_warn(btp->bt_mount,
- "Cannot set_blocksize to %u on device %pg",
- sectorsize, btp->bt_bdev);
- return -EINVAL;
+ if (bdev_can_atomic_write(btp->bt_bdev))
+ xfs_configure_buftarg_atomic_writes(btp);
}
+ btp->bt_meta_sectorsize = sectorsize;
+ btp->bt_meta_sectormask = sectorsize - 1;
+ /* m_blkbb_log is not set up yet */
+ btp->bt_nr_sectors = nr_blocks << (mp->m_sb.sb_blocklog - BBSHIFT);
return 0;
}
@@ -1738,6 +1750,9 @@ xfs_init_buftarg(
size_t logical_sectorsize,
const char *descr)
{
+ /* The maximum size of the buftarg is only known once the sb is read. */
+ btp->bt_nr_sectors = XFS_BUF_DADDR_MAX;
+
/* Set up device logical sector size mask */
btp->bt_logical_sectorsize = logical_sectorsize;
btp->bt_logical_sectormask = logical_sectorsize - 1;
@@ -1778,6 +1793,8 @@ xfs_alloc_buftarg(
{
struct xfs_buftarg *btp;
const struct dax_holder_operations *ops = NULL;
+ int error;
+
#if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
ops = &xfs_dax_holder_operations;
@@ -1785,34 +1802,37 @@ xfs_alloc_buftarg(
btp = kzalloc(sizeof(*btp), GFP_KERNEL | __GFP_NOFAIL);
btp->bt_mount = mp;
- btp->bt_bdev_file = bdev_file;
+ btp->bt_file = bdev_file;
btp->bt_bdev = file_bdev(bdev_file);
btp->bt_dev = btp->bt_bdev->bd_dev;
btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
mp, ops);
- if (bdev_can_atomic_write(btp->bt_bdev)) {
- btp->bt_bdev_awu_min = bdev_atomic_write_unit_min_bytes(
- btp->bt_bdev);
- btp->bt_bdev_awu_max = bdev_atomic_write_unit_max_bytes(
- btp->bt_bdev);
- }
+ /*
+ * Flush and invalidate all devices' pagecaches before reading any
+ * metadata because XFS doesn't use the bdev pagecache.
+ */
+ error = sync_blockdev(btp->bt_bdev);
+ if (error)
+ goto error_free;
/*
* When allocating the buftargs we have not yet read the super block and
* thus don't know the file system sector size yet.
*/
- if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)))
- goto error_free;
- if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev),
- mp->m_super->s_id))
+ btp->bt_meta_sectorsize = bdev_logical_block_size(btp->bt_bdev);
+ btp->bt_meta_sectormask = btp->bt_meta_sectorsize - 1;
+
+ error = xfs_init_buftarg(btp, btp->bt_meta_sectorsize,
+ mp->m_super->s_id);
+ if (error)
goto error_free;
return btp;
error_free:
kfree(btp);
- return NULL;
+ return ERR_PTR(error);
}
static inline void
@@ -2061,44 +2081,6 @@ xfs_buf_delwri_submit(
return error;
}
-/*
- * Push a single buffer on a delwri queue.
- *
- * The purpose of this function is to submit a single buffer of a delwri queue
- * and return with the buffer still on the original queue.
- *
- * The buffer locking and queue management logic between _delwri_pushbuf() and
- * _delwri_queue() guarantee that the buffer cannot be queued to another list
- * before returning.
- */
-int
-xfs_buf_delwri_pushbuf(
- struct xfs_buf *bp,
- struct list_head *buffer_list)
-{
- int error;
-
- ASSERT(bp->b_flags & _XBF_DELWRI_Q);
-
- trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
-
- xfs_buf_lock(bp);
- bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC);
- bp->b_flags |= XBF_WRITE;
- xfs_buf_submit(bp);
-
- /*
- * The buffer is now locked, under I/O but still on the original delwri
- * queue. Wait for I/O completion, restore the DELWRI_Q flag and
- * return with the buffer unlocked and still on the original queue.
- */
- error = xfs_buf_iowait(bp);
- bp->b_flags |= _XBF_DELWRI_Q;
- xfs_buf_unlock(bp);
-
- return error;
-}
-
void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
{
/*
@@ -2106,7 +2088,7 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
* This allows userspace to disrupt buffer caching for debug/testing
* purposes.
*/
- if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
+ if (XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
lru_ref = 0;
atomic_set(&bp->b_lru_ref, lru_ref);
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index d0b065a9a9f0..e25cd2a160f3 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -22,6 +22,7 @@ extern struct kmem_cache *xfs_buf_cache;
*/
struct xfs_buf;
+#define XFS_BUF_DADDR_MAX ((xfs_daddr_t) S64_MAX)
#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
#define XBF_READ (1u << 0) /* buffer intended for reading from device */
@@ -94,7 +95,6 @@ void xfs_buf_cache_destroy(struct xfs_buf_cache *bch);
*/
struct xfs_buftarg {
dev_t bt_dev;
- struct file *bt_bdev_file;
struct block_device *bt_bdev;
struct dax_device *bt_daxdev;
struct file *bt_file;
@@ -104,6 +104,7 @@ struct xfs_buftarg {
size_t bt_meta_sectormask;
size_t bt_logical_sectorsize;
size_t bt_logical_sectormask;
+ xfs_daddr_t bt_nr_sectors;
/* LRU control structures */
struct shrinker *bt_shrinker;
@@ -112,9 +113,9 @@ struct xfs_buftarg {
struct percpu_counter bt_readahead_count;
struct ratelimit_state bt_ioerror_rl;
- /* Atomic write unit values */
- unsigned int bt_bdev_awu_min;
- unsigned int bt_bdev_awu_max;
+ /* Hardware atomic write unit values, bytes */
+ unsigned int bt_awu_min;
+ unsigned int bt_awu_max;
/* built-in cache, if we're not using the perag one */
struct xfs_buf_cache bt_cache[];
@@ -326,7 +327,6 @@ extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
void xfs_buf_delwri_queue_here(struct xfs_buf *bp, struct list_head *bl);
extern int xfs_buf_delwri_submit(struct list_head *);
extern int xfs_buf_delwri_submit_nowait(struct list_head *);
-extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *);
static inline xfs_daddr_t xfs_buf_daddr(struct xfs_buf *bp)
{
@@ -374,9 +374,9 @@ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp,
extern void xfs_free_buftarg(struct xfs_buftarg *);
extern void xfs_buftarg_wait(struct xfs_buftarg *);
extern void xfs_buftarg_drain(struct xfs_buftarg *);
-extern int xfs_setsize_buftarg(struct xfs_buftarg *, unsigned int);
+/*
+ * Configure a buffer target: @sectorsize becomes the metadata sector size and
+ * @nr_blocks (a count of filesystem blocks) is converted to the number of
+ * sectors recorded in bt_nr_sectors. Returns 0, or -EINVAL if the backing
+ * block device rejects @sectorsize. The parameter type matches the
+ * xfs_rfsblock_t used by the definition.
+ */
+int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize,
+		xfs_rfsblock_t nr_blocks);
-#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 19eb0b7a3e58..f4c5be67826e 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -32,19 +32,74 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
return container_of(lip, struct xfs_buf_log_item, bli_item);
}
+/*
+ * Point bip->bli_formats at storage for @count buffer log format structures
+ * and record @count in bip->bli_format_count.
+ *
+ * A single format uses the structure embedded in the BLI; any other count gets
+ * a zeroed heap array. The array is sized with kcalloc() so that the
+ * count * size multiplication is overflow checked instead of open coded.
+ */
+static void
+xfs_buf_item_get_format(
+	struct xfs_buf_log_item	*bip,
+	int			count)
+{
+	ASSERT(bip->bli_formats == NULL);
+	bip->bli_format_count = count;
+
+	if (count == 1) {
+		bip->bli_formats = &bip->__bli_format;
+		return;
+	}
+
+	bip->bli_formats = kcalloc(count, sizeof(struct xfs_buf_log_format),
+			GFP_KERNEL | __GFP_NOFAIL);
+}
+
+/*
+ * Release the format array if it was heap allocated. When bli_formats points
+ * at the format embedded in the BLI there is nothing to free and the pointer
+ * is left as it is.
+ */
+static void
+xfs_buf_item_free_format(
+	struct xfs_buf_log_item	*bip)
+{
+	if (bip->bli_formats == &bip->__bli_format)
+		return;
+
+	kfree(bip->bli_formats);
+	bip->bli_formats = NULL;
+}
+
+/*
+ * Free everything hanging off a buf log item - the shadow log vector and the
+ * format array - and then return the item itself to xfs_buf_item_cache.
+ */
+static void
+xfs_buf_item_free(
+	struct xfs_buf_log_item	*bip)
+{
+	struct xfs_log_item	*lip = &bip->bli_item;
+
+	kvfree(lip->li_lv_shadow);
+	xfs_buf_item_free_format(bip);
+	kmem_cache_free(xfs_buf_item_cache, bip);
+}
+
+/*
+ * xfs_buf_item_relse() is called when the buf log item is no longer needed.
+ *
+ * The caller must already have dropped the last BLI reference and the item
+ * must not be in the AIL; both conditions are asserted below. The BLI is
+ * detached from the buffer (b_log_item is cleared), xfs_buf_rele() is called
+ * on the buffer, and the BLI is then freed, so @bip must not be referenced
+ * once this function returns.
+ */
+static void
+xfs_buf_item_relse(
+ struct xfs_buf_log_item *bip)
+{
+ struct xfs_buf *bp = bip->bli_buf;
+
+ trace_xfs_buf_item_relse(bp, _RET_IP_);
+
+ ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
+ ASSERT(atomic_read(&bip->bli_refcount) == 0);
+
+ bp->b_log_item = NULL;
+ xfs_buf_rele(bp);
+ xfs_buf_item_free(bip);
+}
+
/* Is this log iovec plausibly large enough to contain the buffer log format? */
bool
xfs_buf_log_check_iovec(
- struct xfs_log_iovec *iovec)
+ struct kvec *iovec)
{
- struct xfs_buf_log_format *blfp = iovec->i_addr;
+ struct xfs_buf_log_format *blfp = iovec->iov_base;
char *bmp_end;
char *item_end;
- if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len)
+ if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->iov_len)
return false;
- item_end = (char *)iovec->i_addr + iovec->i_len;
+ item_end = (char *)iovec->iov_base + iovec->iov_len;
bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size];
return bmp_end <= item_end;
}
@@ -104,6 +159,25 @@ xfs_buf_item_size_segment(
}
/*
+ * Compute the worst case log item overhead for an invalidated buffer with the
+ * given map count and block size.
+ */
+unsigned int
+xfs_buf_inval_log_space(
+	unsigned int	map_count,
+	unsigned int	blocksize)
+{
+	/* Round the block up to whole chunks, then the chunks to map words. */
+	unsigned int	nr_chunks = DIV_ROUND_UP(blocksize, XFS_BLF_CHUNK);
+	unsigned int	nr_words = DIV_ROUND_UP(nr_chunks, NBWORD);
+	unsigned int	per_map;
+
+	/* Format header up to the data map, plus the map words themselves. */
+	per_map = offsetof(struct xfs_buf_log_format, blf_data_map);
+	per_map += nr_words * sizeof_field(struct xfs_buf_log_format,
+					   blf_data_map[0]);
+
+	return per_map * map_count;
+}
+
+/*
* Return the number of log iovecs and space needed to log the given buf log
* item.
*
@@ -371,6 +445,42 @@ xfs_buf_item_pin(
}
/*
+ * For a stale BLI, process all the necessary completions that must be
+ * performed when the final BLI reference goes away. The buffer will be
+ * referenced and locked here - we return to the caller with the buffer still
+ * referenced and locked for them to finalise processing of the buffer.
+ */
+static void
+xfs_buf_item_finish_stale(
+ struct xfs_buf_log_item *bip)
+{
+ struct xfs_buf *bp = bip->bli_buf;
+ struct xfs_log_item *lip = &bip->bli_item;
+
+ /*
+ * Preconditions: the BLI and buffer are both marked stale, the buffer is
+ * locked, the format carries XFS_BLF_CANCEL, and neither the log item nor
+ * the buffer is still attached to a transaction.
+ */
+ ASSERT(bip->bli_flags & XFS_BLI_STALE);
+ ASSERT(xfs_buf_islocked(bp));
+ ASSERT(bp->b_flags & XBF_STALE);
+ ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
+ ASSERT(list_empty(&lip->li_trans));
+ ASSERT(!bp->b_transp);
+
+ /*
+ * BLIs flagged XFS_BLI_STALE_INODE are completed through
+ * xfs_buf_item_done() and xfs_buf_inode_iodone() rather than the AIL
+ * removal below; afterwards the buffer's b_li_list must be empty.
+ */
+ if (bip->bli_flags & XFS_BLI_STALE_INODE) {
+ xfs_buf_item_done(bp);
+ xfs_buf_inode_iodone(bp);
+ ASSERT(list_empty(&bp->b_li_list));
+ return;
+ }
+
+ /*
+ * We may or may not be on the AIL here; xfs_trans_ail_delete() will do
+ * the right thing regardless of the situation in which we are called.
+ * Once the item is off the AIL the BLI is released, which must leave
+ * the buffer with no log item attached.
+ */
+ xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
+ xfs_buf_item_relse(bip);
+ ASSERT(bp->b_log_item == NULL);
+}
+
+/*
* This is called to unpin the buffer associated with the buf log item which was
* previously pinned with a call to xfs_buf_item_pin(). We enter this function
* with a buffer pin count, a buffer reference and a BLI reference.
@@ -419,13 +529,6 @@ xfs_buf_item_unpin(
}
if (stale) {
- ASSERT(bip->bli_flags & XFS_BLI_STALE);
- ASSERT(xfs_buf_islocked(bp));
- ASSERT(bp->b_flags & XBF_STALE);
- ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
- ASSERT(list_empty(&lip->li_trans));
- ASSERT(!bp->b_transp);
-
trace_xfs_buf_item_unpin_stale(bip);
/*
@@ -436,22 +539,7 @@ xfs_buf_item_unpin(
* processing is complete.
*/
xfs_buf_rele(bp);
-
- /*
- * If we get called here because of an IO error, we may or may
- * not have the item on the AIL. xfs_trans_ail_delete() will
- * take care of that situation. xfs_trans_ail_delete() drops
- * the AIL lock.
- */
- if (bip->bli_flags & XFS_BLI_STALE_INODE) {
- xfs_buf_item_done(bp);
- xfs_buf_inode_iodone(bp);
- ASSERT(list_empty(&bp->b_li_list));
- } else {
- xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
- xfs_buf_item_relse(bp);
- ASSERT(bp->b_log_item == NULL);
- }
+ xfs_buf_item_finish_stale(bip);
xfs_buf_relse(bp);
return;
}
@@ -524,43 +612,42 @@ xfs_buf_item_push(
* Drop the buffer log item refcount and take appropriate action. This helper
* determines whether the bli must be freed or not, since a decrement to zero
* does not necessarily mean the bli is unused.
- *
- * Return true if the bli is freed, false otherwise.
*/
-bool
+void
xfs_buf_item_put(
struct xfs_buf_log_item *bip)
{
- struct xfs_log_item *lip = &bip->bli_item;
- bool aborted;
- bool dirty;
+
+ ASSERT(xfs_buf_islocked(bip->bli_buf));
/* drop the bli ref and return if it wasn't the last one */
if (!atomic_dec_and_test(&bip->bli_refcount))
- return false;
+ return;
- /*
- * We dropped the last ref and must free the item if clean or aborted.
- * If the bli is dirty and non-aborted, the buffer was clean in the
- * transaction but still awaiting writeback from previous changes. In
- * that case, the bli is freed on buffer writeback completion.
- */
- aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
- xlog_is_shutdown(lip->li_log);
- dirty = bip->bli_flags & XFS_BLI_DIRTY;
- if (dirty && !aborted)
- return false;
+ /* If the BLI is in the AIL, then it is still dirty and in use */
+ if (test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)) {
+ ASSERT(bip->bli_flags & XFS_BLI_DIRTY);
+ return;
+ }
/*
- * The bli is aborted or clean. An aborted item may be in the AIL
- * regardless of dirty state. For example, consider an aborted
- * transaction that invalidated a dirty bli and cleared the dirty
- * state.
+ * In shutdown conditions, we can be asked to free a dirty BLI that
+ * isn't in the AIL. This can occur due to a checkpoint aborting a BLI
+ * instead of inserting it into the AIL at checkpoint IO completion. If
+ * there's another bli reference (e.g. a btree cursor holds a clean
+ * reference) and it is released via xfs_trans_brelse(), we can get here
+ * with that aborted, dirty BLI. In this case, it is safe to free the
+ * dirty BLI immediately, as it is not in the AIL and there are no
+ * other references to it.
+ *
+ * We should never get here with a stale BLI via that path as
+ * xfs_trans_brelse() specifically holds onto stale buffers rather than
+ * releasing them.
*/
- if (aborted)
- xfs_trans_ail_delete(lip, 0);
- xfs_buf_item_relse(bip->bli_buf);
- return true;
+ ASSERT(!(bip->bli_flags & XFS_BLI_DIRTY) ||
+ test_bit(XFS_LI_ABORTED, &bip->bli_item.li_flags));
+ ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+ xfs_buf_item_relse(bip);
}
/*
@@ -581,6 +668,15 @@ xfs_buf_item_put(
* if necessary but do not unlock the buffer. This is for support of
* xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
* free the item.
+ *
+ * If the XFS_BLI_STALE flag is set, the last reference to the BLI *must*
+ * perform a completion abort of any objects attached to the buffer for IO
+ * tracking purposes. This generally only happens in shutdown situations;
+ * normally xfs_buf_item_unpin() will drop the last BLI reference and perform
+ * completion processing. However, because transaction completion can race with
+ * checkpoint completion during a shutdown, this release context may end up
+ * being the last active reference to the BLI and so needs to perform this
+ * cleanup.
*/
STATIC void
xfs_buf_item_release(
@@ -588,18 +684,19 @@ xfs_buf_item_release(
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;
- bool released;
bool hold = bip->bli_flags & XFS_BLI_HOLD;
bool stale = bip->bli_flags & XFS_BLI_STALE;
-#if defined(DEBUG) || defined(XFS_WARN)
- bool ordered = bip->bli_flags & XFS_BLI_ORDERED;
- bool dirty = bip->bli_flags & XFS_BLI_DIRTY;
bool aborted = test_bit(XFS_LI_ABORTED,
&lip->li_flags);
+ bool dirty = bip->bli_flags & XFS_BLI_DIRTY;
+#if defined(DEBUG) || defined(XFS_WARN)
+ bool ordered = bip->bli_flags & XFS_BLI_ORDERED;
#endif
trace_xfs_buf_item_release(bip);
+ ASSERT(xfs_buf_islocked(bp));
+
/*
* The bli dirty state should match whether the blf has logged segments
* except for ordered buffers, where only the bli should be dirty.
@@ -615,16 +712,56 @@ xfs_buf_item_release(
bp->b_transp = NULL;
bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
+ /* If there are other references, then we have nothing to do. */
+ if (!atomic_dec_and_test(&bip->bli_refcount))
+ goto out_release;
+
+ /*
+ * Stale buffer completion frees the BLI, unlocks and releases the
+ * buffer. Neither the BLI nor the buffer is safe to reference after this
+ * call, so there's nothing more we need to do here.
+ *
+ * If we get here with a stale buffer and references to the BLI remain,
+ * we must not unlock the buffer as the last BLI reference owns lock
+ * context, not us.
+ */
+ if (stale) {
+ xfs_buf_item_finish_stale(bip);
+ xfs_buf_relse(bp);
+ ASSERT(!hold);
+ return;
+ }
+
+ /*
+ * Dirty or clean, aborted items are done and need to be removed from
+ * the AIL and released. This frees the BLI, but leaves the buffer
+ * locked and referenced.
+ */
+ if (aborted || xlog_is_shutdown(lip->li_log)) {
+ ASSERT(list_empty(&bip->bli_buf->b_li_list));
+ xfs_buf_item_done(bp);
+ goto out_release;
+ }
+
+ /*
+ * Clean, unreferenced BLIs can be immediately freed, leaving the buffer
+ * locked and referenced.
+ *
+ * Dirty, unreferenced BLIs *must* be in the AIL awaiting writeback.
+ */
+ if (!dirty)
+ xfs_buf_item_relse(bip);
+ else
+ ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags));
+
+ /* Not safe to reference the BLI from here */
+out_release:
/*
- * Unref the item and unlock the buffer unless held or stale. Stale
- * buffers remain locked until final unpin unless the bli is freed by
- * the unref call. The latter implies shutdown because buffer
- * invalidation dirties the bli and transaction.
+ * If we get here with a stale buffer, we must not unlock the
+ * buffer as the last BLI reference owns lock context, not us.
*/
- released = xfs_buf_item_put(bip);
- if (hold || (stale && !released))
+ if (stale || hold)
return;
- ASSERT(!stale || aborted);
xfs_buf_relse(bp);
}
@@ -710,33 +847,6 @@ static const struct xfs_item_ops xfs_buf_item_ops = {
.iop_push = xfs_buf_item_push,
};
-STATIC void
-xfs_buf_item_get_format(
- struct xfs_buf_log_item *bip,
- int count)
-{
- ASSERT(bip->bli_formats == NULL);
- bip->bli_format_count = count;
-
- if (count == 1) {
- bip->bli_formats = &bip->__bli_format;
- return;
- }
-
- bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format),
- GFP_KERNEL | __GFP_NOFAIL);
-}
-
-STATIC void
-xfs_buf_item_free_format(
- struct xfs_buf_log_item *bip)
-{
- if (bip->bli_formats != &bip->__bli_format) {
- kfree(bip->bli_formats);
- bip->bli_formats = NULL;
- }
-}
-
/*
* Allocate a new buf log item to go with the given buffer.
* Set the buffer's b_log_item field to point to the new
@@ -786,6 +896,7 @@ xfs_buf_item_init(
map_size = DIV_ROUND_UP(chunks, NBWORD);
if (map_size > XFS_BLF_DATAMAP_SIZE) {
+ xfs_buf_item_free_format(bip);
kmem_cache_free(xfs_buf_item_cache, bip);
xfs_err(mp,
"buffer item dirty bitmap (%u uints) too small to reflect %u bytes!",
@@ -957,34 +1068,6 @@ xfs_buf_item_dirty_format(
return false;
}
-STATIC void
-xfs_buf_item_free(
- struct xfs_buf_log_item *bip)
-{
- xfs_buf_item_free_format(bip);
- kvfree(bip->bli_item.li_lv_shadow);
- kmem_cache_free(xfs_buf_item_cache, bip);
-}
-
-/*
- * xfs_buf_item_relse() is called when the buf log item is no longer needed.
- */
-void
-xfs_buf_item_relse(
- struct xfs_buf *bp)
-{
- struct xfs_buf_log_item *bip = bp->b_log_item;
-
- trace_xfs_buf_item_relse(bp, _RET_IP_);
- ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
-
- if (atomic_read(&bip->bli_refcount))
- return;
- bp->b_log_item = NULL;
- xfs_buf_rele(bp);
- xfs_buf_item_free(bip);
-}
-
void
xfs_buf_item_done(
struct xfs_buf *bp)
@@ -1004,5 +1087,5 @@ xfs_buf_item_done(
xfs_trans_ail_delete(&bp->b_log_item->bli_item,
(bp->b_flags & _XBF_LOGRECOVERY) ? 0 :
SHUTDOWN_CORRUPT_INCORE);
- xfs_buf_item_relse(bp);
+ xfs_buf_item_relse(bp->b_log_item);
}
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 8cde85259a58..3159325dd17b 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -49,8 +49,7 @@ struct xfs_buf_log_item {
int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
void xfs_buf_item_done(struct xfs_buf *bp);
-void xfs_buf_item_relse(struct xfs_buf *);
-bool xfs_buf_item_put(struct xfs_buf_log_item *);
+void xfs_buf_item_put(struct xfs_buf_log_item *bip);
void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint);
bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
void xfs_buf_inode_iodone(struct xfs_buf *);
@@ -62,7 +61,10 @@ static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp)
}
#endif /* CONFIG_XFS_QUOTA */
void xfs_buf_iodone(struct xfs_buf *);
-bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec);
+bool xfs_buf_log_check_iovec(struct kvec *iovec);
+
+unsigned int xfs_buf_inval_log_space(unsigned int map_count,
+ unsigned int blocksize);
extern struct kmem_cache *xfs_buf_item_cache;
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index d4c5cef5bc43..e4c8af873632 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -159,7 +159,7 @@ STATIC enum xlog_recover_reorder
xlog_recover_buf_reorder(
struct xlog_recover_item *item)
{
- struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
+ struct xfs_buf_log_format *buf_f = item->ri_buf[0].iov_base;
if (buf_f->blf_flags & XFS_BLF_CANCEL)
return XLOG_REORDER_CANCEL_LIST;
@@ -173,7 +173,7 @@ xlog_recover_buf_ra_pass2(
struct xlog *log,
struct xlog_recover_item *item)
{
- struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
+ struct xfs_buf_log_format *buf_f = item->ri_buf[0].iov_base;
xlog_buf_readahead(log, buf_f->blf_blkno, buf_f->blf_len, NULL);
}
@@ -187,11 +187,11 @@ xlog_recover_buf_commit_pass1(
struct xlog *log,
struct xlog_recover_item *item)
{
- struct xfs_buf_log_format *bf = item->ri_buf[0].i_addr;
+ struct xfs_buf_log_format *bf = item->ri_buf[0].iov_base;
if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) {
- xfs_err(log->l_mp, "bad buffer log item size (%d)",
- item->ri_buf[0].i_len);
+ xfs_err(log->l_mp, "bad buffer log item size (%zd)",
+ item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -487,8 +487,8 @@ xlog_recover_do_reg_buffer(
nbits = xfs_contig_bits(buf_f->blf_data_map,
buf_f->blf_map_size, bit);
ASSERT(nbits > 0);
- ASSERT(item->ri_buf[i].i_addr != NULL);
- ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
+ ASSERT(item->ri_buf[i].iov_base != NULL);
+ ASSERT(item->ri_buf[i].iov_len % XFS_BLF_CHUNK == 0);
ASSERT(BBTOB(bp->b_length) >=
((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
@@ -500,8 +500,8 @@ xlog_recover_do_reg_buffer(
* the log. Hence we need to trim nbits back to the length of
* the current region being copied out of the log.
*/
- if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
- nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
+ if (item->ri_buf[i].iov_len < (nbits << XFS_BLF_SHIFT))
+ nbits = item->ri_buf[i].iov_len >> XFS_BLF_SHIFT;
/*
* Do a sanity check if this is a dquot buffer. Just checking
@@ -511,18 +511,18 @@ xlog_recover_do_reg_buffer(
fa = NULL;
if (buf_f->blf_flags &
(XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
- if (item->ri_buf[i].i_addr == NULL) {
+ if (item->ri_buf[i].iov_base == NULL) {
xfs_alert(mp,
"XFS: NULL dquot in %s.", __func__);
goto next;
}
- if (item->ri_buf[i].i_len < size_disk_dquot) {
+ if (item->ri_buf[i].iov_len < size_disk_dquot) {
xfs_alert(mp,
- "XFS: dquot too small (%d) in %s.",
- item->ri_buf[i].i_len, __func__);
+ "XFS: dquot too small (%zd) in %s.",
+ item->ri_buf[i].iov_len, __func__);
goto next;
}
- fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, -1);
+ fa = xfs_dquot_verify(mp, item->ri_buf[i].iov_base, -1);
if (fa) {
xfs_alert(mp,
"dquot corrupt at %pS trying to replay into block 0x%llx",
@@ -533,7 +533,7 @@ xlog_recover_do_reg_buffer(
memcpy(xfs_buf_offset(bp,
(uint)bit << XFS_BLF_SHIFT), /* dest */
- item->ri_buf[i].i_addr, /* source */
+ item->ri_buf[i].iov_base, /* source */
nbits<<XFS_BLF_SHIFT); /* length */
next:
i++;
@@ -669,8 +669,8 @@ xlog_recover_do_inode_buffer(
if (next_unlinked_offset < reg_buf_offset)
continue;
- ASSERT(item->ri_buf[item_index].i_addr != NULL);
- ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
+ ASSERT(item->ri_buf[item_index].iov_base != NULL);
+ ASSERT((item->ri_buf[item_index].iov_len % XFS_BLF_CHUNK) == 0);
ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length));
/*
@@ -678,7 +678,7 @@ xlog_recover_do_inode_buffer(
* current di_next_unlinked field. Extract its value
* and copy it to the buffer copy.
*/
- logged_nextp = item->ri_buf[item_index].i_addr +
+ logged_nextp = item->ri_buf[item_index].iov_base +
next_unlinked_offset - reg_buf_offset;
if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) {
xfs_alert(mp,
@@ -736,6 +736,16 @@ xlog_recover_do_primary_sb_buffer(
*/
xfs_sb_from_disk(&mp->m_sb, dsb);
+ /*
+ * Grow can change the device size. Mirror that into the buftarg.
+ */
+ mp->m_ddev_targp->bt_nr_sectors =
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
+ if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp) {
+ mp->m_rtdev_targp->bt_nr_sectors =
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
+ }
+
if (mp->m_sb.sb_agcount < orig_agcount) {
xfs_alert(mp, "Shrinking AG count in log recovery not supported");
return -EFSCORRUPTED;
@@ -1002,7 +1012,7 @@ xlog_recover_buf_commit_pass2(
struct xlog_recover_item *item,
xfs_lsn_t current_lsn)
{
- struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
+ struct xfs_buf_log_format *buf_f = item->ri_buf[0].iov_base;
struct xfs_mount *mp = log->l_mp;
struct xfs_buf *bp;
int error;
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index b4ffd80b7cb6..dcbfa274e06d 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -165,7 +165,7 @@ xmbuf_map_backing_mem(
folio_set_dirty(folio);
folio_unlock(folio);
- bp->b_addr = folio_address(folio);
+ bp->b_addr = folio_address(folio) + offset_in_folio(folio, pos);
return 0;
}
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index c1a306268ae4..b6ffe4807a11 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -103,30 +103,12 @@ xfs_discard_endio(
bio_put(bio);
}
-static inline struct block_device *
-xfs_group_bdev(
- const struct xfs_group *xg)
-{
- struct xfs_mount *mp = xg->xg_mount;
-
- switch (xg->xg_type) {
- case XG_TYPE_AG:
- return mp->m_ddev_targp->bt_bdev;
- case XG_TYPE_RTG:
- return mp->m_rtdev_targp->bt_bdev;
- default:
- ASSERT(0);
- break;
- }
- return NULL;
-}
-
/*
* Walk the discard list and issue discards on all the busy extents in the
* list. We plug and chain the bios so that we only need a single completion
* call to clear all the busy extents once the discards are complete.
*/
-int
+void
xfs_discard_extents(
struct xfs_mount *mp,
struct xfs_busy_extents *extents)
@@ -134,25 +116,19 @@ xfs_discard_extents(
struct xfs_extent_busy *busyp;
struct bio *bio = NULL;
struct blk_plug plug;
- int error = 0;
blk_start_plug(&plug);
list_for_each_entry(busyp, &extents->extent_list, list) {
- trace_xfs_discard_extent(busyp->group, busyp->bno,
- busyp->length);
+ struct xfs_group *xg = busyp->group;
+ struct xfs_buftarg *btp =
+ xfs_group_type_buftarg(xg->xg_mount, xg->xg_type);
- error = __blkdev_issue_discard(xfs_group_bdev(busyp->group),
- xfs_gbno_to_daddr(busyp->group, busyp->bno),
+ trace_xfs_discard_extent(xg, busyp->bno, busyp->length);
+
+ __blkdev_issue_discard(btp->bt_bdev,
+ xfs_gbno_to_daddr(xg, busyp->bno),
XFS_FSB_TO_BB(mp, busyp->length),
GFP_KERNEL, &bio);
- if (error && error != -EOPNOTSUPP) {
- xfs_info(mp,
- "discard failed for extent [0x%llx,%u], error %d",
- (unsigned long long)busyp->bno,
- busyp->length,
- error);
- break;
- }
}
if (bio) {
@@ -163,10 +139,16 @@ xfs_discard_extents(
xfs_discard_endio_work(&extents->endio_work);
}
blk_finish_plug(&plug);
-
- return error;
}
+/*
+ * Care must be taken setting up the trim cursor as the perags may not have been
+ * initialised when the cursor is initialised. e.g. a clean mount which hasn't
+ * read in AGFs and the first operation run on the mounted fs is a trim. This
+ * can result in perag fields that aren't initialised until
+ * xfs_trim_gather_extents() calls xfs_alloc_read_agf() to lock down the AG for
+ * the free space search.
+ */
struct xfs_trim_cur {
xfs_agblock_t start;
xfs_extlen_t count;
@@ -196,14 +178,20 @@ xfs_trim_gather_extents(
*/
xfs_log_force(mp, XFS_LOG_SYNC);
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
+ tp = xfs_trans_alloc_empty(mp);
error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
if (error)
goto out_trans_cancel;
+ /*
+ * First time through tcur->count will not have been initialised as
+ * pag->pagf_longest is not guaranteed to be valid before we read
+ * the AGF buffer above.
+ */
+ if (!tcur->count)
+ tcur->count = pag->pagf_longest;
+
if (tcur->by_bno) {
/* sub-AG discard request always starts at tcur->start */
cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
@@ -350,7 +338,6 @@ xfs_trim_perag_extents(
{
struct xfs_trim_cur tcur = {
.start = start,
- .count = pag->pagf_longest,
.end = end,
.minlen = minlen,
};
@@ -387,9 +374,7 @@ xfs_trim_perag_extents(
* list after this function call, as it may have been freed by
* the time control returns to us.
*/
- error = xfs_discard_extents(pag_mount(pag), extents);
- if (error)
- break;
+ xfs_discard_extents(pag_mount(pag), extents);
if (xfs_trim_should_stop())
break;
@@ -498,12 +483,10 @@ xfs_discard_rtdev_extents(
trace_xfs_discard_rtextent(mp, busyp->bno, busyp->length);
- error = __blkdev_issue_discard(bdev,
+ __blkdev_issue_discard(bdev,
xfs_rtb_to_daddr(mp, busyp->bno),
XFS_FSB_TO_BB(mp, busyp->length),
GFP_NOFS, &bio);
- if (error)
- break;
}
xfs_discard_free_rtdev_extents(tr);
@@ -583,9 +566,7 @@ xfs_trim_rtextents(
struct xfs_trans *tp;
int error;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
+ tp = xfs_trans_alloc_empty(mp);
/*
* Walk the free ranges between low and high. The query_range function
@@ -701,9 +682,7 @@ xfs_trim_rtgroup_extents(
struct xfs_trans *tp;
int error;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
+ tp = xfs_trans_alloc_empty(mp);
/*
* Walk the free ranges between low and high. The query_range function
@@ -732,8 +711,10 @@ xfs_trim_rtgroup_extents(
break;
}
- if (!tr.queued)
+ if (!tr.queued) {
+ kfree(tr.extents);
break;
+ }
/*
* We hand the extent list to the discard function here so the
@@ -745,9 +726,7 @@ xfs_trim_rtgroup_extents(
* list after this function call, as it may have been freed by
* the time control returns to us.
*/
- error = xfs_discard_extents(rtg_mount(rtg), tr.extents);
- if (error)
- break;
+ xfs_discard_extents(rtg_mount(rtg), tr.extents);
low = tr.restart_rtx;
} while (!xfs_trim_should_stop() && low <= high);
diff --git a/fs/xfs/xfs_discard.h b/fs/xfs/xfs_discard.h
index 2b1a85223a56..8c5cc4af6a07 100644
--- a/fs/xfs/xfs_discard.h
+++ b/fs/xfs/xfs_discard.h
@@ -6,7 +6,7 @@ struct fstrim_range;
struct xfs_mount;
struct xfs_busy_extents;
-int xfs_discard_extents(struct xfs_mount *mp, struct xfs_busy_extents *busy);
+void xfs_discard_extents(struct xfs_mount *mp, struct xfs_busy_extents *busy);
int xfs_ioc_trim(struct xfs_mount *mp, struct fstrim_range __user *fstrim);
#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index edbc521870a1..612ca682a513 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -31,7 +31,7 @@
*
* ip->i_lock
* qi->qi_tree_lock
- * dquot->q_qlock (xfs_dqlock() and friends)
+ * dquot->q_qlock
* dquot->q_flush (xfs_dqflock() and friends)
* qi->qi_lru_lock
*
@@ -801,10 +801,11 @@ xfs_dq_get_next_id(
static struct xfs_dquot *
xfs_qm_dqget_cache_lookup(
struct xfs_mount *mp,
- struct xfs_quotainfo *qi,
- struct radix_tree_root *tree,
- xfs_dqid_t id)
+ xfs_dqid_t id,
+ xfs_dqtype_t type)
{
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
struct xfs_dquot *dqp;
restart:
@@ -816,16 +817,12 @@ restart:
return NULL;
}
- xfs_dqlock(dqp);
- if (dqp->q_flags & XFS_DQFLAG_FREEING) {
- xfs_dqunlock(dqp);
+ if (!lockref_get_not_dead(&dqp->q_lockref)) {
mutex_unlock(&qi->qi_tree_lock);
trace_xfs_dqget_freeing(dqp);
delay(1);
goto restart;
}
-
- dqp->q_nrefs++;
mutex_unlock(&qi->qi_tree_lock);
trace_xfs_dqget_hit(dqp);
@@ -836,8 +833,7 @@ restart:
/*
* Try to insert a new dquot into the in-core cache. If an error occurs the
* caller should throw away the dquot and start over. Otherwise, the dquot
- * is returned locked (and held by the cache) as if there had been a cache
- * hit.
+ * is returned (and held by the cache) as if there had been a cache hit.
*
* The insert needs to be done under memalloc_nofs context because the radix
* tree can do memory allocation during insert. The qi->qi_tree_lock is taken in
@@ -848,11 +844,12 @@ restart:
static int
xfs_qm_dqget_cache_insert(
struct xfs_mount *mp,
- struct xfs_quotainfo *qi,
- struct radix_tree_root *tree,
xfs_dqid_t id,
+ xfs_dqtype_t type,
struct xfs_dquot *dqp)
{
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
unsigned int nofs_flags;
int error;
@@ -860,14 +857,11 @@ xfs_qm_dqget_cache_insert(
mutex_lock(&qi->qi_tree_lock);
error = radix_tree_insert(tree, id, dqp);
if (unlikely(error)) {
- /* Duplicate found! Caller must try again. */
trace_xfs_dqget_dup(dqp);
goto out_unlock;
}
- /* Return a locked dquot to the caller, with a reference taken. */
- xfs_dqlock(dqp);
- dqp->q_nrefs = 1;
+ lockref_init(&dqp->q_lockref);
qi->qi_dquots++;
out_unlock:
@@ -903,7 +897,7 @@ xfs_qm_dqget_checks(
/*
* Given the file system, id, and type (UDQUOT/GDQUOT/PDQUOT), return a
- * locked dquot, doing an allocation (if requested) as needed.
+ * dquot, doing an allocation (if requested) as needed.
*/
int
xfs_qm_dqget(
@@ -913,8 +907,6 @@ xfs_qm_dqget(
bool can_alloc,
struct xfs_dquot **O_dqpp)
{
- struct xfs_quotainfo *qi = mp->m_quotainfo;
- struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
struct xfs_dquot *dqp;
int error;
@@ -923,28 +915,30 @@ xfs_qm_dqget(
return error;
restart:
- dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id);
- if (dqp) {
- *O_dqpp = dqp;
- return 0;
- }
+ dqp = xfs_qm_dqget_cache_lookup(mp, id, type);
+ if (dqp)
+ goto found;
error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp);
if (error)
return error;
- error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp);
+ error = xfs_qm_dqget_cache_insert(mp, id, type, dqp);
if (error) {
- /*
- * Duplicate found. Just throw away the new dquot and start
- * over.
- */
xfs_qm_dqdestroy(dqp);
- XFS_STATS_INC(mp, xs_qm_dquot_dups);
- goto restart;
+ if (error == -EEXIST) {
+ /*
+ * Duplicate found. Just throw away the new dquot and
+ * start over.
+ */
+ XFS_STATS_INC(mp, xs_qm_dquot_dups);
+ goto restart;
+ }
+ return error;
}
trace_xfs_dqget_miss(dqp);
+found:
*O_dqpp = dqp;
return 0;
}
@@ -999,15 +993,16 @@ xfs_qm_dqget_inode(
struct xfs_inode *ip,
xfs_dqtype_t type,
bool can_alloc,
- struct xfs_dquot **O_dqpp)
+ struct xfs_dquot **dqpp)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_quotainfo *qi = mp->m_quotainfo;
- struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
struct xfs_dquot *dqp;
xfs_dqid_t id;
int error;
+ ASSERT(!*dqpp);
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
+
error = xfs_qm_dqget_checks(mp, type);
if (error)
return error;
@@ -1019,11 +1014,9 @@ xfs_qm_dqget_inode(
id = xfs_qm_id_for_quotatype(ip, type);
restart:
- dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id);
- if (dqp) {
- *O_dqpp = dqp;
- return 0;
- }
+ dqp = xfs_qm_dqget_cache_lookup(mp, id, type);
+ if (dqp)
+ goto found;
/*
* Dquot cache miss. We don't want to keep the inode lock across
@@ -1049,7 +1042,6 @@ restart:
if (dqp1) {
xfs_qm_dqdestroy(dqp);
dqp = dqp1;
- xfs_dqlock(dqp);
goto dqret;
}
} else {
@@ -1058,21 +1050,26 @@ restart:
return -ESRCH;
}
- error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp);
+ error = xfs_qm_dqget_cache_insert(mp, id, type, dqp);
if (error) {
- /*
- * Duplicate found. Just throw away the new dquot and start
- * over.
- */
xfs_qm_dqdestroy(dqp);
- XFS_STATS_INC(mp, xs_qm_dquot_dups);
- goto restart;
+ if (error == -EEXIST) {
+ /*
+ * Duplicate found. Just throw away the new dquot and
+ * start over.
+ */
+ XFS_STATS_INC(mp, xs_qm_dquot_dups);
+ goto restart;
+ }
+ return error;
}
dqret:
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
trace_xfs_dqget_miss(dqp);
- *O_dqpp = dqp;
+found:
+ trace_xfs_dqattach_get(dqp);
+ *dqpp = dqp;
return 0;
}
@@ -1098,63 +1095,41 @@ xfs_qm_dqget_next(
else if (error != 0)
break;
+ mutex_lock(&dqp->q_qlock);
if (!XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
*dqpp = dqp;
return 0;
}
- xfs_qm_dqput(dqp);
+ mutex_unlock(&dqp->q_qlock);
+ xfs_qm_dqrele(dqp);
}
return error;
}
/*
- * Release a reference to the dquot (decrement ref-count) and unlock it.
- *
- * If there is a group quota attached to this dquot, carefully release that
- * too without tripping over deadlocks'n'stuff.
+ * Release a reference to the dquot.
*/
void
-xfs_qm_dqput(
+xfs_qm_dqrele(
struct xfs_dquot *dqp)
{
- ASSERT(dqp->q_nrefs > 0);
- ASSERT(XFS_DQ_IS_LOCKED(dqp));
+ if (!dqp)
+ return;
- trace_xfs_dqput(dqp);
+ trace_xfs_dqrele(dqp);
- if (--dqp->q_nrefs == 0) {
+ if (lockref_put_or_lock(&dqp->q_lockref))
+ return;
+ if (!--dqp->q_lockref.count) {
struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
- trace_xfs_dqput_free(dqp);
+ trace_xfs_dqrele_free(dqp);
if (list_lru_add_obj(&qi->qi_lru, &dqp->q_lru))
XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused);
}
- xfs_dqunlock(dqp);
-}
-
-/*
- * Release a dquot. Flush it if dirty, then dqput() it.
- * dquot must not be locked.
- */
-void
-xfs_qm_dqrele(
- struct xfs_dquot *dqp)
-{
- if (!dqp)
- return;
-
- trace_xfs_dqrele(dqp);
-
- xfs_dqlock(dqp);
- /*
- * We don't care to flush it if the dquot is dirty here.
- * That will create stutters that we want to avoid.
- * Instead we do a delayed write when we try to reclaim
- * a dirty dquot. Also xfs_sync will take part of the burden...
- */
- xfs_qm_dqput(dqp);
+ spin_unlock(&dqp->q_lockref.lock);
}
/*
@@ -1186,9 +1161,8 @@ xfs_qm_dqflush_done(
if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) &&
(lip->li_lsn == qlip->qli_flush_lsn ||
test_bit(XFS_LI_FAILED, &lip->li_flags))) {
-
spin_lock(&ailp->ail_lock);
- xfs_clear_li_failed(lip);
+ clear_bit(XFS_LI_FAILED, &lip->li_flags);
if (lip->li_lsn == qlip->qli_flush_lsn) {
/* xfs_ail_update_finish() drops the AIL lock */
tail_lsn = xfs_ail_delete_one(ailp, lip);
@@ -1399,11 +1373,9 @@ xfs_qm_dqflush(
ASSERT(XFS_DQ_IS_LOCKED(dqp));
ASSERT(!completion_done(&dqp->q_flush));
+ ASSERT(atomic_read(&dqp->q_pincount) == 0);
trace_xfs_dqflush(dqp);
-
- xfs_qm_dqunpin_wait(dqp);
-
fa = xfs_qm_dqflush_check(dqp);
if (fa) {
xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS",
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 61217adf5ba5..bbb824adca82 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -71,7 +71,7 @@ struct xfs_dquot {
xfs_dqtype_t q_type;
uint16_t q_flags;
xfs_dqid_t q_id;
- uint q_nrefs;
+ struct lockref q_lockref;
int q_bufoffset;
xfs_daddr_t q_blkno;
xfs_fileoff_t q_fileoffset;
@@ -121,21 +121,6 @@ static inline void xfs_dqfunlock(struct xfs_dquot *dqp)
complete(&dqp->q_flush);
}
-static inline int xfs_dqlock_nowait(struct xfs_dquot *dqp)
-{
- return mutex_trylock(&dqp->q_qlock);
-}
-
-static inline void xfs_dqlock(struct xfs_dquot *dqp)
-{
- mutex_lock(&dqp->q_qlock);
-}
-
-static inline void xfs_dqunlock(struct xfs_dquot *dqp)
-{
- mutex_unlock(&dqp->q_qlock);
-}
-
static inline int
xfs_dquot_type(const struct xfs_dquot *dqp)
{
@@ -233,7 +218,6 @@ int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id,
int xfs_qm_dqget_uncached(struct xfs_mount *mp,
xfs_dqid_t id, xfs_dqtype_t type,
struct xfs_dquot **dqpp);
-void xfs_qm_dqput(struct xfs_dquot *dqp);
void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
void xfs_dqlockn(struct xfs_dqtrx *q);
@@ -246,9 +230,7 @@ void xfs_dquot_detach_buf(struct xfs_dquot *dqp);
static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
{
- xfs_dqlock(dqp);
- dqp->q_nrefs++;
- xfs_dqunlock(dqp);
+ lockref_get(&dqp->q_lockref);
return dqp;
}
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 271b195ebb93..b374cd9f1900 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -132,7 +132,7 @@ xfs_qm_dquot_logitem_push(
if (atomic_read(&dqp->q_pincount) > 0)
return XFS_ITEM_PINNED;
- if (!xfs_dqlock_nowait(dqp))
+ if (!mutex_trylock(&dqp->q_qlock))
return XFS_ITEM_LOCKED;
/*
@@ -177,7 +177,7 @@ xfs_qm_dquot_logitem_push(
out_relock_ail:
spin_lock(&lip->li_ailp->ail_lock);
out_unlock:
- xfs_dqunlock(dqp);
+ mutex_unlock(&dqp->q_qlock);
return rval;
}
@@ -195,7 +195,7 @@ xfs_qm_dquot_logitem_release(
* transaction layer, within trans_commit. Hence, no LI_HOLD flag
* for the logitem.
*/
- xfs_dqunlock(dqp);
+ mutex_unlock(&dqp->q_qlock);
}
STATIC void
diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c
index 2c2720ce6923..89bc9bcaf51e 100644
--- a/fs/xfs/xfs_dquot_item_recover.c
+++ b/fs/xfs/xfs_dquot_item_recover.c
@@ -34,10 +34,10 @@ xlog_recover_dquot_ra_pass2(
if (mp->m_qflags == 0)
return;
- recddq = item->ri_buf[1].i_addr;
+ recddq = item->ri_buf[1].iov_base;
if (recddq == NULL)
return;
- if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
+ if (item->ri_buf[1].iov_len < sizeof(struct xfs_disk_dquot))
return;
type = recddq->d_type & XFS_DQTYPE_REC_MASK;
@@ -45,7 +45,7 @@ xlog_recover_dquot_ra_pass2(
if (log->l_quotaoffs_flag & type)
return;
- dq_f = item->ri_buf[0].i_addr;
+ dq_f = item->ri_buf[0].iov_base;
ASSERT(dq_f);
ASSERT(dq_f->qlf_len == 1);
@@ -79,14 +79,14 @@ xlog_recover_dquot_commit_pass2(
if (mp->m_qflags == 0)
return 0;
- recddq = item->ri_buf[1].i_addr;
+ recddq = item->ri_buf[1].iov_base;
if (recddq == NULL) {
xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
return -EFSCORRUPTED;
}
- if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) {
- xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
- item->ri_buf[1].i_len, __func__);
+ if (item->ri_buf[1].iov_len < sizeof(struct xfs_disk_dquot)) {
+ xfs_alert(log->l_mp, "dquot too small (%zd) in %s.",
+ item->ri_buf[1].iov_len, __func__);
return -EFSCORRUPTED;
}
@@ -108,7 +108,7 @@ xlog_recover_dquot_commit_pass2(
* The other possibility, of course, is that the quota subsystem was
* removed since the last mount - ENOSYS.
*/
- dq_f = item->ri_buf[0].i_addr;
+ dq_f = item->ri_buf[0].iov_base;
ASSERT(dq_f);
fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id);
if (fa) {
@@ -147,7 +147,7 @@ xlog_recover_dquot_commit_pass2(
}
}
- memcpy(ddq, recddq, item->ri_buf[1].i_len);
+ memcpy(ddq, recddq, item->ri_buf[1].iov_len);
if (xfs_has_crc(mp)) {
xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
@@ -192,7 +192,7 @@ xlog_recover_quotaoff_commit_pass1(
struct xlog *log,
struct xlog_recover_item *item)
{
- struct xfs_qoff_logformat *qoff_f = item->ri_buf[0].i_addr;
+ struct xfs_qoff_logformat *qoff_f = item->ri_buf[0].iov_base;
ASSERT(qoff_f);
/*
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index dbd87e137694..39830b252ac8 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -10,61 +10,17 @@
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
-#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_sysfs.h"
#include "xfs_inode.h"
#ifdef DEBUG
-static unsigned int xfs_errortag_random_default[] = {
- XFS_RANDOM_DEFAULT,
- XFS_RANDOM_IFLUSH_1,
- XFS_RANDOM_IFLUSH_2,
- XFS_RANDOM_IFLUSH_3,
- XFS_RANDOM_IFLUSH_4,
- XFS_RANDOM_IFLUSH_5,
- XFS_RANDOM_IFLUSH_6,
- XFS_RANDOM_DA_READ_BUF,
- XFS_RANDOM_BTREE_CHECK_LBLOCK,
- XFS_RANDOM_BTREE_CHECK_SBLOCK,
- XFS_RANDOM_ALLOC_READ_AGF,
- XFS_RANDOM_IALLOC_READ_AGI,
- XFS_RANDOM_ITOBP_INOTOBP,
- XFS_RANDOM_IUNLINK,
- XFS_RANDOM_IUNLINK_REMOVE,
- XFS_RANDOM_DIR_INO_VALIDATE,
- XFS_RANDOM_BULKSTAT_READ_CHUNK,
- XFS_RANDOM_IODONE_IOERR,
- XFS_RANDOM_STRATREAD_IOERR,
- XFS_RANDOM_STRATCMPL_IOERR,
- XFS_RANDOM_DIOWRITE_IOERR,
- XFS_RANDOM_BMAPIFORMAT,
- XFS_RANDOM_FREE_EXTENT,
- XFS_RANDOM_RMAP_FINISH_ONE,
- XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE,
- XFS_RANDOM_REFCOUNT_FINISH_ONE,
- XFS_RANDOM_BMAP_FINISH_ONE,
- XFS_RANDOM_AG_RESV_CRITICAL,
- 0, /* XFS_RANDOM_DROP_WRITES has been removed */
- XFS_RANDOM_LOG_BAD_CRC,
- XFS_RANDOM_LOG_ITEM_PIN,
- XFS_RANDOM_BUF_LRU_REF,
- XFS_RANDOM_FORCE_SCRUB_REPAIR,
- XFS_RANDOM_FORCE_SUMMARY_RECALC,
- XFS_RANDOM_IUNLINK_FALLBACK,
- XFS_RANDOM_BUF_IOERROR,
- XFS_RANDOM_REDUCE_MAX_IEXTENTS,
- XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT,
- XFS_RANDOM_AG_RESV_FAIL,
- XFS_RANDOM_LARP,
- XFS_RANDOM_DA_LEAF_SPLIT,
- XFS_RANDOM_ATTR_LEAF_TO_NODE,
- XFS_RANDOM_WB_DELAY_MS,
- XFS_RANDOM_WRITE_DELAY_MS,
- XFS_RANDOM_EXCHMAPS_FINISH_ONE,
- XFS_RANDOM_METAFILE_RESV_CRITICAL,
-};
+#define XFS_ERRTAG(_tag, _name, _default) \
+ [XFS_ERRTAG_##_tag] = (_default),
+#include "xfs_errortag.h"
+static const unsigned int xfs_errortag_random_default[] = { XFS_ERRTAGS };
+#undef XFS_ERRTAG
struct xfs_errortag_attr {
struct attribute attr;
@@ -93,21 +49,18 @@ xfs_errortag_attr_store(
size_t count)
{
struct xfs_mount *mp = to_mp(kobject);
- struct xfs_errortag_attr *xfs_attr = to_attr(attr);
+ unsigned int error_tag = to_attr(attr)->tag;
int ret;
- unsigned int val;
if (strcmp(buf, "default") == 0) {
- val = xfs_errortag_random_default[xfs_attr->tag];
+ mp->m_errortag[error_tag] =
+ xfs_errortag_random_default[error_tag];
} else {
- ret = kstrtouint(buf, 0, &val);
+ ret = kstrtouint(buf, 0, &mp->m_errortag[error_tag]);
if (ret)
return ret;
}
- ret = xfs_errortag_set(mp, xfs_attr->tag, val);
- if (ret)
- return ret;
return count;
}
@@ -118,10 +71,9 @@ xfs_errortag_attr_show(
char *buf)
{
struct xfs_mount *mp = to_mp(kobject);
- struct xfs_errortag_attr *xfs_attr = to_attr(attr);
+ unsigned int error_tag = to_attr(attr)->tag;
- return snprintf(buf, PAGE_SIZE, "%u\n",
- xfs_errortag_get(mp, xfs_attr->tag));
+ return snprintf(buf, PAGE_SIZE, "%u\n", mp->m_errortag[error_tag]);
}
static const struct sysfs_ops xfs_errortag_sysfs_ops = {
@@ -129,110 +81,28 @@ static const struct sysfs_ops xfs_errortag_sysfs_ops = {
.store = xfs_errortag_attr_store,
};
-#define XFS_ERRORTAG_ATTR_RW(_name, _tag) \
+#define XFS_ERRTAG(_tag, _name, _default) \
static struct xfs_errortag_attr xfs_errortag_attr_##_name = { \
.attr = {.name = __stringify(_name), \
.mode = VERIFY_OCTAL_PERMISSIONS(S_IWUSR | S_IRUGO) }, \
- .tag = (_tag), \
-}
-
-#define XFS_ERRORTAG_ATTR_LIST(_name) &xfs_errortag_attr_##_name.attr
-
-XFS_ERRORTAG_ATTR_RW(noerror, XFS_ERRTAG_NOERROR);
-XFS_ERRORTAG_ATTR_RW(iflush1, XFS_ERRTAG_IFLUSH_1);
-XFS_ERRORTAG_ATTR_RW(iflush2, XFS_ERRTAG_IFLUSH_2);
-XFS_ERRORTAG_ATTR_RW(iflush3, XFS_ERRTAG_IFLUSH_3);
-XFS_ERRORTAG_ATTR_RW(iflush4, XFS_ERRTAG_IFLUSH_4);
-XFS_ERRORTAG_ATTR_RW(iflush5, XFS_ERRTAG_IFLUSH_5);
-XFS_ERRORTAG_ATTR_RW(iflush6, XFS_ERRTAG_IFLUSH_6);
-XFS_ERRORTAG_ATTR_RW(dareadbuf, XFS_ERRTAG_DA_READ_BUF);
-XFS_ERRORTAG_ATTR_RW(btree_chk_lblk, XFS_ERRTAG_BTREE_CHECK_LBLOCK);
-XFS_ERRORTAG_ATTR_RW(btree_chk_sblk, XFS_ERRTAG_BTREE_CHECK_SBLOCK);
-XFS_ERRORTAG_ATTR_RW(readagf, XFS_ERRTAG_ALLOC_READ_AGF);
-XFS_ERRORTAG_ATTR_RW(readagi, XFS_ERRTAG_IALLOC_READ_AGI);
-XFS_ERRORTAG_ATTR_RW(itobp, XFS_ERRTAG_ITOBP_INOTOBP);
-XFS_ERRORTAG_ATTR_RW(iunlink, XFS_ERRTAG_IUNLINK);
-XFS_ERRORTAG_ATTR_RW(iunlinkrm, XFS_ERRTAG_IUNLINK_REMOVE);
-XFS_ERRORTAG_ATTR_RW(dirinovalid, XFS_ERRTAG_DIR_INO_VALIDATE);
-XFS_ERRORTAG_ATTR_RW(bulkstat, XFS_ERRTAG_BULKSTAT_READ_CHUNK);
-XFS_ERRORTAG_ATTR_RW(logiodone, XFS_ERRTAG_IODONE_IOERR);
-XFS_ERRORTAG_ATTR_RW(stratread, XFS_ERRTAG_STRATREAD_IOERR);
-XFS_ERRORTAG_ATTR_RW(stratcmpl, XFS_ERRTAG_STRATCMPL_IOERR);
-XFS_ERRORTAG_ATTR_RW(diowrite, XFS_ERRTAG_DIOWRITE_IOERR);
-XFS_ERRORTAG_ATTR_RW(bmapifmt, XFS_ERRTAG_BMAPIFORMAT);
-XFS_ERRORTAG_ATTR_RW(free_extent, XFS_ERRTAG_FREE_EXTENT);
-XFS_ERRORTAG_ATTR_RW(rmap_finish_one, XFS_ERRTAG_RMAP_FINISH_ONE);
-XFS_ERRORTAG_ATTR_RW(refcount_continue_update, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE);
-XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE);
-XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE);
-XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL);
-XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC);
-XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN);
-XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF);
-XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR);
-XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC);
-XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK);
-XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR);
-XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS);
-XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT);
-XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL);
-XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP);
-XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT);
-XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE);
-XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS);
-XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS);
-XFS_ERRORTAG_ATTR_RW(exchmaps_finish_one, XFS_ERRTAG_EXCHMAPS_FINISH_ONE);
-XFS_ERRORTAG_ATTR_RW(metafile_resv_crit, XFS_ERRTAG_METAFILE_RESV_CRITICAL);
+ .tag = XFS_ERRTAG_##_tag, \
+};
+#include "xfs_errortag.h"
+XFS_ERRTAGS
+#undef XFS_ERRTAG
+#define XFS_ERRTAG(_tag, _name, _default) \
+ &xfs_errortag_attr_##_name.attr,
+#include "xfs_errortag.h"
static struct attribute *xfs_errortag_attrs[] = {
- XFS_ERRORTAG_ATTR_LIST(noerror),
- XFS_ERRORTAG_ATTR_LIST(iflush1),
- XFS_ERRORTAG_ATTR_LIST(iflush2),
- XFS_ERRORTAG_ATTR_LIST(iflush3),
- XFS_ERRORTAG_ATTR_LIST(iflush4),
- XFS_ERRORTAG_ATTR_LIST(iflush5),
- XFS_ERRORTAG_ATTR_LIST(iflush6),
- XFS_ERRORTAG_ATTR_LIST(dareadbuf),
- XFS_ERRORTAG_ATTR_LIST(btree_chk_lblk),
- XFS_ERRORTAG_ATTR_LIST(btree_chk_sblk),
- XFS_ERRORTAG_ATTR_LIST(readagf),
- XFS_ERRORTAG_ATTR_LIST(readagi),
- XFS_ERRORTAG_ATTR_LIST(itobp),
- XFS_ERRORTAG_ATTR_LIST(iunlink),
- XFS_ERRORTAG_ATTR_LIST(iunlinkrm),
- XFS_ERRORTAG_ATTR_LIST(dirinovalid),
- XFS_ERRORTAG_ATTR_LIST(bulkstat),
- XFS_ERRORTAG_ATTR_LIST(logiodone),
- XFS_ERRORTAG_ATTR_LIST(stratread),
- XFS_ERRORTAG_ATTR_LIST(stratcmpl),
- XFS_ERRORTAG_ATTR_LIST(diowrite),
- XFS_ERRORTAG_ATTR_LIST(bmapifmt),
- XFS_ERRORTAG_ATTR_LIST(free_extent),
- XFS_ERRORTAG_ATTR_LIST(rmap_finish_one),
- XFS_ERRORTAG_ATTR_LIST(refcount_continue_update),
- XFS_ERRORTAG_ATTR_LIST(refcount_finish_one),
- XFS_ERRORTAG_ATTR_LIST(bmap_finish_one),
- XFS_ERRORTAG_ATTR_LIST(ag_resv_critical),
- XFS_ERRORTAG_ATTR_LIST(log_bad_crc),
- XFS_ERRORTAG_ATTR_LIST(log_item_pin),
- XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
- XFS_ERRORTAG_ATTR_LIST(force_repair),
- XFS_ERRORTAG_ATTR_LIST(bad_summary),
- XFS_ERRORTAG_ATTR_LIST(iunlink_fallback),
- XFS_ERRORTAG_ATTR_LIST(buf_ioerror),
- XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents),
- XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent),
- XFS_ERRORTAG_ATTR_LIST(ag_resv_fail),
- XFS_ERRORTAG_ATTR_LIST(larp),
- XFS_ERRORTAG_ATTR_LIST(da_leaf_split),
- XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node),
- XFS_ERRORTAG_ATTR_LIST(wb_delay_ms),
- XFS_ERRORTAG_ATTR_LIST(write_delay_ms),
- XFS_ERRORTAG_ATTR_LIST(exchmaps_finish_one),
- XFS_ERRORTAG_ATTR_LIST(metafile_resv_crit),
- NULL,
+ XFS_ERRTAGS
+ NULL
};
ATTRIBUTE_GROUPS(xfs_errortag);
+#undef XFS_ERRTAG
+
+/* -1 because XFS_ERRTAG_DROP_WRITES got removed, + 1 for NULL termination */
+static_assert(ARRAY_SIZE(xfs_errortag_attrs) == XFS_ERRTAG_MAX);
static const struct kobj_type xfs_errortag_ktype = {
.release = xfs_sysfs_release,
@@ -295,7 +165,6 @@ xfs_errortag_enabled(
bool
xfs_errortag_test(
struct xfs_mount *mp,
- const char *expression,
const char *file,
int line,
unsigned int error_tag)
@@ -321,36 +190,12 @@ xfs_errortag_test(
return false;
xfs_warn_ratelimited(mp,
-"Injecting error (%s) at file %s, line %d, on filesystem \"%s\"",
- expression, file, line, mp->m_super->s_id);
+"Injecting error at file %s, line %d, on filesystem \"%s\"",
+ file, line, mp->m_super->s_id);
return true;
}
int
-xfs_errortag_get(
- struct xfs_mount *mp,
- unsigned int error_tag)
-{
- if (!xfs_errortag_valid(error_tag))
- return -EINVAL;
-
- return mp->m_errortag[error_tag];
-}
-
-int
-xfs_errortag_set(
- struct xfs_mount *mp,
- unsigned int error_tag,
- unsigned int tag_value)
-{
- if (!xfs_errortag_valid(error_tag))
- return -EINVAL;
-
- mp->m_errortag[error_tag] = tag_value;
- return 0;
-}
-
-int
xfs_errortag_add(
struct xfs_mount *mp,
unsigned int error_tag)
@@ -359,9 +204,8 @@ xfs_errortag_add(
if (!xfs_errortag_valid(error_tag))
return -EINVAL;
-
- return xfs_errortag_set(mp, error_tag,
- xfs_errortag_random_default[error_tag]);
+ mp->m_errortag[error_tag] = xfs_errortag_random_default[error_tag];
+ return 0;
}
int
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 0b9c5ba8a598..fe6a71bbe9cd 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -8,22 +8,17 @@
struct xfs_mount;
-extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
- const char *filename, int linenum,
- xfs_failaddr_t failaddr);
-extern void xfs_corruption_error(const char *tag, int level,
- struct xfs_mount *mp, const void *buf, size_t bufsize,
- const char *filename, int linenum,
- xfs_failaddr_t failaddr);
+void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
+ const char *filename, int linenum, xfs_failaddr_t failaddr);
+void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp,
+ const void *buf, size_t bufsize, const char *filename,
+ int linenum, xfs_failaddr_t failaddr);
void xfs_buf_corruption_error(struct xfs_buf *bp, xfs_failaddr_t fa);
-extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error,
- const char *name, const void *buf, size_t bufsz,
- xfs_failaddr_t failaddr);
-extern void xfs_verifier_error(struct xfs_buf *bp, int error,
- xfs_failaddr_t failaddr);
-extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error,
- const char *name, const void *buf, size_t bufsz,
- xfs_failaddr_t failaddr);
+void xfs_buf_verifier_error(struct xfs_buf *bp, int error, const char *name,
+ const void *buf, size_t bufsz, xfs_failaddr_t failaddr);
+void xfs_verifier_error(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr);
+void xfs_inode_verifier_error(struct xfs_inode *ip, int error, const char *name,
+ const void *buf, size_t bufsz, xfs_failaddr_t failaddr);
#define XFS_ERROR_REPORT(e, lvl, mp) \
xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
@@ -39,12 +34,12 @@ extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error,
#define XFS_CORRUPTION_DUMP_LEN (128)
#ifdef DEBUG
-extern int xfs_errortag_init(struct xfs_mount *mp);
-extern void xfs_errortag_del(struct xfs_mount *mp);
-extern bool xfs_errortag_test(struct xfs_mount *mp, const char *expression,
- const char *file, int line, unsigned int error_tag);
-#define XFS_TEST_ERROR(expr, mp, tag) \
- ((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag)))
+int xfs_errortag_init(struct xfs_mount *mp);
+void xfs_errortag_del(struct xfs_mount *mp);
+bool xfs_errortag_test(struct xfs_mount *mp, const char *file, int line,
+ unsigned int error_tag);
+#define XFS_TEST_ERROR(mp, tag) \
+ xfs_errortag_test((mp), __FILE__, __LINE__, (tag))
bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag);
#define XFS_ERRORTAG_DELAY(mp, tag) \
do { \
@@ -58,17 +53,13 @@ bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag);
mdelay((mp)->m_errortag[(tag)]); \
} while (0)
-extern int xfs_errortag_get(struct xfs_mount *mp, unsigned int error_tag);
-extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag,
- unsigned int tag_value);
-extern int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag);
-extern int xfs_errortag_clearall(struct xfs_mount *mp);
+int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag);
+int xfs_errortag_clearall(struct xfs_mount *mp);
#else
#define xfs_errortag_init(mp) (0)
#define xfs_errortag_del(mp)
-#define XFS_TEST_ERROR(expr, mp, tag) (expr)
+#define XFS_TEST_ERROR(mp, tag) (false)
#define XFS_ERRORTAG_DELAY(mp, tag) ((void)0)
-#define xfs_errortag_set(mp, tag, val) (ENOSYS)
#define xfs_errortag_add(mp, tag) (ENOSYS)
#define xfs_errortag_clearall(mp) (ENOSYS)
#endif /* DEBUG */
diff --git a/fs/xfs/xfs_exchmaps_item.c b/fs/xfs/xfs_exchmaps_item.c
index 264a121c5e16..229cbe0adf17 100644
--- a/fs/xfs/xfs_exchmaps_item.c
+++ b/fs/xfs/xfs_exchmaps_item.c
@@ -558,12 +558,12 @@ xlog_recover_xmi_commit_pass2(
size_t len;
len = sizeof(struct xfs_xmi_log_format);
- if (item->ri_buf[0].i_len != len) {
+ if (item->ri_buf[0].iov_len != len) {
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
return -EFSCORRUPTED;
}
- xmi_formatp = item->ri_buf[0].i_addr;
+ xmi_formatp = item->ri_buf[0].iov_base;
if (xmi_formatp->__pad != 0) {
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
return -EFSCORRUPTED;
@@ -598,8 +598,8 @@ xlog_recover_xmd_commit_pass2(
{
struct xfs_xmd_log_format *xmd_formatp;
- xmd_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_xmd_log_format)) {
+ xmd_formatp = item->ri_buf[0].iov_base;
+ if (item->ri_buf[0].iov_len != sizeof(struct xfs_xmd_log_format)) {
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index f069b04e8ea1..3e6e019b6146 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -68,4 +68,12 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
list_sort(NULL, list, xfs_extent_busy_ag_cmp);
}
+/*
+ * Zoned RTGs don't need to track busy extents, as the actual block freeing only
+ * happens by a zone reset, which forces out all transactions that touched the
+ * to-be-reset zone first.
+ */
+#define xfs_group_has_extent_busy(mp, type) \
+ ((type) == XG_TYPE_AG || !xfs_has_zoned((mp)))
+
#endif /* __XFS_EXTENT_BUSY_H__ */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 777438b853da..418ddab590e0 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -83,6 +83,11 @@ xfs_efi_item_size(
*nbytes += xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents);
}
+unsigned int xfs_efi_log_space(unsigned int nr)
+{
+ return xlog_item_space(1, xfs_efi_log_format_sizeof(nr));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given efi log item. We use only 1 iovec, and we point that
@@ -177,15 +182,18 @@ xfs_efi_init(
* It will handle the conversion of formats if necessary.
*/
STATIC int
-xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
+xfs_efi_copy_format(
+ struct kvec *buf,
+ struct xfs_efi_log_format *dst_efi_fmt)
{
- xfs_efi_log_format_t *src_efi_fmt = buf->i_addr;
- uint i;
- uint len = xfs_efi_log_format_sizeof(src_efi_fmt->efi_nextents);
- uint len32 = xfs_efi_log_format32_sizeof(src_efi_fmt->efi_nextents);
- uint len64 = xfs_efi_log_format64_sizeof(src_efi_fmt->efi_nextents);
+ struct xfs_efi_log_format *src_efi_fmt = buf->iov_base;
+ uint len, len32, len64, i;
+
+ len = xfs_efi_log_format_sizeof(src_efi_fmt->efi_nextents);
+ len32 = xfs_efi_log_format32_sizeof(src_efi_fmt->efi_nextents);
+ len64 = xfs_efi_log_format64_sizeof(src_efi_fmt->efi_nextents);
- if (buf->i_len == len) {
+ if (buf->iov_len == len) {
memcpy(dst_efi_fmt, src_efi_fmt,
offsetof(struct xfs_efi_log_format, efi_extents));
for (i = 0; i < src_efi_fmt->efi_nextents; i++)
@@ -193,8 +201,8 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
&src_efi_fmt->efi_extents[i],
sizeof(struct xfs_extent));
return 0;
- } else if (buf->i_len == len32) {
- xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr;
+ } else if (buf->iov_len == len32) {
+ struct xfs_efi_log_format_32 *src_efi_fmt_32 = buf->iov_base;
dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type;
dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size;
@@ -207,8 +215,8 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
src_efi_fmt_32->efi_extents[i].ext_len;
}
return 0;
- } else if (buf->i_len == len64) {
- xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->i_addr;
+ } else if (buf->iov_len == len64) {
+ struct xfs_efi_log_format_64 *src_efi_fmt_64 = buf->iov_base;
dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type;
dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size;
@@ -222,8 +230,8 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
}
return 0;
}
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, NULL, buf->i_addr,
- buf->i_len);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, NULL, buf->iov_base,
+ buf->iov_len);
return -EFSCORRUPTED;
}
@@ -254,6 +262,11 @@ xfs_efd_item_size(
*nbytes += xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents);
}
+unsigned int xfs_efd_log_space(unsigned int nr)
+{
+ return xlog_item_space(1, xfs_efd_log_format_sizeof(nr));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given efd log item. We use only 1 iovec, and we point that
@@ -855,11 +868,11 @@ xlog_recover_efi_commit_pass2(
struct xfs_efi_log_format *efi_formatp;
int error;
- efi_formatp = item->ri_buf[0].i_addr;
+ efi_formatp = item->ri_buf[0].iov_base;
- if (item->ri_buf[0].i_len < xfs_efi_log_format_sizeof(0)) {
+ if (item->ri_buf[0].iov_len < xfs_efi_log_format_sizeof(0)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -894,11 +907,11 @@ xlog_recover_rtefi_commit_pass2(
struct xfs_efi_log_format *efi_formatp;
int error;
- efi_formatp = item->ri_buf[0].i_addr;
+ efi_formatp = item->ri_buf[0].iov_base;
- if (item->ri_buf[0].i_len < xfs_efi_log_format_sizeof(0)) {
+ if (item->ri_buf[0].iov_len < xfs_efi_log_format_sizeof(0)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -923,7 +936,7 @@ xlog_recover_rtefi_commit_pass2(
xfs_lsn_t lsn)
{
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
#endif
@@ -948,9 +961,9 @@ xlog_recover_efd_commit_pass2(
xfs_lsn_t lsn)
{
struct xfs_efd_log_format *efd_formatp;
- int buflen = item->ri_buf[0].i_len;
+ int buflen = item->ri_buf[0].iov_len;
- efd_formatp = item->ri_buf[0].i_addr;
+ efd_formatp = item->ri_buf[0].iov_base;
if (buflen < sizeof(struct xfs_efd_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
@@ -958,9 +971,9 @@ xlog_recover_efd_commit_pass2(
return -EFSCORRUPTED;
}
- if (item->ri_buf[0].i_len != xfs_efd_log_format32_sizeof(
+ if (item->ri_buf[0].iov_len != xfs_efd_log_format32_sizeof(
efd_formatp->efd_nextents) &&
- item->ri_buf[0].i_len != xfs_efd_log_format64_sizeof(
+ item->ri_buf[0].iov_len != xfs_efd_log_format64_sizeof(
efd_formatp->efd_nextents)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
efd_formatp, buflen);
@@ -985,9 +998,9 @@ xlog_recover_rtefd_commit_pass2(
xfs_lsn_t lsn)
{
struct xfs_efd_log_format *efd_formatp;
- int buflen = item->ri_buf[0].i_len;
+ int buflen = item->ri_buf[0].iov_len;
- efd_formatp = item->ri_buf[0].i_addr;
+ efd_formatp = item->ri_buf[0].iov_base;
if (buflen < sizeof(struct xfs_efd_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
@@ -995,9 +1008,9 @@ xlog_recover_rtefd_commit_pass2(
return -EFSCORRUPTED;
}
- if (item->ri_buf[0].i_len != xfs_efd_log_format32_sizeof(
+ if (item->ri_buf[0].iov_len != xfs_efd_log_format32_sizeof(
efd_formatp->efd_nextents) &&
- item->ri_buf[0].i_len != xfs_efd_log_format64_sizeof(
+ item->ri_buf[0].iov_len != xfs_efd_log_format64_sizeof(
efd_formatp->efd_nextents)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
efd_formatp, buflen);
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 41b7c4306079..af1b0331f7af 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -49,7 +49,7 @@ struct xfs_efi_log_item {
struct xfs_log_item efi_item;
atomic_t efi_refcount;
atomic_t efi_next_extent;
- xfs_efi_log_format_t efi_format;
+ struct xfs_efi_log_format efi_format;
};
static inline size_t
@@ -69,7 +69,7 @@ struct xfs_efd_log_item {
struct xfs_log_item efd_item;
struct xfs_efi_log_item *efd_efip;
uint efd_next_extent;
- xfs_efd_log_format_t efd_format;
+ struct xfs_efd_log_format efd_format;
};
static inline size_t
@@ -94,4 +94,7 @@ void xfs_extent_free_defer_add(struct xfs_trans *tp,
struct xfs_extent_free_item *xefi,
struct xfs_defer_pending **dfpp);
+unsigned int xfs_efi_log_space(unsigned int nr);
+unsigned int xfs_efd_log_space(unsigned int nr);
+
#endif /* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 84f08c976ac4..7874cf745af3 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -27,6 +27,8 @@
#include "xfs_file.h"
#include "xfs_aops.h"
#include "xfs_zone_alloc.h"
+#include "xfs_error.h"
+#include "xfs_errortag.h"
#include <linux/dax.h>
#include <linux/falloc.h>
@@ -75,52 +77,47 @@ xfs_dir_fsync(
return xfs_log_force_inode(ip);
}
-static xfs_csn_t
-xfs_fsync_seq(
- struct xfs_inode *ip,
- bool datasync)
-{
- if (!xfs_ipincount(ip))
- return 0;
- if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
- return 0;
- return ip->i_itemp->ili_commit_seq;
-}
-
/*
- * All metadata updates are logged, which means that we just have to flush the
- * log up to the latest LSN that touched the inode.
+ * All metadata updates are logged, which means that we just have to push the
+ * journal to the required sequence number that holds the updates. We track
+ * datasync commits separately to full sync commits, and hence only need to
+ * select the correct sequence number for the log force here.
+ *
+ * We don't have to serialise against concurrent modifications, as we do not
+ * have to wait for modifications that have not yet completed. We define a
+ * transaction commit as completing when the commit sequence number is updated,
+ * hence if the sequence number has not updated, the sync operation has been
+ * run before the commit completed and we don't have to wait for it.
*
- * If we have concurrent fsync/fdatasync() calls, we need them to all block on
- * the log force before we clear the ili_fsync_fields field. This ensures that
- * we don't get a racing sync operation that does not wait for the metadata to
- * hit the journal before returning. If we race with clearing ili_fsync_fields,
- * then all that will happen is the log force will do nothing as the lsn will
- * already be on disk. We can't race with setting ili_fsync_fields because that
- * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
- * shared until after the ili_fsync_fields is cleared.
+ * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain
+ * set on the log item until - at least - the journal flush completes. In
+ * reality, they are only cleared when the inode is fully unpinned (i.e.
+ * persistent in the journal and not dirty in the CIL), and so we rely on
+ * xfs_log_force_seq() either skipping sequences that have been persisted or
+ * waiting on sequences that are still in flight to correctly order concurrent
+ * sync operations.
*/
-static int
+static int
xfs_fsync_flush_log(
struct xfs_inode *ip,
bool datasync,
int *log_flushed)
{
- int error = 0;
- xfs_csn_t seq;
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ xfs_csn_t seq = 0;
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- seq = xfs_fsync_seq(ip, datasync);
- if (seq) {
- error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
- log_flushed);
+ spin_lock(&iip->ili_lock);
+ if (datasync)
+ seq = iip->ili_datasync_seq;
+ else
+ seq = iip->ili_commit_seq;
+ spin_unlock(&iip->ili_lock);
- spin_lock(&ip->i_itemp->ili_lock);
- ip->i_itemp->ili_fsync_fields = 0;
- spin_unlock(&ip->i_itemp->ili_lock);
- }
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- return error;
+ if (!seq)
+ return 0;
+
+ return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
+ log_flushed);
}
STATIC int
@@ -158,12 +155,10 @@ xfs_file_fsync(
error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
/*
- * Any inode that has dirty modifications in the log is pinned. The
- * racy check here for a pinned inode will not catch modifications
- * that happen concurrently to the fsync call, but fsync semantics
- * only require to sync previously completed I/O.
+ * If the inode has an inode log item attached, it may need the journal
+ * flushed to persist any changes the log item might be tracking.
*/
- if (xfs_ipincount(ip)) {
+ if (ip->i_itemp) {
err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
if (err2 && !error)
error = err2;
@@ -497,7 +492,7 @@ restart:
static ssize_t
xfs_zoned_write_space_reserve(
- struct xfs_inode *ip,
+ struct xfs_mount *mp,
struct kiocb *iocb,
struct iov_iter *from,
unsigned int flags,
@@ -533,8 +528,8 @@ xfs_zoned_write_space_reserve(
*
* Any remaining block will be returned after the write.
*/
- return xfs_zoned_space_reserve(ip,
- XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2, flags, ac);
+ return xfs_zoned_space_reserve(mp, XFS_B_TO_FSB(mp, count) + 1 + 2,
+ flags, ac);
}
static int
@@ -576,7 +571,10 @@ xfs_dio_write_end_io(
nofs_flag = memalloc_nofs_save();
if (flags & IOMAP_DIO_COW) {
- error = xfs_reflink_end_cow(ip, offset, size);
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ error = xfs_reflink_end_atomic_cow(ip, offset, size);
+ else
+ error = xfs_reflink_end_cow(ip, offset, size);
if (error)
goto out;
}
@@ -678,8 +676,17 @@ xfs_file_dio_write_aligned(
struct xfs_zone_alloc_ctx *ac)
{
unsigned int iolock = XFS_IOLOCK_SHARED;
+ unsigned int dio_flags = 0;
ssize_t ret;
+ /*
+ * For always COW inodes, each bio must be aligned to the file system
+ * block size and not just the device sector size because we need to
+ * allocate a block-aligned amount of space for each write.
+ */
+ if (xfs_is_always_cow_inode(ip))
+ dio_flags |= IOMAP_DIO_FSBLOCK_ALIGNED;
+
ret = xfs_ilock_iocb_for_write(iocb, &iolock);
if (ret)
return ret;
@@ -697,7 +704,7 @@ xfs_file_dio_write_aligned(
iolock = XFS_IOLOCK_SHARED;
}
trace_xfs_file_direct_write(iocb, from);
- ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
+ ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0);
out_unlock:
xfs_iunlock(ip, iolock);
return ret;
@@ -715,13 +722,79 @@ xfs_file_dio_write_zoned(
struct xfs_zone_alloc_ctx ac = { };
ssize_t ret;
- ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac);
+ ret = xfs_zoned_write_space_reserve(ip->i_mount, iocb, from, 0, &ac);
if (ret < 0)
return ret;
ret = xfs_file_dio_write_aligned(ip, iocb, from,
&xfs_zoned_direct_write_iomap_ops,
&xfs_dio_zoned_write_ops, &ac);
- xfs_zoned_space_unreserve(ip, &ac);
+ xfs_zoned_space_unreserve(ip->i_mount, &ac);
+ return ret;
+}
+
+/*
+ * Handle block atomic writes
+ *
+ * Two methods of atomic writes are supported:
+ * - REQ_ATOMIC-based, which would typically use some form of HW offload in the
+ * disk
+ * - COW-based, which uses a COW fork as a staging extent for data updates
+ * before atomically updating extent mappings for the range being written
+ *
+ */
+static noinline ssize_t
+xfs_file_dio_write_atomic(
+ struct xfs_inode *ip,
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ unsigned int iolock = XFS_IOLOCK_SHARED;
+ ssize_t ret, ocount = iov_iter_count(from);
+ const struct iomap_ops *dops;
+
+ /*
+ * HW offload should be faster, so try that first if it is already
+ * known that the write length is not too large.
+ */
+ if (ocount > xfs_inode_buftarg(ip)->bt_awu_max)
+ dops = &xfs_atomic_write_cow_iomap_ops;
+ else
+ dops = &xfs_direct_write_iomap_ops;
+
+retry:
+ ret = xfs_ilock_iocb_for_write(iocb, &iolock);
+ if (ret)
+ return ret;
+
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
+ if (ret)
+ goto out_unlock;
+
+ /* Demote similar to xfs_file_dio_write_aligned() */
+ if (iolock == XFS_IOLOCK_EXCL) {
+ xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ iolock = XFS_IOLOCK_SHARED;
+ }
+
+ trace_xfs_file_direct_write(iocb, from);
+ ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
+ 0, NULL, 0);
+
+ /*
+ * The retry mechanism is based on the ->iomap_begin method returning
+ * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not
+ * possible. The REQ_ATOMIC-based method is typically not possible if
+ * the write spans multiple extents or the disk blocks are misaligned.
+ */
+ if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) {
+ xfs_iunlock(ip, iolock);
+ dops = &xfs_atomic_write_cow_iomap_ops;
+ goto retry;
+ }
+
+out_unlock:
+ if (iolock)
+ xfs_iunlock(ip, iolock);
return ret;
}
@@ -828,18 +901,12 @@ xfs_file_dio_write(
if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
return -EINVAL;
- /*
- * For always COW inodes we also must check the alignment of each
- * individual iovec segment, as they could end up with different
- * I/Os due to the way bio_iov_iter_get_pages works, and we'd
- * then overwrite an already written block.
- */
- if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
- (xfs_is_always_cow_inode(ip) &&
- (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
+ if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
return xfs_file_dio_write_unaligned(ip, iocb, from);
if (xfs_is_zoned_inode(ip))
return xfs_file_dio_write_zoned(ip, iocb, from);
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ return xfs_file_dio_write_atomic(ip, iocb, from);
return xfs_file_dio_write_aligned(ip, iocb, from,
&xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
}
@@ -908,7 +975,8 @@ write_retry:
trace_xfs_file_buffered_write(iocb, from);
ret = iomap_file_buffered_write(iocb, from,
- &xfs_buffered_write_iomap_ops, NULL);
+ &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
+ NULL);
/*
* If we hit a space limit, try to free up some lingering preallocated
@@ -961,7 +1029,7 @@ xfs_file_buffered_write_zoned(
struct xfs_zone_alloc_ctx ac = { };
ssize_t ret;
- ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac);
+ ret = xfs_zoned_write_space_reserve(mp, iocb, from, XFS_ZR_GREEDY, &ac);
if (ret < 0)
return ret;
@@ -988,7 +1056,8 @@ xfs_file_buffered_write_zoned(
retry:
trace_xfs_file_buffered_write(iocb, from);
ret = iomap_file_buffered_write(iocb, from,
- &xfs_buffered_write_iomap_ops, &ac);
+ &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
+ &ac);
if (ret == -ENOSPC && !cleared_space) {
/*
* Kick off writeback to convert delalloc space and release the
@@ -1002,7 +1071,7 @@ retry:
out_unlock:
xfs_iunlock(ip, iolock);
out_unreserve:
- xfs_zoned_space_unreserve(ip, &ac);
+ xfs_zoned_space_unreserve(ip->i_mount, &ac);
if (ret > 0) {
XFS_STATS_ADD(mp, xs_write_bytes, ret);
ret = generic_write_sync(iocb, ret);
@@ -1028,23 +1097,21 @@ xfs_file_write_iter(
if (xfs_is_shutdown(ip->i_mount))
return -EIO;
- if (IS_DAX(inode))
- return xfs_file_dax_write(iocb, from);
-
if (iocb->ki_flags & IOCB_ATOMIC) {
- /*
- * Currently only atomic writing of a single FS block is
- * supported. It would be possible to atomic write smaller than
- * a FS block, but there is no requirement to support this.
- * Note that iomap also does not support this yet.
- */
- if (ocount != ip->i_mount->m_sb.sb_blocksize)
+ if (ocount < xfs_get_atomic_write_min(ip))
return -EINVAL;
+
+ if (ocount > xfs_get_atomic_write_max(ip))
+ return -EINVAL;
+
ret = generic_atomic_write_valid(iocb, from);
if (ret)
return ret;
}
+ if (IS_DAX(inode))
+ return xfs_file_dax_write(iocb, from);
+
if (iocb->ki_flags & IOCB_DIRECT) {
/*
* Allow a directio write to fall back to a buffered
@@ -1174,6 +1241,38 @@ xfs_falloc_insert_range(
}
/*
+ * For various operations we need to zero up to one block at each end of
+ * the affected range. For zoned file systems this will require a space
+ * allocation, for which we need a reservation ahead of time.
+ */
+#define XFS_ZONED_ZERO_EDGE_SPACE_RES 2
+
+/*
+ * Zero range implements a full zeroing mechanism but is only used in limited
+ * situations. It is more efficient to allocate unwritten extents than to
+ * perform zeroing here, so use an errortag to randomly force zeroing on DEBUG
+ * kernels for added test coverage.
+ *
+ * On zoned file systems, the error is already injected by
+ * xfs_file_zoned_fallocate, which then reserves the additional space needed.
+ * We only check for this extra space reservation here.
+ */
+static inline bool
+xfs_falloc_force_zero(
+ struct xfs_inode *ip,
+ struct xfs_zone_alloc_ctx *ac)
+{
+ if (xfs_is_zoned_inode(ip)) {
+ if (ac->reserved_blocks > XFS_ZONED_ZERO_EDGE_SPACE_RES) {
+ ASSERT(IS_ENABLED(CONFIG_XFS_DEBUG));
+ return true;
+ }
+ return false;
+ }
+ return XFS_TEST_ERROR(ip->i_mount, XFS_ERRTAG_FORCE_ZERO_RANGE);
+}
+
+/*
* Punch a hole and prealloc the range. We use a hole punch rather than
* unwritten extent conversion for two reasons:
*
@@ -1190,23 +1289,29 @@ xfs_falloc_zero_range(
struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
+ struct xfs_inode *ip = XFS_I(inode);
unsigned int blksize = i_blocksize(inode);
loff_t new_size = 0;
int error;
- trace_xfs_zero_file_space(XFS_I(inode));
+ trace_xfs_zero_file_space(ip);
error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
if (error)
return error;
- error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
- if (error)
- return error;
+ if (xfs_falloc_force_zero(ip, ac)) {
+ error = xfs_zero_range(ip, offset, len, ac, NULL);
+ } else {
+ error = xfs_free_file_space(ip, offset, len, ac);
+ if (error)
+ return error;
- len = round_up(offset + len, blksize) - round_down(offset, blksize);
- offset = round_down(offset, blksize);
- error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+ len = round_up(offset + len, blksize) -
+ round_down(offset, blksize);
+ offset = round_down(offset, blksize);
+ error = xfs_alloc_file_space(ip, offset, len);
+ }
if (error)
return error;
return xfs_falloc_setsize(file, new_size);
@@ -1266,9 +1371,10 @@ xfs_falloc_allocate_range(
}
#define XFS_FALLOC_FL_SUPPORTED \
- (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
- FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
- FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
+ (FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE | \
+ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | \
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE | \
+ FALLOC_FL_UNSHARE_RANGE)
STATIC long
__xfs_file_fallocate(
@@ -1342,13 +1448,26 @@ xfs_file_zoned_fallocate(
{
struct xfs_zone_alloc_ctx ac = { };
struct xfs_inode *ip = XFS_I(file_inode(file));
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_filblks_t count_fsb;
int error;
- error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac);
+ /*
+ * If full zeroing is forced by the error injection knob, we need a
+ * space reservation that covers the entire range. See the comment in
+ * xfs_zoned_write_space_reserve for the rationale for the calculation.
+ * Otherwise just reserve space for the two boundary blocks.
+ */
+ count_fsb = XFS_ZONED_ZERO_EDGE_SPACE_RES;
+ if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ZERO_RANGE &&
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_FORCE_ZERO_RANGE))
+ count_fsb += XFS_B_TO_FSB(mp, len) + 1;
+
+ error = xfs_zoned_space_reserve(mp, count_fsb, XFS_ZR_RESERVED, &ac);
if (error)
return error;
error = __xfs_file_fallocate(file, mode, offset, len, &ac);
- xfs_zoned_space_unreserve(ip, &ac);
+ xfs_zoned_space_unreserve(mp, &ac);
return error;
}
@@ -1488,7 +1607,7 @@ xfs_file_open(
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
- if (xfs_inode_can_atomicwrite(XFS_I(inode)))
+ if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
return generic_file_open(inode, file);
}
@@ -1660,7 +1779,7 @@ xfs_dax_fault_locked(
bool write_fault)
{
vm_fault_t ret;
- pfn_t pfn;
+ unsigned long pfn;
if (!IS_ENABLED(CONFIG_FS_DAX)) {
ASSERT(0);
@@ -1758,12 +1877,12 @@ xfs_write_fault_zoned(
* But as the overallocation is limited to less than a folio and will be
* released instantly that's just fine.
*/
- error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0,
- &ac);
+ error = xfs_zoned_space_reserve(ip->i_mount,
+ XFS_B_TO_FSB(ip->i_mount, len), 0, &ac);
if (error < 0)
return vmf_fs_error(error);
ret = __xfs_write_fault(vmf, order, &ac);
- xfs_zoned_space_unreserve(ip, &ac);
+ xfs_zoned_space_unreserve(ip->i_mount, &ac);
return ret;
}
@@ -1844,10 +1963,10 @@ static const struct vm_operations_struct xfs_file_vm_ops = {
};
STATIC int
-xfs_file_mmap(
- struct file *file,
- struct vm_area_struct *vma)
+xfs_file_mmap_prepare(
+ struct vm_area_desc *desc)
{
+ struct file *file = desc->file;
struct inode *inode = file_inode(file);
struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode));
@@ -1855,13 +1974,14 @@ xfs_file_mmap(
* We don't support synchronous mappings for non-DAX files and
* for DAX files if underneath dax_device is not synchronous.
*/
- if (!daxdev_mapping_supported(vma, target->bt_daxdev))
+ if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file),
+ target->bt_daxdev))
return -EOPNOTSUPP;
file_accessed(file);
- vma->vm_ops = &xfs_file_vm_ops;
+ desc->vm_ops = &xfs_file_vm_ops;
if (IS_DAX(inode))
- vm_flags_set(vma, VM_HUGEPAGE);
+ desc->vm_flags |= VM_HUGEPAGE;
return 0;
}
@@ -1876,7 +1996,7 @@ const struct file_operations xfs_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = xfs_file_compat_ioctl,
#endif
- .mmap = xfs_file_mmap,
+ .mmap_prepare = xfs_file_mmap_prepare,
.open = xfs_file_open,
.release = xfs_file_release,
.fsync = xfs_file_fsync,
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index a961aa420c48..044918fbae06 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -304,11 +304,9 @@ xfs_filestream_create_association(
* for us, so all we need to do here is take another active reference to
* the perag for the cached association.
*
- * If we fail to store the association, we need to drop the fstrms
- * counter as well as drop the perag reference we take here for the
- * item. We do not need to return an error for this failure - as long as
- * we return a referenced AG, the allocation can still go ahead just
- * fine.
+ * If we fail to store the association, we do not need to return an
+ * error for this failure - as long as we return a referenced AG, the
+ * allocation can still go ahead just fine.
*/
item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!item)
@@ -316,14 +314,9 @@ xfs_filestream_create_association(
atomic_inc(&pag_group(args->pag)->xg_active_ref);
item->pag = args->pag;
- error = xfs_mru_cache_insert(mp->m_filestream, pino, &item->mru);
- if (error)
- goto out_free_item;
+ xfs_mru_cache_insert(mp->m_filestream, pino, &item->mru);
return 0;
-out_free_item:
- xfs_perag_rele(item->pag);
- kfree(item);
out_put_fstrms:
atomic_dec(&args->pag->pagf_fstrms);
return 0;
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index a4bc1642fe56..af68c7de8ee8 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -876,6 +876,7 @@ xfs_getfsmap_rtdev_rmapbt(
const struct xfs_fsmap *keys,
struct xfs_getfsmap_info *info)
{
+ struct xfs_fsmap key0 = *keys; /* struct copy */
struct xfs_mount *mp = tp->t_mountp;
struct xfs_rtgroup *rtg = NULL;
struct xfs_btree_cur *bt_cur = NULL;
@@ -887,32 +888,46 @@ xfs_getfsmap_rtdev_rmapbt(
int error = 0;
eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + mp->m_sb.sb_rblocks);
- if (keys[0].fmr_physical >= eofs)
+ if (key0.fmr_physical >= eofs)
return 0;
+ /*
+ * On zoned filesystems with an internal rt volume, the volume comes
+ * immediately after the end of the data volume. However, the
+ * xfs_rtblock_t address space is relative to the start of the data
+ * device, which means that the first @rtstart fsblocks do not actually
+ * point anywhere. If a fsmap query comes in with the low key starting
+ * below @rtstart, report it as "owned by filesystem".
+ */
rtstart_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart);
- if (keys[0].fmr_physical < rtstart_daddr) {
+ if (xfs_has_zoned(mp) && key0.fmr_physical < rtstart_daddr) {
struct xfs_fsmap_irec frec = {
.owner = XFS_RMAP_OWN_FS,
.len_daddr = rtstart_daddr,
};
- /* Adjust the low key if we are continuing from where we left off. */
- if (keys[0].fmr_length > 0) {
- info->low_daddr = keys[0].fmr_physical + keys[0].fmr_length;
- return 0;
+ /*
+ * Adjust the start of the query range if we're picking up from
+ * a previous round, and only emit the record if we haven't
+ * already gone past.
+ */
+ key0.fmr_physical += key0.fmr_length;
+ if (key0.fmr_physical < rtstart_daddr) {
+ error = xfs_getfsmap_helper(tp, info, &frec);
+ if (error)
+ return error;
+
+ key0.fmr_physical = rtstart_daddr;
}
- /* Fabricate an rmap entry for space occupied by the data dev */
- error = xfs_getfsmap_helper(tp, info, &frec);
- if (error)
- return error;
+ /* Zero the other fields to avoid further adjustments. */
+ key0.fmr_owner = 0;
+ key0.fmr_offset = 0;
+ key0.fmr_length = 0;
}
- start_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + keys[0].fmr_physical);
- end_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr +
- min(eofs - 1, keys[1].fmr_physical));
-
+ start_rtb = xfs_daddr_to_rtb(mp, key0.fmr_physical);
+ end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical));
info->missing_owner = XFS_FMR_OWN_FREE;
/*
@@ -920,12 +935,12 @@ xfs_getfsmap_rtdev_rmapbt(
* low to the fsmap low key and max out the high key to the end
* of the rtgroup.
*/
- info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
- error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
+ info->low.rm_offset = XFS_BB_TO_FSBT(mp, key0.fmr_offset);
+ error = xfs_fsmap_owner_to_rmap(&info->low, &key0);
if (error)
return error;
- info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, keys[0].fmr_length);
- xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+ info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, key0.fmr_length);
+ xfs_getfsmap_set_irec_flags(&info->low, &key0);
/* Adjust the low key if we are continuing from where we left off. */
if (info->low.rm_blockcount == 0) {
@@ -1255,9 +1270,7 @@ xfs_getfsmap(
* buffer locking abilities to detect cycles in the rmapbt
* without deadlocking.
*/
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- break;
+ tp = xfs_trans_alloc_empty(mp);
info.dev = handlers[i].dev;
info.last = false;
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index f18fec0adf66..566fd663c95b 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -14,8 +14,6 @@
*/
xfs_param_t xfs_params = {
/* MIN DFLT MAX */
- .sgid_inherit = { 0, 0, 1 },
- .symlink_mode = { 0, 0, 1 },
.panic_mask = { 0, 0, XFS_PTAG_MASK},
.error_level = { 0, 3, 11 },
.syncd_timer = { 1*100, 30*100, 7200*100},
@@ -23,8 +21,6 @@ xfs_param_t xfs_params = {
.inherit_sync = { 0, 1, 1 },
.inherit_nodump = { 0, 1, 1 },
.inherit_noatim = { 0, 1, 1 },
- .xfs_buf_timer = { 100/2, 1*100, 30*100 },
- .xfs_buf_age = { 1*100, 15*100, 7200*100},
.inherit_nosym = { 0, 0, 1 },
.rotorstep = { 1, 1, 255 },
.inherit_nodfrg = { 0, 1, 1 },
diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c
index f19fce557354..5a3e3bf4e7cc 100644
--- a/fs/xfs/xfs_handle.c
+++ b/fs/xfs/xfs_handle.c
@@ -233,14 +233,11 @@ xfs_open_by_handle(
xfs_fsop_handlereq_t *hreq)
{
const struct cred *cred = current_cred();
- int error;
- int fd;
int permflag;
- struct file *filp;
struct inode *inode;
struct dentry *dentry;
fmode_t fmode;
- struct path path;
+ struct path path __free(path_put) = {};
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -249,12 +246,11 @@ xfs_open_by_handle(
if (IS_ERR(dentry))
return PTR_ERR(dentry);
inode = d_inode(dentry);
+ path.dentry = dentry;
/* Restrict xfs_open_by_handle to directories & regular files. */
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
- error = -EPERM;
- goto out_dput;
- }
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
+ return -EPERM;
#if BITS_PER_LONG != 32
hreq->oflags |= O_LARGEFILE;
@@ -263,48 +259,30 @@ xfs_open_by_handle(
permflag = hreq->oflags;
fmode = OPEN_FMODE(permflag);
if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
- (fmode & FMODE_WRITE) && IS_APPEND(inode)) {
- error = -EPERM;
- goto out_dput;
- }
+ (fmode & FMODE_WRITE) && IS_APPEND(inode))
+ return -EPERM;
- if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
- error = -EPERM;
- goto out_dput;
- }
+ if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode))
+ return -EPERM;
/* Can't write directories. */
- if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) {
- error = -EISDIR;
- goto out_dput;
- }
+ if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE))
+ return -EISDIR;
- fd = get_unused_fd_flags(0);
- if (fd < 0) {
- error = fd;
- goto out_dput;
- }
+ path.mnt = mntget(parfilp->f_path.mnt);
- path.mnt = parfilp->f_path.mnt;
- path.dentry = dentry;
- filp = dentry_open(&path, hreq->oflags, cred);
- dput(dentry);
- if (IS_ERR(filp)) {
- put_unused_fd(fd);
- return PTR_ERR(filp);
- }
+ FD_PREPARE(fdf, 0, dentry_open(&path, hreq->oflags, cred));
+ if (fdf.err)
+ return fdf.err;
if (S_ISREG(inode->i_mode)) {
+ struct file *filp = fd_prepare_file(fdf);
+
filp->f_flags |= O_NOATIME;
filp->f_mode |= FMODE_NOCMTIME;
}
- fd_install(fd, filp);
- return fd;
-
- out_dput:
- dput(dentry);
- return error;
+ return fd_publish(fdf);
}
int
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index 7c541fb373d5..3c1557fb1cf0 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -285,7 +285,7 @@ xfs_inode_mark_sick(
* is not the case here.
*/
spin_lock(&VFS_I(ip)->i_lock);
- VFS_I(ip)->i_state &= ~I_DONTCACHE;
+ inode_state_clear(VFS_I(ip), I_DONTCACHE);
spin_unlock(&VFS_I(ip)->i_lock);
}
@@ -309,7 +309,7 @@ xfs_inode_mark_corrupt(
* is not the case here.
*/
spin_lock(&VFS_I(ip)->i_lock);
- VFS_I(ip)->i_state &= ~I_DONTCACHE;
+ inode_state_clear(VFS_I(ip), I_DONTCACHE);
spin_unlock(&VFS_I(ip)->i_lock);
}
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 726e29b837e6..23a920437fe4 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -334,7 +334,7 @@ xfs_reinit_inode(
dev_t dev = inode->i_rdev;
kuid_t uid = inode->i_uid;
kgid_t gid = inode->i_gid;
- unsigned long state = inode->i_state;
+ unsigned long state = inode_state_read_once(inode);
error = inode_init_always(mp->m_super, inode);
@@ -345,7 +345,7 @@ xfs_reinit_inode(
inode->i_rdev = dev;
inode->i_uid = uid;
inode->i_gid = gid;
- inode->i_state = state;
+ inode_state_assign_raw(inode, state);
mapping_set_folio_min_order(inode->i_mapping,
M_IGEO(mp)->min_folio_order);
return error;
@@ -358,7 +358,7 @@ xfs_reinit_inode(
static int
xfs_iget_recycle(
struct xfs_perag *pag,
- struct xfs_inode *ip) __releases(&ip->i_flags_lock)
+ struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
struct inode *inode = VFS_I(ip);
@@ -366,20 +366,6 @@ xfs_iget_recycle(
trace_xfs_iget_recycle(ip);
- if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
- return -EAGAIN;
-
- /*
- * We need to make it look like the inode is being reclaimed to prevent
- * the actual reclaim workers from stomping over us while we recycle
- * the inode. We can't clear the radix tree tag yet as it requires
- * pag_ici_lock to be held exclusive.
- */
- ip->i_flags |= XFS_IRECLAIM;
-
- spin_unlock(&ip->i_flags_lock);
- rcu_read_unlock();
-
ASSERT(!rwsem_is_locked(&inode->i_rwsem));
error = xfs_reinit_inode(mp, inode);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -411,7 +397,7 @@ xfs_iget_recycle(
ip->i_flags |= XFS_INEW;
xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
XFS_ICI_RECLAIM_TAG);
- inode->i_state = I_NEW;
+ inode_state_assign_raw(inode, I_NEW);
spin_unlock(&ip->i_flags_lock);
spin_unlock(&pag->pag_ici_lock);
@@ -576,10 +562,19 @@ xfs_iget_cache_hit(
/* The inode fits the selection criteria; process it. */
if (ip->i_flags & XFS_IRECLAIMABLE) {
- /* Drops i_flags_lock and RCU read lock. */
- error = xfs_iget_recycle(pag, ip);
- if (error == -EAGAIN)
+ /*
+ * We need to make it look like the inode is being reclaimed to
+ * prevent the actual reclaim workers from stomping over us
+ * while we recycle the inode. We can't clear the radix tree
+ * tag yet as it requires pag_ici_lock to be held exclusive.
+ */
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
goto out_skip;
+ ip->i_flags |= XFS_IRECLAIM;
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+
+ error = xfs_iget_recycle(pag, ip);
if (error)
return error;
} else {
@@ -646,8 +641,7 @@ xfs_iget_cache_miss(
goto out_destroy;
/*
- * For version 5 superblocks, if we are initialising a new inode and we
- * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can
+ * For version 5 superblocks, if we are initialising a new inode, we
* simply build the new inode core with a random generation number.
*
* For version 4 (and older) superblocks, log recovery is dependent on
@@ -655,8 +649,7 @@ xfs_iget_cache_miss(
* value and hence we must also read the inode off disk even when
* initializing new inodes.
*/
- if (xfs_has_v3inodes(mp) &&
- (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) {
+ if (xfs_has_v3inodes(mp) && (flags & XFS_IGET_CREATE)) {
VFS_I(ip)->i_generation = get_random_u32();
} else {
struct xfs_buf *bp;
@@ -893,10 +886,7 @@ xfs_metafile_iget(
struct xfs_trans *tp;
int error;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
-
+ tp = xfs_trans_alloc_empty(mp);
error = xfs_trans_metafile_iget(tp, ino, metafile_type, ipp);
xfs_trans_cancel(tp);
return error;
@@ -979,7 +969,15 @@ xfs_reclaim_inode(
*/
if (xlog_is_shutdown(ip->i_mount->m_log)) {
xfs_iunpin_wait(ip);
+ /*
+ * Avoid an ABBA deadlock on the inode cluster buffer vs
+ * concurrent xfs_ifree_cluster() trying to mark the inode
+ * stale. We don't need the inode locked to run the flush abort
+ * code, but the flush abort needs to lock the cluster buffer.
+ */
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_iflush_shutdown_abort(ip);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
goto reclaim;
}
if (xfs_ipincount(ip))
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 4345db501714..f83ec2bd0583 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -158,7 +158,7 @@ xlog_recover_icreate_commit_pass2(
int nbufs;
int i;
- icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
+ icl = (struct xfs_icreate_log *)item->ri_buf[0].iov_base;
if (icl->icl_type != XFS_LI_ICREATE) {
xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
return -EINVAL;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ee3e0f284287..f1f88e48fe22 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -877,6 +877,35 @@ xfs_create_tmpfile(
return error;
}
+static inline int
+xfs_projid_differ(
+ struct xfs_inode *tdp,
+ struct xfs_inode *sip)
+{
+ /*
+ * If we are using project inheritance, we only allow hard link creation
+ * and renames in our tree when the project IDs are the same; else
+ * the tree quota mechanism could be circumvented.
+ */
+ if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
+ tdp->i_projid != sip->i_projid)) {
+ /*
+ * Project quota setup skips special files which can
+ * leave inodes in a PROJINHERIT directory without a
+ * project ID set. We need to allow links to be made
+ * to these "project-less" inodes because userspace
+ * expects them to succeed after project ID setup,
+ * but everything else should be rejected.
+ */
+ if (!special_file(VFS_I(sip)->i_mode) ||
+ sip->i_projid != 0) {
+ return -EXDEV;
+ }
+ }
+
+ return 0;
+}
+
int
xfs_link(
struct xfs_inode *tdp,
@@ -930,27 +959,9 @@ xfs_link(
goto error_return;
}
- /*
- * If we are using project inheritance, we only allow hard link
- * creation in our tree when the project IDs are the same; else
- * the tree quota mechanism could be circumvented.
- */
- if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
- tdp->i_projid != sip->i_projid)) {
- /*
- * Project quota setup skips special files which can
- * leave inodes in a PROJINHERIT directory without a
- * project ID set. We need to allow links to be made
- * to these "project-less" inodes because userspace
- * expects them to succeed after project ID setup,
- * but everything else should be rejected.
- */
- if (!special_file(VFS_I(sip)->i_mode) ||
- sip->i_projid != 0) {
- error = -EXDEV;
- goto error_return;
- }
- }
+ error = xfs_projid_differ(tdp, sip);
+ if (error)
+ goto error_return;
error = xfs_dir_add_child(tp, resblks, &du);
if (error)
@@ -1035,7 +1046,7 @@ xfs_itruncate_extents_flags(
int error = 0;
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
- if (atomic_read(&VFS_I(ip)->i_count))
+ if (icount_read(VFS_I(ip)))
xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
ASSERT(new_size <= XFS_ISIZE(ip));
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
@@ -1569,7 +1580,7 @@ xfs_iunlink_reload_next(
next_ip->i_prev_unlinked = prev_agino;
trace_xfs_iunlink_reload_next(next_ip);
rele:
- ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE));
+ ASSERT(!(inode_state_read_once(VFS_I(next_ip)) & I_DONTCACHE));
if (xfs_is_quotacheck_running(mp) && next_ip)
xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED);
xfs_irele(next_ip);
@@ -1635,7 +1646,7 @@ retry:
iip = ip->i_itemp;
if (__xfs_iflags_test(ip, XFS_IFLUSHING)) {
ASSERT(!list_empty(&iip->ili_item.li_bio_list));
- ASSERT(iip->ili_last_fields);
+ ASSERT(iip->ili_last_fields || xlog_is_shutdown(mp->m_log));
goto out_iunlock;
}
@@ -1656,7 +1667,6 @@ retry:
spin_lock(&iip->ili_lock);
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
- iip->ili_fsync_fields = 0;
spin_unlock(&iip->ili_lock);
ASSERT(iip->ili_last_fields);
@@ -1821,12 +1831,20 @@ static void
xfs_iunpin(
struct xfs_inode *ip)
{
- xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ xfs_csn_t seq = 0;
trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
+ xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
+
+ spin_lock(&iip->ili_lock);
+ seq = iip->ili_commit_seq;
+ spin_unlock(&iip->ili_lock);
+ if (!seq)
+ return;
/* Give the log a push to start the unpinning I/O */
- xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL);
+ xfs_log_force_seq(ip->i_mount, seq, 0, NULL);
}
@@ -2093,7 +2111,7 @@ xfs_rename_alloc_whiteout(
*/
xfs_setup_iops(tmpfile);
xfs_finish_inode_setup(tmpfile);
- VFS_I(tmpfile)->i_state |= I_LINKABLE;
+ inode_state_set_raw(VFS_I(tmpfile), I_LINKABLE);
*wip = tmpfile;
return 0;
@@ -2227,16 +2245,9 @@ retry:
if (du_wip.ip)
xfs_trans_ijoin(tp, du_wip.ip, 0);
- /*
- * If we are using project inheritance, we only allow renames
- * into our tree when the project IDs are the same; else the
- * tree quota mechanism would be circumvented.
- */
- if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
- target_dp->i_projid != src_ip->i_projid)) {
- error = -EXDEV;
+ error = xfs_projid_differ(target_dp, src_ip);
+ if (error)
goto out_trans_cancel;
- }
/* RENAME_EXCHANGE is unique from here on. */
if (flags & RENAME_EXCHANGE) {
@@ -2319,7 +2330,7 @@ retry:
* flag from the inode so it doesn't accidentally get misused in
* future.
*/
- VFS_I(du_wip.ip)->i_state &= ~I_LINKABLE;
+ inode_state_clear_raw(VFS_I(du_wip.ip), I_LINKABLE);
}
out_commit:
@@ -2377,8 +2388,8 @@ xfs_iflush(
* error handling as the caller will shutdown and fail the buffer.
*/
error = -EFSCORRUPTED;
- if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
- mp, XFS_ERRTAG_IFLUSH_1)) {
+ if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC) ||
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_1)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT,
__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
@@ -2394,29 +2405,27 @@ xfs_iflush(
goto flush_out;
}
} else if (S_ISREG(VFS_I(ip)->i_mode)) {
- if (XFS_TEST_ERROR(
- ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
- ip->i_df.if_format != XFS_DINODE_FMT_BTREE,
- mp, XFS_ERRTAG_IFLUSH_3)) {
+ if ((ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE) ||
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_3)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: Bad regular inode %llu, ptr "PTR_FMT,
__func__, ip->i_ino, ip);
goto flush_out;
}
} else if (S_ISDIR(VFS_I(ip)->i_mode)) {
- if (XFS_TEST_ERROR(
- ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
- ip->i_df.if_format != XFS_DINODE_FMT_BTREE &&
- ip->i_df.if_format != XFS_DINODE_FMT_LOCAL,
- mp, XFS_ERRTAG_IFLUSH_4)) {
+ if ((ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS &&
+ ip->i_df.if_format != XFS_DINODE_FMT_BTREE &&
+ ip->i_df.if_format != XFS_DINODE_FMT_LOCAL) ||
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_4)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: Bad directory inode %llu, ptr "PTR_FMT,
__func__, ip->i_ino, ip);
goto flush_out;
}
}
- if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) >
- ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
+ if (ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) >
+ ip->i_nblocks || XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_5)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: detected corrupt incore inode %llu, "
"total extents = %llu nblocks = %lld, ptr "PTR_FMT,
@@ -2425,8 +2434,8 @@ xfs_iflush(
ip->i_nblocks, ip);
goto flush_out;
}
- if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize,
- mp, XFS_ERRTAG_IFLUSH_6)) {
+ if (ip->i_forkoff > mp->m_sb.sb_inodesize ||
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_6)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
"%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT,
__func__, ip->i_ino, ip->i_forkoff, ip);
@@ -2502,7 +2511,6 @@ flush_out:
spin_lock(&iip->ili_lock);
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
- iip->ili_fsync_fields = 0;
set_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags);
spin_unlock(&iip->ili_lock);
@@ -2661,12 +2669,15 @@ int
xfs_log_force_inode(
struct xfs_inode *ip)
{
+ struct xfs_inode_log_item *iip = ip->i_itemp;
xfs_csn_t seq = 0;
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip))
- seq = ip->i_itemp->ili_commit_seq;
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ if (!iip)
+ return 0;
+
+ spin_lock(&iip->ili_lock);
+ seq = iip->ili_commit_seq;
+ spin_unlock(&iip->ili_lock);
if (!seq)
return 0;
@@ -2932,12 +2943,9 @@ xfs_inode_reload_unlinked(
struct xfs_inode *ip)
{
struct xfs_trans *tp;
- int error;
-
- error = xfs_trans_alloc_empty(ip->i_mount, &tp);
- if (error)
- return error;
+ int error = 0;
+ tp = xfs_trans_alloc_empty(ip->i_mount);
xfs_ilock(ip, XFS_ILOCK_SHARED);
if (xfs_inode_unlinked_incomplete(ip))
error = xfs_inode_reload_unlinked_bucket(tp, ip);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index eae0159983ca..bd6d33557194 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -356,19 +356,20 @@ static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip)
(XFS_IS_REALTIME_INODE(ip) ? \
(ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp)
-static inline bool
-xfs_inode_can_atomicwrite(
- struct xfs_inode *ip)
+static inline bool xfs_inode_can_hw_atomic_write(const struct xfs_inode *ip)
{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_buftarg *target = xfs_inode_buftarg(ip);
-
- if (mp->m_sb.sb_blocksize < target->bt_bdev_awu_min)
+ if (IS_DAX(VFS_IC(ip)))
return false;
- if (mp->m_sb.sb_blocksize > target->bt_bdev_awu_max)
+
+ return xfs_inode_buftarg(ip)->bt_awu_max > 0;
+}
+
+static inline bool xfs_inode_can_sw_atomic_write(const struct xfs_inode *ip)
+{
+ if (IS_DAX(VFS_IC(ip)))
return false;
- return true;
+ return xfs_can_sw_atomic_write(ip->i_mount);
}
/*
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 40fc1bf900af..2eb0c6011a2e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -113,9 +113,9 @@ xfs_inode_item_precommit(
* to log the timestamps, or will clear already cleared fields in the
* worst case.
*/
- if (inode->i_state & I_DIRTY_TIME) {
+ if (inode_state_read_once(inode) & I_DIRTY_TIME) {
spin_lock(&inode->i_lock);
- inode->i_state &= ~I_DIRTY_TIME;
+ inode_state_clear(inode, I_DIRTY_TIME);
spin_unlock(&inode->i_lock);
}
@@ -131,46 +131,28 @@ xfs_inode_item_precommit(
}
/*
- * Inode verifiers do not check that the extent size hint is an integer
- * multiple of the rt extent size on a directory with both rtinherit
- * and extszinherit flags set. If we're logging a directory that is
- * misconfigured in this way, clear the hint.
+ * Inode verifiers do not check that the extent size hints are an
+ * integer multiple of the rt extent size on a directory with
+ * rtinherit flags set. If we're logging a directory that is
+ * misconfigured in this way, clear the bad hints.
*/
- if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
- (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
- xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) {
- ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
- XFS_DIFLAG_EXTSZINHERIT);
- ip->i_extsize = 0;
- flags |= XFS_ILOG_CORE;
+ if (ip->i_diflags & XFS_DIFLAG_RTINHERIT) {
+ if ((ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
+ xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) {
+ ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
+ XFS_DIFLAG_EXTSZINHERIT);
+ ip->i_extsize = 0;
+ flags |= XFS_ILOG_CORE;
+ }
+ if ((ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+ xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) {
+ ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
+ ip->i_cowextsize = 0;
+ flags |= XFS_ILOG_CORE;
+ }
}
- /*
- * Record the specific change for fdatasync optimisation. This allows
- * fdatasync to skip log forces for inodes that are only timestamp
- * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it
- * to XFS_ILOG_CORE so that the actual on-disk dirty tracking
- * (ili_fields) correctly tracks that the version has changed.
- */
spin_lock(&iip->ili_lock);
- iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION);
- if (flags & XFS_ILOG_IVERSION)
- flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE);
-
- /*
- * Inode verifiers do not check that the CoW extent size hint is an
- * integer multiple of the rt extent size on a directory with both
- * rtinherit and cowextsize flags set. If we're logging a directory
- * that is misconfigured in this way, clear the hint.
- */
- if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
- (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
- xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) {
- ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
- ip->i_cowextsize = 0;
- flags |= XFS_ILOG_CORE;
- }
-
if (!iip->ili_item.li_buf) {
struct xfs_buf *bp;
int error;
@@ -205,6 +187,20 @@ xfs_inode_item_precommit(
}
/*
+ * Store the dirty flags back into the inode item as this state is used
+ * later on in xfs_inode_item_committing() to determine whether the
+ * transaction is relevant to fsync state or not.
+ */
+ iip->ili_dirty_flags = flags;
+
+ /*
+ * Convert the flags to the on-disk fields that have been modified in the
+ * transaction so that ili_fields tracks the changes correctly.
+ */
+ if (flags & XFS_ILOG_IVERSION)
+ flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE);
+
+ /*
* Always OR in the bits from the ili_last_fields field. This is to
* coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines
* in the eventual clearing of the ili_fields bits. See the big comment
@@ -214,12 +210,6 @@ xfs_inode_item_precommit(
spin_unlock(&iip->ili_lock);
xfs_inode_item_precommit_check(ip);
-
- /*
- * We are done with the log item transaction dirty state, so clear it so
- * that it doesn't pollute future transactions.
- */
- iip->ili_dirty_flags = 0;
return 0;
}
@@ -729,13 +719,24 @@ xfs_inode_item_unpin(
struct xfs_log_item *lip,
int remove)
{
- struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
trace_xfs_inode_unpin(ip, _RET_IP_);
ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE));
ASSERT(atomic_read(&ip->i_pincount) > 0);
- if (atomic_dec_and_test(&ip->i_pincount))
+
+ /*
+ * If this is the last unpin, then the inode no longer needs a journal
+ * flush to persist it. Hence we can clear the commit sequence numbers
+ * as an fsync/fdatasync operation on the inode at this point is a no-op.
+ */
+ if (atomic_dec_and_lock(&ip->i_pincount, &iip->ili_lock)) {
+ iip->ili_commit_seq = 0;
+ iip->ili_datasync_seq = 0;
+ spin_unlock(&iip->ili_lock);
wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
+ }
}
STATIC uint
@@ -758,11 +759,14 @@ xfs_inode_item_push(
* completed and items removed from the AIL before the next push
* attempt.
*/
+ trace_xfs_inode_push_stale(ip, _RET_IP_);
return XFS_ITEM_PINNED;
}
- if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp))
+ if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp)) {
+ trace_xfs_inode_push_pinned(ip, _RET_IP_);
return XFS_ITEM_PINNED;
+ }
if (xfs_iflags_test(ip, XFS_IFLUSHING))
return XFS_ITEM_FLUSHING;
@@ -855,12 +859,45 @@ xfs_inode_item_committed(
return lsn;
}
+/*
+ * The modification is now complete, so before we unlock the inode we need to
+ * update the commit sequence numbers for data integrity journal flushes. We
+ * always record the commit sequence number (ili_commit_seq) so that anything
+ * that needs a full journal sync will capture all of this modification.
+ *
+ * We then check if the changes will impact a datasync (O_DSYNC)
+ * journal flush. If the changes will require a datasync flush,
+ * then we also record the commit sequence number in
+ * ili_datasync_seq.
+ *
+ * These commit sequence numbers will get cleared atomically with the inode being
+ * unpinned (i.e. pin count goes to zero), and so they will only be set when the
+ * inode is dirty in the journal. This removes the need for checking if the
+ * inode is pinned to determine if a journal flush is necessary, and hence
+ * removes the need for holding the ILOCK_SHARED in xfs_file_fsync() to
+ * serialise pin counts against commit sequence number updates.
+ *
+ */
STATIC void
xfs_inode_item_committing(
struct xfs_log_item *lip,
xfs_csn_t seq)
{
- INODE_ITEM(lip)->ili_commit_seq = seq;
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+
+ spin_lock(&iip->ili_lock);
+ iip->ili_commit_seq = seq;
+ if (iip->ili_dirty_flags & ~(XFS_ILOG_IVERSION | XFS_ILOG_TIMESTAMP))
+ iip->ili_datasync_seq = seq;
+ spin_unlock(&iip->ili_lock);
+
+ /*
+ * Clear the per-transaction dirty flags now that we have finished
+ * recording the transaction's inode modifications in the CIL and are
+ * about to release and (maybe) unlock the inode.
+ */
+ iip->ili_dirty_flags = 0;
+
return xfs_inode_item_release(lip);
}
@@ -1052,7 +1089,6 @@ xfs_iflush_abort_clean(
{
iip->ili_last_fields = 0;
iip->ili_fields = 0;
- iip->ili_fsync_fields = 0;
iip->ili_flush_lsn = 0;
iip->ili_item.li_buf = NULL;
list_del_init(&iip->ili_item.li_bio_list);
@@ -1089,13 +1125,7 @@ xfs_iflush_abort(
* state. Whilst the inode is in the AIL, it should have a valid buffer
* pointer for push operations to access - it is only safe to remove the
* inode from the buffer once it has been removed from the AIL.
- *
- * We also clear the failed bit before removing the item from the AIL
- * as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer
- * references the inode item owns and needs to hold until we've fully
- * aborted the inode log item and detached it from the buffer.
*/
- clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags);
xfs_trans_ail_delete(&iip->ili_item, 0);
/*
@@ -1185,12 +1215,12 @@ xfs_iflush_shutdown_abort(
*/
int
xfs_inode_item_format_convert(
- struct xfs_log_iovec *buf,
+ struct kvec *buf,
struct xfs_inode_log_format *in_f)
{
- struct xfs_inode_log_format_32 *in_f32 = buf->i_addr;
+ struct xfs_inode_log_format_32 *in_f32 = buf->iov_base;
- if (buf->i_len != sizeof(*in_f32)) {
+ if (buf->iov_len != sizeof(*in_f32)) {
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 377e06007804..2ddcca41714f 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -32,9 +32,17 @@ struct xfs_inode_log_item {
spinlock_t ili_lock; /* flush state lock */
unsigned int ili_last_fields; /* fields when flushed */
unsigned int ili_fields; /* fields to be logged */
- unsigned int ili_fsync_fields; /* logged since last fsync */
xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
+
+ /*
+ * We record the sequence number for every inode modification, as
+ * well as those that only require fdatasync operations for data
+ * integrity. This allows optimisation of the O_DSYNC/fdatasync path
+ * without needing to track what modifications the journal is currently
+ * carrying for the inode. These are protected by the above ili_lock.
+ */
xfs_csn_t ili_commit_seq; /* last transaction commit */
+ xfs_csn_t ili_datasync_seq; /* for datasync optimisation */
};
static inline int xfs_inode_clean(struct xfs_inode *ip)
@@ -46,8 +54,8 @@ extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
extern void xfs_inode_item_destroy(struct xfs_inode *);
extern void xfs_iflush_abort(struct xfs_inode *);
extern void xfs_iflush_shutdown_abort(struct xfs_inode *);
-extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
- struct xfs_inode_log_format *);
+int xfs_inode_item_format_convert(struct kvec *buf,
+ struct xfs_inode_log_format *in_f);
extern struct kmem_cache *xfs_ili_cache;
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 7205fd14f6b3..9d1999d41be1 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -30,13 +30,13 @@ xlog_recover_inode_ra_pass2(
struct xlog *log,
struct xlog_recover_item *item)
{
- if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
- struct xfs_inode_log_format *ilfp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].iov_len == sizeof(struct xfs_inode_log_format)) {
+ struct xfs_inode_log_format *ilfp = item->ri_buf[0].iov_base;
xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len,
&xfs_inode_buf_ra_ops);
} else {
- struct xfs_inode_log_format_32 *ilfp = item->ri_buf[0].i_addr;
+ struct xfs_inode_log_format_32 *ilfp = item->ri_buf[0].iov_base;
xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len,
&xfs_inode_buf_ra_ops);
@@ -326,8 +326,8 @@ xlog_recover_inode_commit_pass2(
int need_free = 0;
xfs_failaddr_t fa;
- if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
- in_f = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].iov_len == sizeof(struct xfs_inode_log_format)) {
+ in_f = item->ri_buf[0].iov_base;
} else {
in_f = kmalloc(sizeof(struct xfs_inode_log_format),
GFP_KERNEL | __GFP_NOFAIL);
@@ -366,7 +366,7 @@ xlog_recover_inode_commit_pass2(
error = -EFSCORRUPTED;
goto out_release;
}
- ldip = item->ri_buf[1].i_addr;
+ ldip = item->ri_buf[1].iov_base;
if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) {
xfs_alert(mp,
"%s: Bad inode log record, rec ptr "PTR_FMT", ino %lld",
@@ -472,12 +472,12 @@ xlog_recover_inode_commit_pass2(
goto out_release;
}
isize = xfs_log_dinode_size(mp);
- if (unlikely(item->ri_buf[1].i_len > isize)) {
+ if (unlikely(item->ri_buf[1].iov_len > isize)) {
XFS_CORRUPTION_ERROR("Bad log dinode size", XFS_ERRLEVEL_LOW,
mp, ldip, sizeof(*ldip));
xfs_alert(mp,
- "Bad inode 0x%llx log dinode size 0x%x",
- in_f->ilf_ino, item->ri_buf[1].i_len);
+ "Bad inode 0x%llx log dinode size 0x%zx",
+ in_f->ilf_ino, item->ri_buf[1].iov_len);
error = -EFSCORRUPTED;
goto out_release;
}
@@ -500,8 +500,8 @@ xlog_recover_inode_commit_pass2(
if (in_f->ilf_size == 2)
goto out_owner_change;
- len = item->ri_buf[2].i_len;
- src = item->ri_buf[2].i_addr;
+ len = item->ri_buf[2].iov_len;
+ src = item->ri_buf[2].iov_base;
ASSERT(in_f->ilf_size <= 4);
ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
ASSERT(!(fields & XFS_ILOG_DFORK) ||
@@ -538,8 +538,8 @@ xlog_recover_inode_commit_pass2(
} else {
attr_index = 2;
}
- len = item->ri_buf[attr_index].i_len;
- src = item->ri_buf[attr_index].i_addr;
+ len = item->ri_buf[attr_index].iov_len;
+ src = item->ri_buf[attr_index].iov_base;
ASSERT(len == xlog_calc_iovec_len(in_f->ilf_asize));
switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index d250f7f74e3b..59eaad774371 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -219,7 +219,7 @@ xfs_bulk_ireq_setup(
else if (XFS_INO_TO_AGNO(mp, breq->startino) < hdr->agno)
return -EINVAL;
- breq->flags |= XFS_IBULK_SAME_AG;
+ breq->iwalk_flags |= XFS_IWALK_SAME_AG;
/* Asking for an inode past the end of the AG? We're done! */
if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno)
@@ -444,7 +444,7 @@ static void
xfs_fill_fsxattr(
struct xfs_inode *ip,
int whichfork,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
@@ -496,7 +496,7 @@ xfs_ioc_fsgetxattra(
xfs_inode_t *ip,
void __user *arg)
{
- struct fileattr fa;
+ struct file_kattr fa;
xfs_ilock(ip, XFS_ILOCK_SHARED);
xfs_fill_fsxattr(ip, XFS_ATTR_FORK, &fa);
@@ -508,13 +508,10 @@ xfs_ioc_fsgetxattra(
int
xfs_fileattr_get(
struct dentry *dentry,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct xfs_inode *ip = XFS_I(d_inode(dentry));
- if (d_is_special(dentry))
- return -ENOTTY;
-
xfs_ilock(ip, XFS_ILOCK_SHARED);
xfs_fill_fsxattr(ip, XFS_DATA_FORK, fa);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -526,7 +523,7 @@ static int
xfs_ioctl_setattr_xflags(
struct xfs_trans *tp,
struct xfs_inode *ip,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
bool rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME);
@@ -582,7 +579,7 @@ xfs_ioctl_setattr_xflags(
static void
xfs_ioctl_setattr_prepare_dax(
struct xfs_inode *ip,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
struct inode *inode = VFS_I(ip);
@@ -642,7 +639,7 @@ out_error:
static int
xfs_ioctl_setattr_check_extsize(
struct xfs_inode *ip,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
xfs_failaddr_t failaddr;
@@ -684,7 +681,7 @@ xfs_ioctl_setattr_check_extsize(
static int
xfs_ioctl_setattr_check_cowextsize(
struct xfs_inode *ip,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
xfs_failaddr_t failaddr;
@@ -709,7 +706,7 @@ xfs_ioctl_setattr_check_cowextsize(
static int
xfs_ioctl_setattr_check_projid(
struct xfs_inode *ip,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
if (!fa->fsx_valid)
return 0;
@@ -725,7 +722,7 @@ int
xfs_fileattr_set(
struct mnt_idmap *idmap,
struct dentry *dentry,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct xfs_inode *ip = XFS_I(d_inode(dentry));
struct xfs_mount *mp = ip->i_mount;
@@ -736,9 +733,6 @@ xfs_fileattr_set(
trace_xfs_ioctl_setattr(ip);
- if (d_is_special(dentry))
- return -ENOTTY;
-
if (!fa->fsx_valid) {
if (fa->flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL |
FS_NOATIME_FL | FS_NODUMP_FL |
@@ -990,9 +984,8 @@ xfs_ioc_getlabel(
BUILD_BUG_ON(sizeof(sbp->sb_fname) > FSLABEL_MAX);
/* 1 larger than sb_fname, so this ensures a trailing NUL char */
- memset(label, 0, sizeof(label));
spin_lock(&mp->m_sb_lock);
- strncpy(label, sbp->sb_fname, XFSLABEL_MAX);
+ memtostr_pad(label, sbp->sb_fname);
spin_unlock(&mp->m_sb_lock);
if (copy_to_user(user_label, label, sizeof(label)))
@@ -1210,21 +1203,21 @@ xfs_file_ioctl(
current->comm);
return -ENOTTY;
case XFS_IOC_DIOINFO: {
- struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ struct kstat st;
struct dioattr da;
- da.d_mem = target->bt_logical_sectorsize;
+ error = vfs_getattr(&filp->f_path, &st, STATX_DIOALIGN, 0);
+ if (error)
+ return error;
/*
- * See xfs_report_dioalign() for an explanation about why this
- * reports a value larger than the sector size for COW inodes.
+ * Some userspace directly feeds the return value to
+ * posix_memalign, which fails for values that are smaller than
+ * the pointer size. Round up the value to not break userspace.
*/
- if (xfs_is_cow_inode(ip))
- da.d_miniosz = xfs_inode_alloc_unitsize(ip);
- else
- da.d_miniosz = target->bt_logical_sectorsize;
+ da.d_mem = roundup(st.dio_mem_align, sizeof(void *));
+ da.d_miniosz = st.dio_offset_align;
da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
-
if (copy_to_user(arg, &da, sizeof(da)))
return -EFAULT;
return 0;
@@ -1415,10 +1408,8 @@ xfs_file_ioctl(
trace_xfs_ioc_free_eofblocks(mp, &icw, _RET_IP_);
- sb_start_write(mp->m_super);
- error = xfs_blockgc_free_space(mp, &icw);
- sb_end_write(mp->m_super);
- return error;
+ guard(super_write)(mp->m_super);
+ return xfs_blockgc_free_space(mp, &icw);
}
case XFS_IOC_EXCHANGE_RANGE:
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 12124946f347..f5ed5cf9d3df 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -17,13 +17,13 @@ xfs_ioc_swapext(
extern int
xfs_fileattr_get(
struct dentry *dentry,
- struct fileattr *fa);
+ struct file_kattr *fa);
extern int
xfs_fileattr_set(
struct mnt_idmap *idmap,
struct dentry *dentry,
- struct fileattr *fa);
+ struct file_kattr *fa);
extern long
xfs_file_ioctl(
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index cb23c8871f81..04f39ea15898 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -79,6 +79,9 @@ xfs_iomap_valid(
{
struct xfs_inode *ip = XFS_I(inode);
+ if (iomap->type == IOMAP_HOLE)
+ return true;
+
if (iomap->validity_cookie !=
xfs_iomap_inode_sequence(ip, iomap->flags)) {
trace_xfs_iomap_invalid(ip, iomap);
@@ -89,7 +92,7 @@ xfs_iomap_valid(
return true;
}
-static const struct iomap_folio_ops xfs_iomap_folio_ops = {
+const struct iomap_write_ops xfs_iomap_write_ops = {
.iomap_valid = xfs_iomap_valid,
};
@@ -146,12 +149,20 @@ xfs_bmbt_to_iomap(
iomap->bdev = target->bt_bdev;
iomap->flags = iomap_flags;
- if (xfs_ipincount(ip) &&
- (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
- iomap->flags |= IOMAP_F_DIRTY;
+ /*
+ * If the inode is dirty for datasync purposes, let iomap know so it
+ * doesn't elide the IO completion journal flushes on O_DSYNC IO.
+ */
+ if (ip->i_itemp) {
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+
+ spin_lock(&iip->ili_lock);
+ if (iip->ili_datasync_seq)
+ iomap->flags |= IOMAP_F_DIRTY;
+ spin_unlock(&iip->ili_lock);
+ }
iomap->validity_cookie = sequence_cookie;
- iomap->folio_ops = &xfs_iomap_folio_ops;
return 0;
}
@@ -798,6 +809,38 @@ imap_spans_range(
return true;
}
+static bool
+xfs_bmap_hw_atomic_write_possible(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t end_fsb)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fsize_t len = XFS_FSB_TO_B(mp, end_fsb - offset_fsb);
+
+ /*
+ * Atomic writes are required to be naturally aligned for disk blocks,
+ * which ensures that we adhere to the block layer rules that we won't
+ * straddle any boundary or violate the write alignment requirements.
+ */
+ if (!IS_ALIGNED(imap->br_startblock, imap->br_blockcount))
+ return false;
+
+ /*
+ * Spanning multiple extents would mean that multiple BIOs would be
+ * issued, and so would lose atomicity required for REQ_ATOMIC-based
+ * atomics.
+ */
+ if (!imap_spans_range(imap, offset_fsb, end_fsb))
+ return false;
+
+ /*
+ * The ->iomap_begin caller should ensure this, but check anyway.
+ */
+ return len <= xfs_inode_buftarg(ip)->bt_awu_max;
+}
+
static int
xfs_direct_write_iomap_begin(
struct inode *inode,
@@ -812,9 +855,11 @@ xfs_direct_write_iomap_begin(
struct xfs_bmbt_irec imap, cmap;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
+ xfs_fileoff_t orig_end_fsb = end_fsb;
int nimaps = 1, error = 0;
bool shared = false;
u16 iomap_flags = 0;
+ bool needs_alloc;
unsigned int lockmode;
u64 seq;
@@ -875,13 +920,37 @@ relock:
(flags & IOMAP_DIRECT) || IS_DAX(inode));
if (error)
goto out_unlock;
- if (shared)
+ if (shared) {
+ if ((flags & IOMAP_ATOMIC) &&
+ !xfs_bmap_hw_atomic_write_possible(ip, &cmap,
+ offset_fsb, end_fsb)) {
+ error = -ENOPROTOOPT;
+ goto out_unlock;
+ }
goto out_found_cow;
+ }
end_fsb = imap.br_startoff + imap.br_blockcount;
length = XFS_FSB_TO_B(mp, end_fsb) - offset;
}
- if (imap_needs_alloc(inode, flags, &imap, nimaps))
+ needs_alloc = imap_needs_alloc(inode, flags, &imap, nimaps);
+
+ if (flags & IOMAP_ATOMIC) {
+ error = -ENOPROTOOPT;
+ /*
+ * If we allocate less than what is required for the write
+ * then we may end up with multiple extents, which means that
+ * REQ_ATOMIC-based atomics cannot be used, so avoid this possibility.
+ */
+ if (needs_alloc && orig_end_fsb - offset_fsb > 1)
+ goto out_unlock;
+
+ if (!xfs_bmap_hw_atomic_write_possible(ip, &imap, offset_fsb,
+ orig_end_fsb))
+ goto out_unlock;
+ }
+
+ if (needs_alloc)
goto allocate_blocks;
/*
@@ -1022,6 +1091,190 @@ const struct iomap_ops xfs_zoned_direct_write_iomap_ops = {
};
#endif /* CONFIG_XFS_RT */
+#ifdef DEBUG
+static void
+xfs_check_atomic_cow_conversion(
+ struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb,
+ xfs_filblks_t count_fsb,
+ const struct xfs_bmbt_irec *cmap)
+{
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec cmap2 = { };
+
+ if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap2))
+ xfs_trim_extent(&cmap2, offset_fsb, count_fsb);
+
+ ASSERT(cmap2.br_startoff == cmap->br_startoff);
+ ASSERT(cmap2.br_blockcount == cmap->br_blockcount);
+ ASSERT(cmap2.br_startblock == cmap->br_startblock);
+ ASSERT(cmap2.br_state == cmap->br_state);
+}
+#else
+# define xfs_check_atomic_cow_conversion(...) ((void)0)
+#endif
+
+static int
+xfs_atomic_write_cow_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ const xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ const xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length);
+ const xfs_filblks_t count_fsb = end_fsb - offset_fsb;
+ xfs_filblks_t hole_count_fsb;
+ int nmaps = 1;
+ xfs_filblks_t resaligned;
+ struct xfs_bmbt_irec cmap;
+ struct xfs_iext_cursor icur;
+ struct xfs_trans *tp;
+ unsigned int dblocks = 0, rblocks = 0;
+ int error;
+ u64 seq;
+
+ ASSERT(flags & IOMAP_WRITE);
+ ASSERT(flags & IOMAP_DIRECT);
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ if (!xfs_can_sw_atomic_write(mp)) {
+ ASSERT(xfs_can_sw_atomic_write(mp));
+ return -EINVAL;
+ }
+
+ /* blocks are always allocated in this path */
+ if (flags & IOMAP_NOWAIT)
+ return -EAGAIN;
+
+ trace_xfs_iomap_atomic_write_cow(ip, offset, length);
+retry:
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
+
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
+ cmap.br_startoff = end_fsb;
+ if (cmap.br_startoff <= offset_fsb) {
+ if (isnullstartblock(cmap.br_startblock))
+ goto convert_delay;
+
+ /*
+ * cmap could extend outside the write range due to previous
+ * speculative preallocations. We must trim cmap to the write
+ * range because the cow fork treats written mappings to mean
+ * "write in progress".
+ */
+ xfs_trim_extent(&cmap, offset_fsb, count_fsb);
+ goto found;
+ }
+
+ hole_count_fsb = cmap.br_startoff - offset_fsb;
+
+ resaligned = xfs_aligned_fsb_count(offset_fsb, hole_count_fsb,
+ xfs_get_cowextsz_hint(ip));
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ rblocks = resaligned;
+ } else {
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
+ rblocks = 0;
+ }
+
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks,
+ rblocks, false, &tp);
+ if (error)
+ return error;
+
+ /* extent layout could have changed since the unlock, so check again */
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
+ cmap.br_startoff = end_fsb;
+ if (cmap.br_startoff <= offset_fsb) {
+ xfs_trans_cancel(tp);
+ if (isnullstartblock(cmap.br_startblock))
+ goto convert_delay;
+ xfs_trim_extent(&cmap, offset_fsb, count_fsb);
+ goto found;
+ }
+
+ /*
+ * Allocate the entire reservation as unwritten blocks.
+ *
+ * Use XFS_BMAPI_EXTSZALIGN to hint at aligning new extents according to
+ * extszhint, such that there will be a greater chance that future
+ * atomic writes to that same range will be aligned (and don't require
+ * this COW-based method).
+ */
+ error = xfs_bmapi_write(tp, ip, offset_fsb, hole_count_fsb,
+ XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC |
+ XFS_BMAPI_EXTSZALIGN, 0, &cmap, &nmaps);
+ if (error) {
+ xfs_trans_cancel(tp);
+ goto out_unlock;
+ }
+
+ xfs_inode_set_cowblocks_tag(ip);
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * cmap could map more blocks than the range we passed into bmapi_write
+ * because of EXTSZALIGN or adjacent pre-existing unwritten mappings
+ * that were merged. Trim cmap to the original write range so that we
+ * don't convert more than we were asked to do for this write.
+ */
+ xfs_trim_extent(&cmap, offset_fsb, count_fsb);
+
+found:
+ if (cmap.br_state != XFS_EXT_NORM) {
+ error = xfs_reflink_convert_cow_locked(ip, cmap.br_startoff,
+ cmap.br_blockcount);
+ if (error)
+ goto out_unlock;
+ cmap.br_state = XFS_EXT_NORM;
+ xfs_check_atomic_cow_conversion(ip, offset_fsb, count_fsb,
+ &cmap);
+ }
+
+ trace_xfs_iomap_found(ip, offset, length, XFS_COW_FORK, &cmap);
+ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
+
+convert_delay:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ error = xfs_bmapi_convert_delalloc(ip, XFS_COW_FORK, offset, iomap,
+ NULL);
+ if (error)
+ return error;
+
+ /*
+ * Try the lookup again, because the delalloc conversion might have
+ * turned the COW mapping into unwritten, but we need it to be in
+ * written state.
+ */
+ goto retry;
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+const struct iomap_ops xfs_atomic_write_cow_iomap_ops = {
+ .iomap_begin = xfs_atomic_write_cow_iomap_begin,
+};
+
static int
xfs_dax_write_iomap_end(
struct inode *inode,
@@ -1366,7 +1619,7 @@ xfs_zoned_buffered_write_iomap_begin(
return error;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
error = -EFSCORRUPTED;
goto out_unlock;
@@ -1505,6 +1758,8 @@ xfs_buffered_write_iomap_begin(
struct iomap *iomap,
struct iomap *srcmap)
{
+ struct iomap_iter *iter = container_of(iomap, struct iomap_iter,
+ iomap);
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -1540,7 +1795,7 @@ xfs_buffered_write_iomap_begin(
return error;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
- XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) {
xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
error = -EFSCORRUPTED;
goto out_unlock;
@@ -1570,21 +1825,41 @@ xfs_buffered_write_iomap_begin(
}
/*
- * For zeroing, trim a delalloc extent that extends beyond the EOF
- * block. If it starts beyond the EOF block, convert it to an
+ * For zeroing, trim extents that extend beyond the EOF block. If a
+ * delalloc extent starts beyond the EOF block, convert it to an
* unwritten extent.
*/
- if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb &&
- isnullstartblock(imap.br_startblock)) {
+ if (flags & IOMAP_ZERO) {
xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
+ u64 end;
- if (offset_fsb >= eof_fsb)
+ if (isnullstartblock(imap.br_startblock) &&
+ offset_fsb >= eof_fsb)
goto convert_delay;
- if (end_fsb > eof_fsb) {
+ if (offset_fsb < eof_fsb && end_fsb > eof_fsb)
end_fsb = eof_fsb;
- xfs_trim_extent(&imap, offset_fsb,
- end_fsb - offset_fsb);
+
+ /*
+ * Look up dirty folios for unwritten mappings within EOF.
+ * Providing this bypasses the flush iomap uses to trigger
+ * extent conversion when unwritten mappings have dirty
+ * pagecache in need of zeroing.
+ *
+ * Trim the mapping to the end pos of the lookup, which in turn
+ * was trimmed to the end of the batch if it became full before
+ * the end of the mapping.
+ */
+ if (imap.br_state == XFS_EXT_UNWRITTEN &&
+ offset_fsb < eof_fsb) {
+ loff_t len = min(count,
+ XFS_FSB_TO_B(mp, imap.br_blockcount));
+
+ end = iomap_fill_dirty_folios(iter, offset, len);
+ end_fsb = min_t(xfs_fileoff_t, end_fsb,
+ XFS_B_TO_FSB(mp, end));
}
+
+ xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
}
/*
@@ -2012,7 +2287,8 @@ xfs_zero_range(
return dax_zero_range(inode, pos, len, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_zero_range(inode, pos, len, did_zero,
- &xfs_buffered_write_iomap_ops, ac);
+ &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
+ ac);
}
int
@@ -2028,5 +2304,6 @@ xfs_truncate_page(
return dax_truncate_page(inode, pos, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_truncate_page(inode, pos, did_zero,
- &xfs_buffered_write_iomap_ops, ac);
+ &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
+ ac);
}
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index d330c4a581b1..ebcce7d49446 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -56,5 +56,7 @@ extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
extern const struct iomap_ops xfs_dax_write_iomap_ops;
+extern const struct iomap_ops xfs_atomic_write_cow_iomap_ops;
+extern const struct iomap_write_ops xfs_iomap_write_ops;
#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 756bd3ca8e00..ad94fbf55014 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -431,14 +431,12 @@ xfs_vn_symlink(
struct dentry *dentry,
const char *symname)
{
- struct inode *inode;
- struct xfs_inode *cip = NULL;
- struct xfs_name name;
- int error;
- umode_t mode;
+ struct inode *inode;
+ struct xfs_inode *cip = NULL;
+ struct xfs_name name;
+ int error;
+ umode_t mode = S_IFLNK | S_IRWXUGO;
- mode = S_IFLNK |
- (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
error = xfs_dentry_mode_to_name(&name, dentry, mode);
if (unlikely(error))
goto out;
@@ -601,16 +599,83 @@ xfs_report_dioalign(
stat->dio_offset_align = stat->dio_read_offset_align;
}
+unsigned int
+xfs_get_atomic_write_min(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ /*
+ * If we can complete an atomic write via atomic out of place writes,
+ * then advertise a minimum size of one fsblock. Without this
+ * mechanism, we can only guarantee atomic writes up to a single LBA.
+ * If out of place writes are not available, we can guarantee an atomic
+ * write of exactly one single fsblock if the bdev will make that
+ * guarantee for us. Either way the minimum is sb_blocksize; if
+ * neither check passes for this inode, report 0.
+ */
+ if (xfs_inode_can_hw_atomic_write(ip) ||
+ xfs_inode_can_sw_atomic_write(ip))
+ return mp->m_sb.sb_blocksize;
+
+ return 0;
+}
+
+unsigned int
+xfs_get_atomic_write_max(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ /*
+ * If out of place writes are not available, we can guarantee an atomic
+ * write of exactly one single fsblock if the bdev will make that
+ * guarantee for us; if it will not, report 0.
+ */
+ if (!xfs_inode_can_sw_atomic_write(ip)) {
+ if (xfs_inode_can_hw_atomic_write(ip))
+ return mp->m_sb.sb_blocksize;
+ return 0;
+ }
+
+ /*
+ * With atomic out of place writes, advertise the most we can complete
+ * that way: m_groups[].awu_max for rtgroups if the inode is realtime,
+ * else for AGs. Hardware support is reported via max_opt, not here.
+ */
+ if (XFS_IS_REALTIME_INODE(ip))
+ return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].awu_max);
+ return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_AG].awu_max);
+}
+
+unsigned int
+xfs_get_atomic_write_max_opt(
+ struct xfs_inode *ip)
+{
+ unsigned int awu_max = xfs_get_atomic_write_max(ip);
+
+ /* if the max is at most 1x block, keep behaviour that opt is 0 */
+ if (awu_max <= ip->i_mount->m_sb.sb_blocksize)
+ return 0;
+
+ /*
+ * Advertise the maximum size of an atomic write that we can tell the
+ * block device to perform for us (the buftarg's bt_awu_max). In
+ * general the bdev limit will be less than our out of place write
+ * limit, but we don't want to exceed the awu_max, hence the min().
+ */
+ return min(awu_max, xfs_inode_buftarg(ip)->bt_awu_max);
+}
+
static void
xfs_report_atomic_write(
struct xfs_inode *ip,
struct kstat *stat)
{
- unsigned int unit_min = 0, unit_max = 0;
-
- if (xfs_inode_can_atomicwrite(ip))
- unit_min = unit_max = ip->i_mount->m_sb.sb_blocksize;
- generic_fill_statx_atomic_writes(stat, unit_min, unit_max);
+ generic_fill_statx_atomic_writes(stat,
+ xfs_get_atomic_write_min(ip),
+ xfs_get_atomic_write_max(ip),
+ xfs_get_atomic_write_max_opt(ip));
}
STATIC int
@@ -904,7 +969,7 @@ xfs_setattr_size(
* change.
*/
if (xfs_is_zoned_inode(ip)) {
- error = xfs_zoned_space_reserve(ip, 1,
+ error = xfs_zoned_space_reserve(mp, 1,
XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac);
if (error) {
if (error == -EAGAIN)
@@ -932,7 +997,7 @@ xfs_setattr_size(
}
if (xfs_is_zoned_inode(ip))
- xfs_zoned_space_unreserve(ip, &ac);
+ xfs_zoned_space_unreserve(mp, &ac);
if (error)
return error;
@@ -1268,6 +1333,8 @@ static const struct inode_operations xfs_symlink_inode_operations = {
.setattr = xfs_vn_setattr,
.listxattr = xfs_vn_listxattr,
.update_time = xfs_vn_update_time,
+ .fileattr_get = xfs_fileattr_get,
+ .fileattr_set = xfs_fileattr_set,
};
/* Figure out if this file actually supports DAX. */
@@ -1353,7 +1420,7 @@ xfs_setup_inode(
bool is_meta = xfs_is_internal_inode(ip);
inode->i_ino = ip->i_ino;
- inode->i_state |= I_NEW;
+ inode_state_set_raw(inode, I_NEW);
inode_sb_list_add(inode);
/* make the inode look hashed for the writeback code */
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index 3c1a2605ffd2..0896f6b8b3b8 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -19,5 +19,8 @@ int xfs_inode_init_security(struct inode *inode, struct inode *dir,
extern void xfs_setup_inode(struct xfs_inode *ip);
extern void xfs_setup_iops(struct xfs_inode *ip);
extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init);
+unsigned int xfs_get_atomic_write_min(struct xfs_inode *ip);
+unsigned int xfs_get_atomic_write_max(struct xfs_inode *ip);
+unsigned int xfs_get_atomic_write_max_opt(struct xfs_inode *ip);
#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 1fa1c0564b0c..2aa37a4d2706 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -239,14 +239,10 @@ xfs_bulkstat_one(
* Grab an empty transaction so that we can use its recursive buffer
* locking abilities to detect cycles in the inobt without deadlocking.
*/
- error = xfs_trans_alloc_empty(breq->mp, &tp);
- if (error)
- goto out;
-
+ tp = xfs_trans_alloc_empty(breq->mp);
error = xfs_bulkstat_one_int(breq->mp, breq->idmap, tp,
breq->startino, &bc);
xfs_trans_cancel(tp);
-out:
kfree(bc.buf);
/*
@@ -311,7 +307,6 @@ xfs_bulkstat(
.breq = breq,
};
struct xfs_trans *tp;
- unsigned int iwalk_flags = 0;
int error;
if (breq->idmap != &nop_mnt_idmap) {
@@ -331,17 +326,10 @@ xfs_bulkstat(
* Grab an empty transaction so that we can use its recursive buffer
* locking abilities to detect cycles in the inobt without deadlocking.
*/
- error = xfs_trans_alloc_empty(breq->mp, &tp);
- if (error)
- goto out;
-
- if (breq->flags & XFS_IBULK_SAME_AG)
- iwalk_flags |= XFS_IWALK_SAME_AG;
-
- error = xfs_iwalk(breq->mp, tp, breq->startino, iwalk_flags,
+ tp = xfs_trans_alloc_empty(breq->mp);
+ error = xfs_iwalk(breq->mp, tp, breq->startino, breq->iwalk_flags,
xfs_bulkstat_iwalk, breq->icount, &bc);
xfs_trans_cancel(tp);
-out:
kfree(bc.buf);
/*
@@ -464,14 +452,10 @@ xfs_inumbers(
* Grab an empty transaction so that we can use its recursive buffer
* locking abilities to detect cycles in the inobt without deadlocking.
*/
- error = xfs_trans_alloc_empty(breq->mp, &tp);
- if (error)
- goto out;
-
- error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->flags,
+ tp = xfs_trans_alloc_empty(breq->mp);
+ error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->iwalk_flags,
xfs_inumbers_walk, breq->icount, &ic);
xfs_trans_cancel(tp);
-out:
/*
* We found some inode groups, so clear the error status and return
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index f10e8f8f2335..2d0612f14d6e 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -13,17 +13,15 @@ struct xfs_ibulk {
xfs_ino_t startino; /* start with this inode */
unsigned int icount; /* number of elements in ubuffer */
unsigned int ocount; /* number of records returned */
- unsigned int flags; /* see XFS_IBULK_FLAG_* */
+ unsigned int flags; /* XFS_IBULK_FLAG_* */
+ unsigned int iwalk_flags; /* XFS_IWALK_FLAG_* */
};
-/* Only iterate within the same AG as startino */
-#define XFS_IBULK_SAME_AG (1U << 0)
-
/* Fill out the bs_extents64 field if set. */
-#define XFS_IBULK_NREXT64 (1U << 1)
+#define XFS_IBULK_NREXT64 (1U << 0)
/* Signal that we can return metadata directories. */
-#define XFS_IBULK_METADIR (1U << 2)
+#define XFS_IBULK_METADIR (1U << 1)
/*
* Advance the user buffer pointer by one record of the given size. If the
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 7db3ece370b1..c1c31d1a8e21 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -377,11 +377,8 @@ xfs_iwalk_run_callbacks(
if (!has_more)
return 0;
- if (iwag->drop_trans) {
- error = xfs_trans_alloc_empty(mp, &iwag->tp);
- if (error)
- return error;
- }
+ if (iwag->drop_trans)
+ iwag->tp = xfs_trans_alloc_empty(mp);
/* ...and recreate the cursor just past where we left off. */
error = xfs_ialloc_read_agi(iwag->pag, iwag->tp, 0, agi_bpp);
@@ -617,9 +614,7 @@ xfs_iwalk_ag_work(
* Grab an empty transaction so that we can use its recursive buffer
* locking abilities to detect cycles in the inobt without deadlocking.
*/
- error = xfs_trans_alloc_empty(mp, &iwag->tp);
- if (error)
- goto out;
+ iwag->tp = xfs_trans_alloc_empty(mp);
iwag->drop_trans = 1;
error = xfs_iwalk_ag(iwag);
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 9a2221b4aa21..4dd747bdbcca 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -89,8 +89,6 @@ typedef __u32 xfs_nlink_t;
#undef XFS_NATIVE_HOST
#endif
-#define irix_sgid_inherit xfs_params.sgid_inherit.val
-#define irix_symlink_mode xfs_params.symlink_mode.val
#define xfs_panic_mask xfs_params.panic_mask.val
#define xfs_error_level xfs_params.error_level.val
#define xfs_syncd_centisecs xfs_params.syncd_timer.val
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 6493bdb57351..a311385b23d8 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -109,14 +109,14 @@ xlog_prepare_iovec(
vec = &lv->lv_iovecp[0];
}
- len = lv->lv_buf_len + sizeof(struct xlog_op_header);
+ len = lv->lv_buf_used + sizeof(struct xlog_op_header);
if (!IS_ALIGNED(len, sizeof(uint64_t))) {
- lv->lv_buf_len = round_up(len, sizeof(uint64_t)) -
+ lv->lv_buf_used = round_up(len, sizeof(uint64_t)) -
sizeof(struct xlog_op_header);
}
vec->i_type = type;
- vec->i_addr = lv->lv_buf + lv->lv_buf_len;
+ vec->i_addr = lv->lv_buf + lv->lv_buf_used;
oph = vec->i_addr;
oph->oh_clientid = XFS_TRANSACTION;
@@ -534,8 +534,8 @@ xlog_state_release_iclog(
*/
if ((iclog->ic_state == XLOG_STATE_WANT_SYNC ||
(iclog->ic_flags & XLOG_ICL_NEED_FUA)) &&
- !iclog->ic_header.h_tail_lsn) {
- iclog->ic_header.h_tail_lsn =
+ !iclog->ic_header->h_tail_lsn) {
+ iclog->ic_header->h_tail_lsn =
cpu_to_be64(atomic64_read(&log->l_tail_lsn));
}
@@ -969,8 +969,8 @@ xfs_log_unmount_write(
* counters will be recalculated. Refer to xlog_check_unmount_rec for
* more details.
*/
- if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
- XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
+ if (xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS) ||
+ XFS_TEST_ERROR(mp, XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
xfs_alert(mp, "%s: will fix summary counters at next mount",
__func__);
return;
@@ -1240,7 +1240,7 @@ xlog_ioend_work(
/*
* Race to shutdown the filesystem if we see an error.
*/
- if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) {
+ if (error || XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_IODONE_IOERR)) {
xfs_alert(log->l_mp, "log I/O error %d", error);
xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
}
@@ -1279,11 +1279,12 @@ xlog_get_iclog_buffer_size(
log->l_iclog_size = mp->m_logbsize;
/*
- * # headers = size / 32k - one header holds cycles from 32k of data.
+ * Combined size of the log record headers. The first 32k cycles
+ * are stored directly in the xlog_rec_header, the rest in the
+ * variable number of xlog_rec_ext_headers at its end.
*/
- log->l_iclog_heads =
- DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE);
- log->l_iclog_hsize = log->l_iclog_heads << BBSHIFT;
+ log->l_iclog_hsize = struct_size(log->l_iclog->ic_header, h_ext,
+ DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE) - 1);
}
void
@@ -1367,9 +1368,8 @@ xlog_alloc_log(
int num_bblks)
{
struct xlog *log;
- xlog_rec_header_t *head;
- xlog_in_core_t **iclogp;
- xlog_in_core_t *iclog, *prev_iclog=NULL;
+ struct xlog_in_core **iclogp;
+ struct xlog_in_core *iclog, *prev_iclog = NULL;
int i;
int error = -ENOMEM;
uint log2_size = 0;
@@ -1436,13 +1436,6 @@ xlog_alloc_log(
init_waitqueue_head(&log->l_flush_wait);
iclogp = &log->l_iclog;
- /*
- * The amount of memory to allocate for the iclog structure is
- * rather funky due to the way the structure is defined. It is
- * done this way so that we can use different sizes for machines
- * with different amounts of memory. See the definition of
- * xlog_in_core_t in xfs_log_priv.h for details.
- */
ASSERT(log->l_iclog_size >= 4096);
for (i = 0; i < log->l_iclog_bufs; i++) {
size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) *
@@ -1457,26 +1450,25 @@ xlog_alloc_log(
iclog->ic_prev = prev_iclog;
prev_iclog = iclog;
- iclog->ic_data = kvzalloc(log->l_iclog_size,
+ iclog->ic_header = kvzalloc(log->l_iclog_size,
GFP_KERNEL | __GFP_RETRY_MAYFAIL);
- if (!iclog->ic_data)
+ if (!iclog->ic_header)
goto out_free_iclog;
- head = &iclog->ic_header;
- memset(head, 0, sizeof(xlog_rec_header_t));
- head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
- head->h_version = cpu_to_be32(
+ iclog->ic_header->h_magicno =
+ cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
+ iclog->ic_header->h_version = cpu_to_be32(
xfs_has_logv2(log->l_mp) ? 2 : 1);
- head->h_size = cpu_to_be32(log->l_iclog_size);
- /* new fields */
- head->h_fmt = cpu_to_be32(XLOG_FMT);
- memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
+ iclog->ic_header->h_size = cpu_to_be32(log->l_iclog_size);
+ iclog->ic_header->h_fmt = cpu_to_be32(XLOG_FMT);
+ memcpy(&iclog->ic_header->h_fs_uuid, &mp->m_sb.sb_uuid,
+ sizeof(iclog->ic_header->h_fs_uuid));
+ iclog->ic_datap = (void *)iclog->ic_header + log->l_iclog_hsize;
iclog->ic_size = log->l_iclog_size - log->l_iclog_hsize;
iclog->ic_state = XLOG_STATE_ACTIVE;
iclog->ic_log = log;
atomic_set(&iclog->ic_refcnt, 0);
INIT_LIST_HEAD(&iclog->ic_callbacks);
- iclog->ic_datap = (void *)iclog->ic_data + log->l_iclog_hsize;
init_waitqueue_head(&iclog->ic_force_wait);
init_waitqueue_head(&iclog->ic_write_wait);
@@ -1489,8 +1481,7 @@ xlog_alloc_log(
log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s",
- XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM |
- WQ_HIGHPRI),
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_PERCPU),
0, mp->m_super->s_id);
if (!log->l_ioend_workqueue)
goto out_free_iclog;
@@ -1505,7 +1496,7 @@ out_destroy_workqueue:
out_free_iclog:
for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
prev_iclog = iclog->ic_next;
- kvfree(iclog->ic_data);
+ kvfree(iclog->ic_header);
kfree(iclog);
if (prev_iclog == log->l_iclog)
break;
@@ -1525,36 +1516,19 @@ xlog_pack_data(
struct xlog_in_core *iclog,
int roundoff)
{
- int i, j, k;
- int size = iclog->ic_offset + roundoff;
- __be32 cycle_lsn;
- char *dp;
-
- cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
+ struct xlog_rec_header *rhead = iclog->ic_header;
+ __be32 cycle_lsn = CYCLE_LSN_DISK(rhead->h_lsn);
+ char *dp = iclog->ic_datap;
+ int i;
- dp = iclog->ic_datap;
- for (i = 0; i < BTOBB(size); i++) {
- if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE))
- break;
- iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
+ for (i = 0; i < BTOBB(iclog->ic_offset + roundoff); i++) {
+ *xlog_cycle_data(rhead, i) = *(__be32 *)dp;
*(__be32 *)dp = cycle_lsn;
dp += BBSIZE;
}
- if (xfs_has_logv2(log->l_mp)) {
- xlog_in_core_2_t *xhdr = iclog->ic_data;
-
- for ( ; i < BTOBB(size); i++) {
- j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
- k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
- xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
- *(__be32 *)dp = cycle_lsn;
- dp += BBSIZE;
- }
-
- for (i = 1; i < log->l_iclog_heads; i++)
- xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
- }
+ for (i = 0; i < (log->l_iclog_hsize >> BBSHIFT) - 1; i++)
+ rhead->h_ext[i].xh_cycle = cycle_lsn;
}
/*
@@ -1568,27 +1542,22 @@ xlog_cksum(
struct xlog *log,
struct xlog_rec_header *rhead,
char *dp,
- int size)
+ unsigned int hdrsize,
+ unsigned int size)
{
uint32_t crc;
/* first generate the crc for the record header ... */
- crc = xfs_start_cksum_update((char *)rhead,
- sizeof(struct xlog_rec_header),
+ crc = xfs_start_cksum_update((char *)rhead, hdrsize,
offsetof(struct xlog_rec_header, h_crc));
/* ... then for additional cycle data for v2 logs ... */
if (xfs_has_logv2(log->l_mp)) {
- union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead;
- int i;
- int xheads;
-
- xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE);
+ int xheads, i;
- for (i = 1; i < xheads; i++) {
- crc = crc32c(crc, &xhdr[i].hic_xheader,
- sizeof(struct xlog_rec_ext_header));
- }
+ xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE) - 1;
+ for (i = 0; i < xheads; i++)
+ crc = crc32c(crc, &rhead->h_ext[i], XLOG_REC_EXT_SIZE);
}
/* ... and finally for the payload */
@@ -1607,27 +1576,6 @@ xlog_bio_end_io(
&iclog->ic_end_io_work);
}
-static int
-xlog_map_iclog_data(
- struct bio *bio,
- void *data,
- size_t count)
-{
- do {
- struct page *page = kmem_to_page(data);
- unsigned int off = offset_in_page(data);
- size_t len = min_t(size_t, count, PAGE_SIZE - off);
-
- if (bio_add_page(bio, page, len, off) != len)
- return -EIO;
-
- data += len;
- count -= len;
- } while (count);
-
- return 0;
-}
-
STATIC void
xlog_write_iclog(
struct xlog *log,
@@ -1693,11 +1641,12 @@ xlog_write_iclog(
iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
- if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count))
- goto shutdown;
-
- if (is_vmalloc_addr(iclog->ic_data))
- flush_kernel_vmap_range(iclog->ic_data, count);
+ if (is_vmalloc_addr(iclog->ic_header)) {
+ if (!bio_add_vmalloc(&iclog->ic_bio, iclog->ic_header, count))
+ goto shutdown;
+ } else {
+ bio_add_virt_nofail(&iclog->ic_bio, iclog->ic_header, count);
+ }
/*
* If this log buffer would straddle the end of the log we will have
@@ -1825,20 +1774,20 @@ xlog_sync(
size = iclog->ic_offset;
if (xfs_has_logv2(log->l_mp))
size += roundoff;
- iclog->ic_header.h_len = cpu_to_be32(size);
+ iclog->ic_header->h_len = cpu_to_be32(size);
XFS_STATS_INC(log->l_mp, xs_log_writes);
XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count));
- bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn));
+ bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header->h_lsn));
/* Do we need to split this write into 2 parts? */
if (bno + BTOBB(count) > log->l_logBBsize)
- xlog_split_iclog(log, &iclog->ic_header, bno, count);
+ xlog_split_iclog(log, iclog->ic_header, bno, count);
/* calculcate the checksum */
- iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
- iclog->ic_datap, size);
+ iclog->ic_header->h_crc = xlog_cksum(log, iclog->ic_header,
+ iclog->ic_datap, XLOG_REC_SIZE, size);
/*
* Intentionally corrupt the log record CRC based on the error injection
* frequency, if defined. This facilitates testing log recovery in the
@@ -1847,12 +1796,12 @@ xlog_sync(
* detects the bad CRC and attempts to recover.
*/
#ifdef DEBUG
- if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) {
- iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA);
+ if (XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) {
+ iclog->ic_header->h_crc &= cpu_to_le32(0xAAAAAAAA);
iclog->ic_fail_crc = true;
xfs_warn(log->l_mp,
"Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.",
- be64_to_cpu(iclog->ic_header.h_lsn));
+ be64_to_cpu(iclog->ic_header->h_lsn));
}
#endif
xlog_verify_iclog(log, iclog, count);
@@ -1864,10 +1813,10 @@ xlog_sync(
*/
STATIC void
xlog_dealloc_log(
- struct xlog *log)
+ struct xlog *log)
{
- xlog_in_core_t *iclog, *next_iclog;
- int i;
+ struct xlog_in_core *iclog, *next_iclog;
+ int i;
/*
* Destroy the CIL after waiting for iclog IO completion because an
@@ -1879,7 +1828,7 @@ xlog_dealloc_log(
iclog = log->l_iclog;
for (i = 0; i < log->l_iclog_bufs; i++) {
next_iclog = iclog->ic_next;
- kvfree(iclog->ic_data);
+ kvfree(iclog->ic_header);
kfree(iclog);
iclog = next_iclog;
}
@@ -1901,7 +1850,7 @@ xlog_state_finish_copy(
{
lockdep_assert_held(&log->l_icloglock);
- be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt);
+ be32_add_cpu(&iclog->ic_header->h_num_logops, record_cnt);
iclog->ic_offset += copy_bytes;
}
@@ -1951,9 +1900,9 @@ xlog_print_trans(
if (!lv)
continue;
xfs_warn(mp, " niovecs = %d", lv->lv_niovecs);
- xfs_warn(mp, " size = %d", lv->lv_size);
+ xfs_warn(mp, " alloc_size = %d", lv->lv_alloc_size);
xfs_warn(mp, " bytes = %d", lv->lv_bytes);
- xfs_warn(mp, " buf len = %d", lv->lv_buf_len);
+ xfs_warn(mp, " buf used= %d", lv->lv_buf_used);
/* dump each iovec for the log item */
vec = lv->lv_iovecp;
@@ -2324,7 +2273,7 @@ xlog_state_activate_iclog(
* We don't need to cover the dummy.
*/
if (*iclogs_changed == 0 &&
- iclog->ic_header.h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) {
+ iclog->ic_header->h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) {
*iclogs_changed = 1;
} else {
/*
@@ -2336,11 +2285,11 @@ xlog_state_activate_iclog(
iclog->ic_state = XLOG_STATE_ACTIVE;
iclog->ic_offset = 0;
- iclog->ic_header.h_num_logops = 0;
- memset(iclog->ic_header.h_cycle_data, 0,
- sizeof(iclog->ic_header.h_cycle_data));
- iclog->ic_header.h_lsn = 0;
- iclog->ic_header.h_tail_lsn = 0;
+ iclog->ic_header->h_num_logops = 0;
+ memset(iclog->ic_header->h_cycle_data, 0,
+ sizeof(iclog->ic_header->h_cycle_data));
+ iclog->ic_header->h_lsn = 0;
+ iclog->ic_header->h_tail_lsn = 0;
}
/*
@@ -2432,7 +2381,7 @@ xlog_get_lowest_lsn(
iclog->ic_state == XLOG_STATE_DIRTY)
continue;
- lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ lsn = be64_to_cpu(iclog->ic_header->h_lsn);
if ((lsn && !lowest_lsn) || XFS_LSN_CMP(lsn, lowest_lsn) < 0)
lowest_lsn = lsn;
} while ((iclog = iclog->ic_next) != log->l_iclog);
@@ -2467,7 +2416,7 @@ xlog_state_iodone_process_iclog(
* If this is not the lowest lsn iclog, then we will leave it
* for another completion to process.
*/
- header_lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ header_lsn = be64_to_cpu(iclog->ic_header->h_lsn);
lowest_lsn = xlog_get_lowest_lsn(log);
if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0)
return false;
@@ -2630,9 +2579,9 @@ xlog_state_get_iclog_space(
struct xlog_ticket *ticket,
int *logoffsetp)
{
- int log_offset;
- xlog_rec_header_t *head;
- xlog_in_core_t *iclog;
+ int log_offset;
+ struct xlog_rec_header *head;
+ struct xlog_in_core *iclog;
restart:
spin_lock(&log->l_icloglock);
@@ -2650,7 +2599,7 @@ restart:
goto restart;
}
- head = &iclog->ic_header;
+ head = iclog->ic_header;
atomic_inc(&iclog->ic_refcnt); /* prevents sync */
log_offset = iclog->ic_offset;
@@ -2676,10 +2625,11 @@ restart:
* until you know exactly how many bytes get copied. Therefore, wait
* until later to update ic_offset.
*
- * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's
+ * xlog_write() algorithm assumes that at least 2 xlog_op_header's
* can fit into remaining data section.
*/
- if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
+ if (iclog->ic_size - iclog->ic_offset <
+ 2 * sizeof(struct xlog_op_header)) {
int error = 0;
xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
@@ -2814,7 +2764,7 @@ xlog_state_switch_iclogs(
if (!eventual_size)
eventual_size = iclog->ic_offset;
iclog->ic_state = XLOG_STATE_WANT_SYNC;
- iclog->ic_header.h_prev_block = cpu_to_be32(log->l_prev_block);
+ iclog->ic_header->h_prev_block = cpu_to_be32(log->l_prev_block);
log->l_prev_block = log->l_curr_block;
log->l_prev_cycle = log->l_curr_cycle;
@@ -2858,7 +2808,7 @@ xlog_force_and_check_iclog(
struct xlog_in_core *iclog,
bool *completed)
{
- xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header->h_lsn);
int error;
*completed = false;
@@ -2870,7 +2820,7 @@ xlog_force_and_check_iclog(
* If the iclog has already been completed and reused the header LSN
* will have been rewritten by completion
*/
- if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn)
+ if (be64_to_cpu(iclog->ic_header->h_lsn) != lsn)
*completed = true;
return 0;
}
@@ -2888,7 +2838,7 @@ xlog_force_and_check_iclog(
*
* 1. the current iclog is active and has no data; the previous iclog
* is in the active or dirty state.
- * 2. the current iclog is drity, and the previous iclog is in the
+ * 2. the current iclog is dirty, and the previous iclog is in the
* active or dirty state.
*
* We may sleep if:
@@ -3003,7 +2953,7 @@ xlog_force_lsn(
goto out_error;
iclog = log->l_iclog;
- while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) {
+ while (be64_to_cpu(iclog->ic_header->h_lsn) != lsn) {
trace_xlog_iclog_force_lsn(iclog, _RET_IP_);
iclog = iclog->ic_next;
if (iclog == log->l_iclog)
@@ -3112,16 +3062,16 @@ xfs_log_force_seq(
*/
void
xfs_log_ticket_put(
- xlog_ticket_t *ticket)
+ struct xlog_ticket *ticket)
{
ASSERT(atomic_read(&ticket->t_ref) > 0);
if (atomic_dec_and_test(&ticket->t_ref))
kmem_cache_free(xfs_log_ticket_cache, ticket);
}
-xlog_ticket_t *
+struct xlog_ticket *
xfs_log_ticket_get(
- xlog_ticket_t *ticket)
+ struct xlog_ticket *ticket)
{
ASSERT(atomic_read(&ticket->t_ref) > 0);
atomic_inc(&ticket->t_ref);
@@ -3173,11 +3123,11 @@ xlog_calc_unit_res(
*/
/* for trans header */
- unit_bytes += sizeof(xlog_op_header_t);
- unit_bytes += sizeof(xfs_trans_header_t);
+ unit_bytes += sizeof(struct xlog_op_header);
+ unit_bytes += sizeof(struct xfs_trans_header);
/* for start-rec */
- unit_bytes += sizeof(xlog_op_header_t);
+ unit_bytes += sizeof(struct xlog_op_header);
/*
* for LR headers - the space for data in an iclog is the size minus
@@ -3200,12 +3150,12 @@ xlog_calc_unit_res(
num_headers = howmany(unit_bytes, iclog_space);
/* for split-recs - ophdrs added when data split over LRs */
- unit_bytes += sizeof(xlog_op_header_t) * num_headers;
+ unit_bytes += sizeof(struct xlog_op_header) * num_headers;
/* add extra header reservations if we overrun */
while (!num_headers ||
howmany(unit_bytes, iclog_space) > num_headers) {
- unit_bytes += sizeof(xlog_op_header_t);
+ unit_bytes += sizeof(struct xlog_op_header);
num_headers++;
}
unit_bytes += log->l_iclog_hsize * num_headers;
@@ -3269,7 +3219,7 @@ xlog_verify_dump_tail(
{
xfs_alert(log->l_mp,
"ran out of log space tail 0x%llx/0x%llx, head lsn 0x%llx, head 0x%x/0x%x, prev head 0x%x/0x%x",
- iclog ? be64_to_cpu(iclog->ic_header.h_tail_lsn) : -1,
+ iclog ? be64_to_cpu(iclog->ic_header->h_tail_lsn) : -1,
atomic64_read(&log->l_tail_lsn),
log->l_ailp->ail_head_lsn,
log->l_curr_cycle, log->l_curr_block,
@@ -3288,7 +3238,7 @@ xlog_verify_tail_lsn(
struct xlog *log,
struct xlog_in_core *iclog)
{
- xfs_lsn_t tail_lsn = be64_to_cpu(iclog->ic_header.h_tail_lsn);
+ xfs_lsn_t tail_lsn = be64_to_cpu(iclog->ic_header->h_tail_lsn);
int blocks;
if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) {
@@ -3342,13 +3292,12 @@ xlog_verify_iclog(
struct xlog_in_core *iclog,
int count)
{
- xlog_op_header_t *ophead;
- xlog_in_core_t *icptr;
- xlog_in_core_2_t *xhdr;
- void *base_ptr, *ptr, *p;
+ struct xlog_rec_header *rhead = iclog->ic_header;
+ struct xlog_in_core *icptr;
+ void *base_ptr, *ptr;
ptrdiff_t field_offset;
uint8_t clientid;
- int len, i, j, k, op_len;
+ int len, i, op_len;
int idx;
/* check validity of iclog pointers */
@@ -3362,11 +3311,10 @@ xlog_verify_iclog(
spin_unlock(&log->l_icloglock);
/* check log magic numbers */
- if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
+ if (rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);
- base_ptr = ptr = &iclog->ic_header;
- p = &iclog->ic_header;
+ base_ptr = ptr = rhead;
for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) {
if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
xfs_emerg(log->l_mp, "%s: unexpected magic num",
@@ -3374,29 +3322,19 @@ xlog_verify_iclog(
}
/* check fields */
- len = be32_to_cpu(iclog->ic_header.h_num_logops);
+ len = be32_to_cpu(rhead->h_num_logops);
base_ptr = ptr = iclog->ic_datap;
- ophead = ptr;
- xhdr = iclog->ic_data;
for (i = 0; i < len; i++) {
- ophead = ptr;
+ struct xlog_op_header *ophead = ptr;
+ void *p = &ophead->oh_clientid;
/* clientid is only 1 byte */
- p = &ophead->oh_clientid;
field_offset = p - base_ptr;
if (field_offset & 0x1ff) {
clientid = ophead->oh_clientid;
} else {
idx = BTOBBT((void *)&ophead->oh_clientid - iclog->ic_datap);
- if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
- j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
- k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
- clientid = xlog_get_client_id(
- xhdr[j].hic_xheader.xh_cycle_data[k]);
- } else {
- clientid = xlog_get_client_id(
- iclog->ic_header.h_cycle_data[idx]);
- }
+ clientid = xlog_get_client_id(*xlog_cycle_data(rhead, idx));
}
if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) {
xfs_warn(log->l_mp,
@@ -3412,15 +3350,9 @@ xlog_verify_iclog(
op_len = be32_to_cpu(ophead->oh_len);
} else {
idx = BTOBBT((void *)&ophead->oh_len - iclog->ic_datap);
- if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
- j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
- k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
- op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]);
- } else {
- op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]);
- }
+ op_len = be32_to_cpu(*xlog_cycle_data(rhead, idx));
}
- ptr += sizeof(xlog_op_header_t) + op_len;
+ ptr += sizeof(struct xlog_op_header) + op_len;
}
}
#endif
@@ -3549,19 +3481,19 @@ xlog_force_shutdown(
STATIC int
xlog_iclogs_empty(
- struct xlog *log)
+ struct xlog *log)
{
- xlog_in_core_t *iclog;
+ struct xlog_in_core *iclog = log->l_iclog;
- iclog = log->l_iclog;
do {
/* endianness does not matter here, zero is zero in
* any language.
*/
- if (iclog->ic_header.h_num_logops)
+ if (iclog->ic_header->h_num_logops)
return 0;
iclog = iclog->ic_next;
} while (iclog != log->l_iclog);
+
return 1;
}
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 13455854365f..dcc1f44ed68f 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -16,10 +16,47 @@ struct xfs_log_vec {
struct xfs_log_item *lv_item; /* owner */
char *lv_buf; /* formatted buffer */
int lv_bytes; /* accounted space in buffer */
- int lv_buf_len; /* aligned size of buffer */
- int lv_size; /* size of allocated lv */
+ int lv_buf_used; /* buffer space used so far */
+ int lv_alloc_size; /* size of allocated lv */
};
+/* Region types for iovec's i_type */
+#define XLOG_REG_TYPE_BFORMAT 1
+#define XLOG_REG_TYPE_BCHUNK 2
+#define XLOG_REG_TYPE_EFI_FORMAT 3
+#define XLOG_REG_TYPE_EFD_FORMAT 4
+#define XLOG_REG_TYPE_IFORMAT 5
+#define XLOG_REG_TYPE_ICORE 6
+#define XLOG_REG_TYPE_IEXT 7
+#define XLOG_REG_TYPE_IBROOT 8
+#define XLOG_REG_TYPE_ILOCAL 9
+#define XLOG_REG_TYPE_IATTR_EXT 10
+#define XLOG_REG_TYPE_IATTR_BROOT 11
+#define XLOG_REG_TYPE_IATTR_LOCAL 12
+#define XLOG_REG_TYPE_QFORMAT 13
+#define XLOG_REG_TYPE_DQUOT 14
+#define XLOG_REG_TYPE_QUOTAOFF 15
+#define XLOG_REG_TYPE_LRHEADER 16
+#define XLOG_REG_TYPE_UNMOUNT 17
+#define XLOG_REG_TYPE_COMMIT 18
+#define XLOG_REG_TYPE_TRANSHDR 19
+#define XLOG_REG_TYPE_ICREATE 20
+#define XLOG_REG_TYPE_RUI_FORMAT 21
+#define XLOG_REG_TYPE_RUD_FORMAT 22
+#define XLOG_REG_TYPE_CUI_FORMAT 23
+#define XLOG_REG_TYPE_CUD_FORMAT 24
+#define XLOG_REG_TYPE_BUI_FORMAT 25
+#define XLOG_REG_TYPE_BUD_FORMAT 26
+#define XLOG_REG_TYPE_ATTRI_FORMAT 27
+#define XLOG_REG_TYPE_ATTRD_FORMAT 28
+#define XLOG_REG_TYPE_ATTR_NAME 29
+#define XLOG_REG_TYPE_ATTR_VALUE 30
+#define XLOG_REG_TYPE_XMI_FORMAT 31
+#define XLOG_REG_TYPE_XMD_FORMAT 32
+#define XLOG_REG_TYPE_ATTR_NEWNAME 33
+#define XLOG_REG_TYPE_ATTR_NEWVALUE 34
+#define XLOG_REG_TYPE_MAX 34
+
#define XFS_LOG_VEC_ORDERED (-1)
/*
@@ -64,12 +101,13 @@ xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec,
oph->oh_len = cpu_to_be32(len);
len += sizeof(struct xlog_op_header);
- lv->lv_buf_len += len;
+ lv->lv_buf_used += len;
lv->lv_bytes += len;
vec->i_len = len;
/* Catch buffer overruns */
- ASSERT((void *)lv->lv_buf + lv->lv_bytes <= (void *)lv + lv->lv_size);
+ ASSERT((void *)lv->lv_buf + lv->lv_bytes <=
+ (void *)lv + lv->lv_alloc_size);
}
/*
@@ -87,13 +125,6 @@ xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
return buf;
}
-static inline void *
-xlog_copy_from_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
- const struct xfs_log_iovec *src)
-{
- return xlog_copy_iovec(lv, vecp, src->i_type, src->i_addr, src->i_len);
-}
-
/*
* By comparing each component, we don't have to worry about extra
* endian issues in treating two 32 bit numbers as one 64 bit number
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 1ca406ec1b40..778ac47adb8c 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -275,7 +275,7 @@ xlog_cil_alloc_shadow_bufs(
struct xfs_log_vec *lv;
int niovecs = 0;
int nbytes = 0;
- int buf_size;
+ int alloc_size;
bool ordered = false;
/* Skip items which aren't dirty in this transaction. */
@@ -309,23 +309,21 @@ xlog_cil_alloc_shadow_bufs(
* Then round nbytes up to 64-bit alignment so that the initial
* buffer alignment is easy to calculate and verify.
*/
- nbytes += niovecs *
- (sizeof(uint64_t) + sizeof(struct xlog_op_header));
- nbytes = round_up(nbytes, sizeof(uint64_t));
+ nbytes = xlog_item_space(niovecs, nbytes);
/*
* The data buffer needs to start 64-bit aligned, so round up
* that space to ensure we can align it appropriately and not
* overrun the buffer.
*/
- buf_size = nbytes + xlog_cil_iovec_space(niovecs);
+ alloc_size = nbytes + xlog_cil_iovec_space(niovecs);
/*
* if we have no shadow buffer, or it is too small, we need to
* reallocate it.
*/
if (!lip->li_lv_shadow ||
- buf_size > lip->li_lv_shadow->lv_size) {
+ alloc_size > lip->li_lv_shadow->lv_alloc_size) {
/*
* We free and allocate here as a realloc would copy
* unnecessary data. We don't use kvzalloc() for the
@@ -334,15 +332,15 @@ xlog_cil_alloc_shadow_bufs(
* storage.
*/
kvfree(lip->li_lv_shadow);
- lv = xlog_kvmalloc(buf_size);
+ lv = xlog_kvmalloc(alloc_size);
memset(lv, 0, xlog_cil_iovec_space(niovecs));
INIT_LIST_HEAD(&lv->lv_list);
lv->lv_item = lip;
- lv->lv_size = buf_size;
+ lv->lv_alloc_size = alloc_size;
if (ordered)
- lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+ lv->lv_buf_used = XFS_LOG_VEC_ORDERED;
else
lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
lip->li_lv_shadow = lv;
@@ -350,9 +348,9 @@ xlog_cil_alloc_shadow_bufs(
/* same or smaller, optimise common overwrite case */
lv = lip->li_lv_shadow;
if (ordered)
- lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+ lv->lv_buf_used = XFS_LOG_VEC_ORDERED;
else
- lv->lv_buf_len = 0;
+ lv->lv_buf_used = 0;
lv->lv_bytes = 0;
}
@@ -372,30 +370,30 @@ xlog_cil_alloc_shadow_bufs(
STATIC void
xfs_cil_prepare_item(
struct xlog *log,
+ struct xfs_log_item *lip,
struct xfs_log_vec *lv,
- struct xfs_log_vec *old_lv,
int *diff_len)
{
/* Account for the new LV being passed in */
- if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
+ if (lv->lv_buf_used != XFS_LOG_VEC_ORDERED)
*diff_len += lv->lv_bytes;
/*
* If there is no old LV, this is the first time we've seen the item in
* this CIL context and so we need to pin it. If we are replacing the
- * old_lv, then remove the space it accounts for and make it the shadow
+ * old lv, then remove the space it accounts for and make it the shadow
* buffer for later freeing. In both cases we are now switching to the
* shadow buffer, so update the pointer to it appropriately.
*/
- if (!old_lv) {
+ if (!lip->li_lv) {
if (lv->lv_item->li_ops->iop_pin)
lv->lv_item->li_ops->iop_pin(lv->lv_item);
lv->lv_item->li_lv_shadow = NULL;
- } else if (old_lv != lv) {
- ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
+ } else if (lip->li_lv != lv) {
+ ASSERT(lv->lv_buf_used != XFS_LOG_VEC_ORDERED);
- *diff_len -= old_lv->lv_bytes;
- lv->lv_item->li_lv_shadow = old_lv;
+ *diff_len -= lip->li_lv->lv_bytes;
+ lv->lv_item->li_lv_shadow = lip->li_lv;
}
/* attach new log vector to log item */
@@ -454,10 +452,8 @@ xlog_cil_insert_format_items(
}
list_for_each_entry(lip, &tp->t_items, li_trans) {
- struct xfs_log_vec *lv;
- struct xfs_log_vec *old_lv = NULL;
- struct xfs_log_vec *shadow;
- bool ordered = false;
+ struct xfs_log_vec *lv = lip->li_lv;
+ struct xfs_log_vec *shadow = lip->li_lv_shadow;
/* Skip items which aren't dirty in this transaction. */
if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
@@ -467,22 +463,23 @@ xlog_cil_insert_format_items(
* The formatting size information is already attached to
* the shadow lv on the log item.
*/
- shadow = lip->li_lv_shadow;
- if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED)
- ordered = true;
+ if (shadow->lv_buf_used == XFS_LOG_VEC_ORDERED) {
+ if (!lv) {
+ lv = shadow;
+ lv->lv_item = lip;
+ }
+ ASSERT(shadow->lv_alloc_size == lv->lv_alloc_size);
+ xfs_cil_prepare_item(log, lip, lv, diff_len);
+ continue;
+ }
/* Skip items that do not have any vectors for writing */
- if (!shadow->lv_niovecs && !ordered)
+ if (!shadow->lv_niovecs)
continue;
/* compare to existing item size */
- old_lv = lip->li_lv;
- if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
+ if (lv && shadow->lv_alloc_size <= lv->lv_alloc_size) {
/* same or smaller, optimise common overwrite case */
- lv = lip->li_lv;
-
- if (ordered)
- goto insert;
/*
* set the item up as though it is a new insertion so
@@ -494,7 +491,7 @@ xlog_cil_insert_format_items(
lv->lv_niovecs = shadow->lv_niovecs;
/* reset the lv buffer information for new formatting */
- lv->lv_buf_len = 0;
+ lv->lv_buf_used = 0;
lv->lv_bytes = 0;
lv->lv_buf = (char *)lv +
xlog_cil_iovec_space(lv->lv_niovecs);
@@ -502,17 +499,11 @@ xlog_cil_insert_format_items(
/* switch to shadow buffer! */
lv = shadow;
lv->lv_item = lip;
- if (ordered) {
- /* track as an ordered logvec */
- ASSERT(lip->li_lv == NULL);
- goto insert;
- }
}
ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
lip->li_ops->iop_format(lip, lv);
-insert:
- xfs_cil_prepare_item(log, lv, old_lv, diff_len);
+ xfs_cil_prepare_item(log, lip, lv, diff_len);
}
}
@@ -795,8 +786,10 @@ xlog_cil_ail_insert(
struct xfs_log_item *lip = lv->lv_item;
xfs_lsn_t item_lsn;
- if (aborted)
+ if (aborted) {
+ trace_xlog_ail_insert_abort(lip);
set_bit(XFS_LI_ABORTED, &lip->li_flags);
+ }
if (lip->li_ops->flags & XFS_ITEM_RELEASE_WHEN_COMMITTED) {
lip->li_ops->iop_release(lip);
@@ -947,7 +940,7 @@ xlog_cil_set_ctx_write_state(
struct xlog_in_core *iclog)
{
struct xfs_cil *cil = ctx->cil;
- xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header->h_lsn);
ASSERT(!ctx->commit_lsn);
if (!ctx->start_lsn) {
@@ -1245,7 +1238,7 @@ xlog_cil_build_lv_chain(
lv->lv_order_id = item->li_order_id;
/* we don't write ordered log vectors */
- if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
+ if (lv->lv_buf_used != XFS_LOG_VEC_ORDERED)
*num_bytes += lv->lv_bytes;
*num_iovecs += lv->lv_niovecs;
list_add_tail(&lv->lv_list, &ctx->lv_chain);
@@ -1465,9 +1458,9 @@ xlog_cil_push_work(
*/
spin_lock(&log->l_icloglock);
if (ctx->start_lsn != ctx->commit_lsn) {
- xfs_lsn_t plsn;
+ xfs_lsn_t plsn = be64_to_cpu(
+ ctx->commit_iclog->ic_prev->ic_header->h_lsn);
- plsn = be64_to_cpu(ctx->commit_iclog->ic_prev->ic_header.h_lsn);
if (plsn && XFS_LSN_CMP(plsn, ctx->commit_lsn) < 0) {
/*
* Waiting on ic_force_wait orders the completion of
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index f3d78869e5e5..0fe59f0525aa 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -144,7 +144,7 @@ enum xlog_iclog_state {
#define XLOG_COVER_OPS 5
-typedef struct xlog_ticket {
+struct xlog_ticket {
struct list_head t_queue; /* reserve/write queue */
struct task_struct *t_task; /* task that owns this ticket */
xlog_tid_t t_tid; /* transaction identifier */
@@ -155,13 +155,11 @@ typedef struct xlog_ticket {
char t_cnt; /* current unit count */
uint8_t t_flags; /* properties of reservation */
int t_iclog_hdrs; /* iclog hdrs in t_curr_res */
-} xlog_ticket_t;
+};
/*
- * - A log record header is 512 bytes. There is plenty of room to grow the
- * xlog_rec_header_t into the reserved space.
- * - ic_data follows, so a write to disk can start at the beginning of
- * the iclog.
+ * In-core log structure.
+ *
* - ic_forcewait is used to implement synchronous forcing of the iclog to disk.
* - ic_next is the pointer to the next iclog in the ring.
* - ic_log is a pointer back to the global log structure.
@@ -183,7 +181,7 @@ typedef struct xlog_ticket {
* We'll put all the read-only and l_icloglock fields in the first cacheline,
* and move everything else out to subsequent cachelines.
*/
-typedef struct xlog_in_core {
+struct xlog_in_core {
wait_queue_head_t ic_force_wait;
wait_queue_head_t ic_write_wait;
struct xlog_in_core *ic_next;
@@ -198,8 +196,7 @@ typedef struct xlog_in_core {
/* reference counts need their own cacheline */
atomic_t ic_refcnt ____cacheline_aligned_in_smp;
- xlog_in_core_2_t *ic_data;
-#define ic_header ic_data->hic_header
+ struct xlog_rec_header *ic_header;
#ifdef DEBUG
bool ic_fail_crc : 1;
#endif
@@ -207,7 +204,7 @@ typedef struct xlog_in_core {
struct work_struct ic_end_io_work;
struct bio ic_bio;
struct bio_vec ic_bvec[];
-} xlog_in_core_t;
+};
/*
* The CIL context is used to aggregate per-transaction details as well be
@@ -409,7 +406,6 @@ struct xlog {
struct list_head *l_buf_cancel_table;
struct list_head r_dfops; /* recovered log intent items */
int l_iclog_hsize; /* size of iclog header */
- int l_iclog_heads; /* # of iclog header sectors */
uint l_sectBBsize; /* sector size in BBs (2^n) */
int l_iclog_size; /* size of log in bytes */
int l_iclog_bufs; /* number of iclog buffers */
@@ -422,7 +418,7 @@ struct xlog {
/* waiting for iclog flush */
int l_covered_state;/* state of "covering disk
* log entries" */
- xlog_in_core_t *l_iclog; /* head log queue */
+ struct xlog_in_core *l_iclog; /* head log queue */
spinlock_t l_icloglock; /* grab to change iclog state */
int l_curr_cycle; /* Cycle number of log writes */
int l_prev_cycle; /* Cycle number before last
@@ -499,8 +495,8 @@ xlog_recover_finish(
extern void
xlog_recover_cancel(struct xlog *);
-extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
- char *dp, int size);
+__le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
+ char *dp, unsigned int hdrsize, unsigned int size);
extern struct kmem_cache *xfs_log_ticket_cache;
struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes,
@@ -698,4 +694,34 @@ xlog_kvmalloc(
return p;
}
+/*
+ * Given a count of iovecs and space for a log item, compute the space we need
+ * in the log to store that data plus the log headers.
+ */
+static inline unsigned int
+xlog_item_space(
+ unsigned int niovecs,
+ unsigned int nbytes)
+{
+ nbytes += niovecs * (sizeof(uint64_t) + sizeof(struct xlog_op_header));
+ return round_up(nbytes, sizeof(uint64_t));
+}
+
+/*
+ * Cycles over XLOG_CYCLE_DATA_SIZE overflow into the extended header that was
+ * added for v2 logs. Addressing for the cycles array there is off by one,
+ * because the first batch of cycles is in the original header.
+ */
+static inline __be32 *xlog_cycle_data(struct xlog_rec_header *rhead, unsigned i)
+{
+ if (i >= XLOG_CYCLE_DATA_SIZE) {
+ unsigned j = i / XLOG_CYCLE_DATA_SIZE;
+ unsigned k = i % XLOG_CYCLE_DATA_SIZE;
+
+ return &rhead->h_ext[j - 1].xh_cycle_data[k];
+ }
+
+ return &rhead->h_cycle_data[i];
+}
+
#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 2f76531842f8..03e42c7dab56 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -190,8 +190,8 @@ xlog_bwrite(
*/
STATIC void
xlog_header_check_dump(
- xfs_mount_t *mp,
- xlog_rec_header_t *head)
+ struct xfs_mount *mp,
+ struct xlog_rec_header *head)
{
xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d",
__func__, &mp->m_sb.sb_uuid, XLOG_FMT);
@@ -207,8 +207,8 @@ xlog_header_check_dump(
*/
STATIC int
xlog_header_check_recover(
- xfs_mount_t *mp,
- xlog_rec_header_t *head)
+ struct xfs_mount *mp,
+ struct xlog_rec_header *head)
{
ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
@@ -238,8 +238,8 @@ xlog_header_check_recover(
*/
STATIC int
xlog_header_check_mount(
- xfs_mount_t *mp,
- xlog_rec_header_t *head)
+ struct xfs_mount *mp,
+ struct xlog_rec_header *head)
{
ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
@@ -400,7 +400,7 @@ xlog_find_verify_log_record(
xfs_daddr_t i;
char *buffer;
char *offset = NULL;
- xlog_rec_header_t *head = NULL;
+ struct xlog_rec_header *head = NULL;
int error = 0;
int smallmem = 0;
int num_blks = *last_blk - start_blk;
@@ -437,7 +437,7 @@ xlog_find_verify_log_record(
goto out;
}
- head = (xlog_rec_header_t *)offset;
+ head = (struct xlog_rec_header *)offset;
if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
break;
@@ -1237,7 +1237,7 @@ xlog_find_tail(
xfs_daddr_t *head_blk,
xfs_daddr_t *tail_blk)
{
- xlog_rec_header_t *rhead;
+ struct xlog_rec_header *rhead;
char *offset = NULL;
char *buffer;
int error;
@@ -1487,7 +1487,7 @@ xlog_add_record(
int tail_cycle,
int tail_block)
{
- xlog_rec_header_t *recp = (xlog_rec_header_t *)buf;
+ struct xlog_rec_header *recp = (struct xlog_rec_header *)buf;
memset(buf, 0, BBSIZE);
recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
@@ -2131,15 +2131,15 @@ xlog_recover_add_to_cont_trans(
item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
ri_list);
- old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
- old_len = item->ri_buf[item->ri_cnt-1].i_len;
+ old_ptr = item->ri_buf[item->ri_cnt-1].iov_base;
+ old_len = item->ri_buf[item->ri_cnt-1].iov_len;
ptr = kvrealloc(old_ptr, len + old_len, GFP_KERNEL);
if (!ptr)
return -ENOMEM;
memcpy(&ptr[old_len], dp, len);
- item->ri_buf[item->ri_cnt-1].i_len += len;
- item->ri_buf[item->ri_cnt-1].i_addr = ptr;
+ item->ri_buf[item->ri_cnt-1].iov_len += len;
+ item->ri_buf[item->ri_cnt-1].iov_base = ptr;
trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
return 0;
}
@@ -2223,7 +2223,7 @@ xlog_recover_add_to_trans(
}
item->ri_total = in_f->ilf_size;
- item->ri_buf = kzalloc(item->ri_total * sizeof(xfs_log_iovec_t),
+ item->ri_buf = kcalloc(item->ri_total, sizeof(*item->ri_buf),
GFP_KERNEL | __GFP_NOFAIL);
}
@@ -2237,8 +2237,8 @@ xlog_recover_add_to_trans(
}
/* Description region is ri_buf[0] */
- item->ri_buf[item->ri_cnt].i_addr = ptr;
- item->ri_buf[item->ri_cnt].i_len = len;
+ item->ri_buf[item->ri_cnt].iov_base = ptr;
+ item->ri_buf[item->ri_cnt].iov_len = len;
item->ri_cnt++;
trace_xfs_log_recover_item_add(log, trans, item, 0);
return 0;
@@ -2262,7 +2262,7 @@ xlog_recover_free_trans(
/* Free the regions in the item. */
list_del(&item->ri_list);
for (i = 0; i < item->ri_cnt; i++)
- kvfree(item->ri_buf[i].i_addr);
+ kvfree(item->ri_buf[i].iov_base);
/* Free the item itself */
kfree(item->ri_buf);
kfree(item);
@@ -2863,23 +2863,12 @@ xlog_unpack_data(
char *dp,
struct xlog *log)
{
- int i, j, k;
+ int i;
- for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
- i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
- *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
+ for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
+ *(__be32 *)dp = *xlog_cycle_data(rhead, i);
dp += BBSIZE;
}
-
- if (xfs_has_logv2(log->l_mp)) {
- xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
- for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
- j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
- k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
- *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
- dp += BBSIZE;
- }
- }
}
/*
@@ -2894,20 +2883,34 @@ xlog_recover_process(
int pass,
struct list_head *buffer_list)
{
- __le32 old_crc = rhead->h_crc;
- __le32 crc;
+ __le32 expected_crc = rhead->h_crc, crc, other_crc;
+
+ crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE,
+ be32_to_cpu(rhead->h_len));
- crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
+ /*
+ * Look at the end of the struct xlog_rec_header definition in
+ * xfs_log_format.h for the gory details.
+ */
+ if (expected_crc && crc != expected_crc) {
+ other_crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE_OTHER,
+ be32_to_cpu(rhead->h_len));
+ if (other_crc == expected_crc) {
+ xfs_notice_once(log->l_mp,
+ "Fixing up incorrect CRC due to padding.");
+ crc = other_crc;
+ }
+ }
/*
* Nothing else to do if this is a CRC verification pass. Just return
* if this a record with a non-zero crc. Unfortunately, mkfs always
- * sets old_crc to 0 so we must consider this valid even on v5 supers.
- * Otherwise, return EFSBADCRC on failure so the callers up the stack
- * know precisely what failed.
+ * sets expected_crc to 0 so we must consider this valid even on v5
+ * supers. Otherwise, return EFSBADCRC on failure so the callers up the
+ * stack know precisely what failed.
*/
if (pass == XLOG_RECOVER_CRCPASS) {
- if (old_crc && crc != old_crc)
+ if (expected_crc && crc != expected_crc)
return -EFSBADCRC;
return 0;
}
@@ -2918,11 +2921,11 @@ xlog_recover_process(
* zero CRC check prevents warnings from being emitted when upgrading
* the kernel from one that does not add CRCs by default.
*/
- if (crc != old_crc) {
- if (old_crc || xfs_has_crc(log->l_mp)) {
+ if (crc != expected_crc) {
+ if (expected_crc || xfs_has_crc(log->l_mp)) {
xfs_alert(log->l_mp,
"log record CRC mismatch: found 0x%x, expected 0x%x.",
- le32_to_cpu(old_crc),
+ le32_to_cpu(expected_crc),
le32_to_cpu(crc));
xfs_hex_dump(dp, 32);
}
@@ -2994,7 +2997,7 @@ xlog_do_recovery_pass(
int pass,
xfs_daddr_t *first_bad) /* out: first bad log rec */
{
- xlog_rec_header_t *rhead;
+ struct xlog_rec_header *rhead;
xfs_daddr_t blk_no, rblk_no;
xfs_daddr_t rhead_blk;
char *offset;
@@ -3031,7 +3034,7 @@ xlog_do_recovery_pass(
if (error)
goto bread_err1;
- rhead = (xlog_rec_header_t *)offset;
+ rhead = (struct xlog_rec_header *)offset;
/*
* xfsprogs has a bug where record length is based on lsunit but
@@ -3138,7 +3141,7 @@ xlog_do_recovery_pass(
if (error)
goto bread_err2;
}
- rhead = (xlog_rec_header_t *)offset;
+ rhead = (struct xlog_rec_header *)offset;
error = xlog_valid_rec_header(log, rhead,
split_hblks ? blk_no : 0, h_size);
if (error)
@@ -3220,7 +3223,7 @@ xlog_do_recovery_pass(
if (error)
goto bread_err2;
- rhead = (xlog_rec_header_t *)offset;
+ rhead = (struct xlog_rec_header *)offset;
error = xlog_valid_rec_header(log, rhead, blk_no, h_size);
if (error)
goto bread_err2;
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index 15d410d16bb2..19aba2c3d525 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -141,14 +141,6 @@ xfs_warn_experimental(
const char *name;
long opstate;
} features[] = {
- [XFS_EXPERIMENTAL_PNFS] = {
- .opstate = XFS_OPSTATE_WARNED_PNFS,
- .name = "pNFS",
- },
- [XFS_EXPERIMENTAL_SCRUB] = {
- .opstate = XFS_OPSTATE_WARNED_SCRUB,
- .name = "online scrub",
- },
[XFS_EXPERIMENTAL_SHRINK] = {
.opstate = XFS_OPSTATE_WARNED_SHRINK,
.name = "online shrink",
@@ -161,14 +153,6 @@ xfs_warn_experimental(
.opstate = XFS_OPSTATE_WARNED_LBS,
.name = "large block size",
},
- [XFS_EXPERIMENTAL_EXCHRANGE] = {
- .opstate = XFS_OPSTATE_WARNED_EXCHRANGE,
- .name = "exchange range",
- },
- [XFS_EXPERIMENTAL_PPTR] = {
- .opstate = XFS_OPSTATE_WARNED_PPTR,
- .name = "parent pointer",
- },
[XFS_EXPERIMENTAL_METADIR] = {
.opstate = XFS_OPSTATE_WARNED_METADIR,
.name = "metadata directory tree",
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index a92a4d09c8e9..d68e72379f9d 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -91,13 +91,9 @@ void xfs_buf_alert_ratelimited(struct xfs_buf *bp, const char *rlmsg,
const char *fmt, ...);
enum xfs_experimental_feat {
- XFS_EXPERIMENTAL_PNFS,
- XFS_EXPERIMENTAL_SCRUB,
XFS_EXPERIMENTAL_SHRINK,
XFS_EXPERIMENTAL_LARP,
XFS_EXPERIMENTAL_LBS,
- XFS_EXPERIMENTAL_EXCHRANGE,
- XFS_EXPERIMENTAL_PPTR,
XFS_EXPERIMENTAL_METADIR,
XFS_EXPERIMENTAL_ZONED,
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 00b53f479ece..0953f6ae94ab 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -171,19 +171,16 @@ xfs_readsb(
ASSERT(mp->m_ddev_targp != NULL);
/*
- * For the initial read, we must guess at the sector
- * size based on the block device. It's enough to
- * get the sb_sectsize out of the superblock and
- * then reread with the proper length.
- * We don't verify it yet, because it may not be complete.
+ * In the first pass, use the device sector size to just read enough
+ * of the superblock to extract the XFS sector size.
+ *
+ * The device sector size must be smaller than or equal to the XFS
+ * sector size and thus we can always read the superblock. Once we know
+ * the XFS sector size, re-read it and run the buffer verifier.
*/
- sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
+ sector_size = mp->m_ddev_targp->bt_logical_sectorsize;
buf_ops = NULL;
- /*
- * Allocate a (locked) buffer to hold the superblock. This will be kept
- * around at all times to optimize access to the superblock.
- */
reread:
error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
BTOBB(sector_size), &bp, buf_ops);
@@ -247,6 +244,10 @@ reread:
/* no need to be quiet anymore, so reset the buf ops */
bp->b_ops = &xfs_sb_buf_ops;
+ /*
+ * Keep a pointer to the sb buffer around instead of caching it in the
+ * buffer cache because we access it frequently.
+ */
mp->m_sb_bp = bp;
xfs_buf_unlock(bp);
return 0;
@@ -666,6 +667,152 @@ xfs_agbtree_compute_maxlevels(
mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
}
+/* Maximum atomic write IO size that the kernel allows. */
+static inline xfs_extlen_t xfs_calc_atomic_write_max(struct xfs_mount *mp)
+{
+ return rounddown_pow_of_two(XFS_B_TO_FSB(mp, MAX_RW_COUNT));
+}
+
+/*
+ * If the underlying device advertises atomic write support, limit the size of
+ * atomic writes to the greatest power-of-two factor of the group size so
+ * that every atomic write unit aligns with the start of every group. This is
+ * required so that the allocations for an atomic write will always be
+ * aligned compatibly with the alignment requirements of the storage.
+ *
+ * If the device doesn't advertise atomic writes, then there are no alignment
+ * restrictions and the largest out-of-place write we can do ourselves is the
+ * number of blocks that user files can allocate from any group.
+ */
+static xfs_extlen_t
+xfs_calc_group_awu_max(
+ struct xfs_mount *mp,
+ enum xfs_group_type type)
+{
+ struct xfs_groups *g = &mp->m_groups[type];
+ struct xfs_buftarg *btp = xfs_group_type_buftarg(mp, type);
+
+ if (g->blocks == 0)
+ return 0;
+ if (btp && btp->bt_awu_min > 0)
+ return max_pow_of_two_factor(g->blocks);
+ return rounddown_pow_of_two(g->blocks);
+}
+
+/* Compute the maximum atomic write unit size for each section. */
+static inline void
+xfs_calc_atomic_write_unit_max(
+ struct xfs_mount *mp,
+ enum xfs_group_type type)
+{
+ struct xfs_groups *g = &mp->m_groups[type];
+
+ const xfs_extlen_t max_write = xfs_calc_atomic_write_max(mp);
+ const xfs_extlen_t max_ioend = xfs_reflink_max_atomic_cow(mp);
+ const xfs_extlen_t max_gsize = xfs_calc_group_awu_max(mp, type);
+
+ g->awu_max = min3(max_write, max_ioend, max_gsize);
+ trace_xfs_calc_atomic_write_unit_max(mp, type, max_write, max_ioend,
+ max_gsize, g->awu_max);
+}
+
+/*
+ * Try to set the atomic write maximum to a new value that we got from
+ * userspace via mount option.
+ */
+int
+xfs_set_max_atomic_write_opt(
+ struct xfs_mount *mp,
+ unsigned long long new_max_bytes)
+{
+ const xfs_filblks_t new_max_fsbs = XFS_B_TO_FSBT(mp, new_max_bytes);
+ const xfs_extlen_t max_write = xfs_calc_atomic_write_max(mp);
+ const xfs_extlen_t max_group =
+ max(mp->m_groups[XG_TYPE_AG].blocks,
+ mp->m_groups[XG_TYPE_RTG].blocks);
+ const xfs_extlen_t max_group_write =
+ max(xfs_calc_group_awu_max(mp, XG_TYPE_AG),
+ xfs_calc_group_awu_max(mp, XG_TYPE_RTG));
+ int error;
+
+ if (new_max_bytes == 0)
+ goto set_limit;
+
+ ASSERT(max_write <= U32_MAX);
+
+ /* generic_atomic_write_valid enforces power of two length */
+ if (!is_power_of_2(new_max_bytes)) {
+ xfs_warn(mp,
+ "max atomic write size of %llu bytes is not a power of 2",
+ new_max_bytes);
+ return -EINVAL;
+ }
+
+ if (new_max_bytes & mp->m_blockmask) {
+ xfs_warn(mp,
+ "max atomic write size of %llu bytes not aligned with fsblock",
+ new_max_bytes);
+ return -EINVAL;
+ }
+
+ if (new_max_fsbs > max_write) {
+ xfs_warn(mp,
+ "max atomic write size of %lluk cannot be larger than max write size %lluk",
+ new_max_bytes >> 10,
+ XFS_FSB_TO_B(mp, max_write) >> 10);
+ return -EINVAL;
+ }
+
+ if (new_max_fsbs > max_group) {
+ xfs_warn(mp,
+ "max atomic write size of %lluk cannot be larger than allocation group size %lluk",
+ new_max_bytes >> 10,
+ XFS_FSB_TO_B(mp, max_group) >> 10);
+ return -EINVAL;
+ }
+
+ if (new_max_fsbs > max_group_write) {
+ xfs_warn(mp,
+ "max atomic write size of %lluk cannot be larger than max allocation group write size %lluk",
+ new_max_bytes >> 10,
+ XFS_FSB_TO_B(mp, max_group_write) >> 10);
+ return -EINVAL;
+ }
+
+ if (xfs_has_reflink(mp))
+ goto set_limit;
+
+ if (new_max_fsbs == 1) {
+ if (mp->m_ddev_targp->bt_awu_max ||
+ (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_awu_max)) {
+ } else {
+ xfs_warn(mp,
+ "cannot support atomic writes of size %lluk with no reflink or HW support",
+ new_max_bytes >> 10);
+ return -EINVAL;
+ }
+ } else {
+ xfs_warn(mp,
+ "cannot support atomic writes of size %lluk with no reflink support",
+ new_max_bytes >> 10);
+ return -EINVAL;
+ }
+
+set_limit:
+ error = xfs_calc_atomic_write_reservation(mp, new_max_fsbs);
+ if (error) {
+ xfs_warn(mp,
+ "cannot support completing atomic writes of %lluk",
+ new_max_bytes >> 10);
+ return error;
+ }
+
+ xfs_calc_atomic_write_unit_max(mp, XG_TYPE_AG);
+ xfs_calc_atomic_write_unit_max(mp, XG_TYPE_RTG);
+ mp->m_awu_max_bytes = new_max_bytes;
+ return 0;
+}
+
/* Compute maximum possible height for realtime btree types for this fs. */
static inline void
xfs_rtbtree_compute_maxlevels(
@@ -910,19 +1057,6 @@ xfs_mountfs(
xfs_inodegc_start(mp);
xfs_blockgc_start(mp);
- /*
- * Now that we've recovered any pending superblock feature bit
- * additions, we can finish setting up the attr2 behaviour for the
- * mount. The noattr2 option overrides the superblock flag, so only
- * check the superblock feature flag if the mount option is not set.
- */
- if (xfs_has_noattr2(mp)) {
- mp->m_features &= ~XFS_FEAT_ATTR2;
- } else if (!xfs_has_attr2(mp) &&
- (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)) {
- mp->m_features |= XFS_FEAT_ATTR2;
- }
-
if (xfs_has_metadir(mp)) {
error = xfs_mount_setup_metadir(mp);
if (error)
@@ -1082,6 +1216,15 @@ xfs_mountfs(
xfs_zone_gc_start(mp);
}
+ /*
+ * Pre-calculate atomic write unit max. This involves computations
+ * derived from transaction reservations, so we must do this after the
+ * log is fully initialized.
+ */
+ error = xfs_set_max_atomic_write_opt(mp, mp->m_awu_max_bytes);
+ if (error)
+ goto out_agresv;
+
return 0;
out_agresv:
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 799b84220ebb..b871dfde372b 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -119,6 +119,12 @@ struct xfs_groups {
* SMR hard drives.
*/
xfs_fsblock_t start_fsb;
+
+ /*
+ * Maximum length of an atomic write for files stored in this
+ * collection of allocation groups, in fsblocks.
+ */
+ xfs_extlen_t awu_max;
};
struct xfs_freecounter {
@@ -229,6 +235,10 @@ typedef struct xfs_mount {
bool m_finobt_nores; /* no per-AG finobt resv. */
bool m_update_sb; /* sb needs update in mount */
unsigned int m_max_open_zones;
+ unsigned int m_zonegc_low_space;
+
+ /* max_atomic_write mount option value */
+ unsigned long long m_awu_max_bytes;
/*
* Bitsets of per-fs metadata that have been checked and/or are sick.
@@ -352,7 +362,6 @@ typedef struct xfs_mount {
#define XFS_FEAT_EXTFLG (1ULL << 7) /* unwritten extents */
#define XFS_FEAT_ASCIICI (1ULL << 8) /* ASCII only case-insens. */
#define XFS_FEAT_LAZYSBCOUNT (1ULL << 9) /* Superblk counters */
-#define XFS_FEAT_ATTR2 (1ULL << 10) /* dynamic attr fork */
#define XFS_FEAT_PARENT (1ULL << 11) /* parent pointers */
#define XFS_FEAT_PROJID32 (1ULL << 12) /* 32 bit project id */
#define XFS_FEAT_CRC (1ULL << 13) /* metadata CRCs */
@@ -375,7 +384,6 @@ typedef struct xfs_mount {
/* Mount features */
#define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */
-#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
#define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */
#define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */
#define XFS_FEAT_LARGE_IOSIZE (1ULL << 51) /* report large preferred
@@ -385,7 +393,6 @@ typedef struct xfs_mount {
#define XFS_FEAT_DISCARD (1ULL << 54) /* discard unused blocks */
#define XFS_FEAT_GRPID (1ULL << 55) /* group-ID assigned from directory */
#define XFS_FEAT_SMALL_INUMS (1ULL << 56) /* user wants 32bit inodes */
-#define XFS_FEAT_IKEEP (1ULL << 57) /* keep empty inode clusters*/
#define XFS_FEAT_SWALLOC (1ULL << 58) /* stripe width allocation */
#define XFS_FEAT_FILESTREAMS (1ULL << 59) /* use filestreams allocator */
#define XFS_FEAT_DAX_ALWAYS (1ULL << 60) /* DAX always enabled */
@@ -463,6 +470,11 @@ static inline bool xfs_has_nonzoned(const struct xfs_mount *mp)
return !xfs_has_zoned(mp);
}
+static inline bool xfs_can_sw_atomic_write(struct xfs_mount *mp)
+{
+ return xfs_has_reflink(mp);
+}
+
/*
* Some features are always on for v5 file systems, allow the compiler to
* eliminiate dead code when building without v4 support.
@@ -488,12 +500,17 @@ __XFS_HAS_V4_FEAT(align, ALIGN)
__XFS_HAS_V4_FEAT(logv2, LOGV2)
__XFS_HAS_V4_FEAT(extflg, EXTFLG)
__XFS_HAS_V4_FEAT(lazysbcount, LAZYSBCOUNT)
-__XFS_ADD_V4_FEAT(attr2, ATTR2)
__XFS_ADD_V4_FEAT(projid32, PROJID32)
__XFS_HAS_V4_FEAT(v3inodes, V3INODES)
__XFS_HAS_V4_FEAT(crc, CRC)
__XFS_HAS_V4_FEAT(pquotino, PQUOTINO)
+static inline void xfs_add_attr2(struct xfs_mount *mp)
+{
+ if (IS_ENABLED(CONFIG_XFS_SUPPORT_V4)) /* does nothing without V4 support */
+ xfs_sb_version_addattr2(&mp->m_sb); /* acts on the mount's m_sb only */
+}
+
/*
* Mount features
*
@@ -501,7 +518,6 @@ __XFS_HAS_V4_FEAT(pquotino, PQUOTINO)
* bit inodes and read-only state, are kept as operational state rather than
* features.
*/
-__XFS_HAS_FEAT(noattr2, NOATTR2)
__XFS_HAS_FEAT(noalign, NOALIGN)
__XFS_HAS_FEAT(allocsize, ALLOCSIZE)
__XFS_HAS_FEAT(large_iosize, LARGE_IOSIZE)
@@ -510,7 +526,6 @@ __XFS_HAS_FEAT(dirsync, DIRSYNC)
__XFS_HAS_FEAT(discard, DISCARD)
__XFS_HAS_FEAT(grpid, GRPID)
__XFS_HAS_FEAT(small_inums, SMALL_INUMS)
-__XFS_HAS_FEAT(ikeep, IKEEP)
__XFS_HAS_FEAT(swalloc, SWALLOC)
__XFS_HAS_FEAT(filestreams, FILESTREAMS)
__XFS_HAS_FEAT(dax_always, DAX_ALWAYS)
@@ -542,10 +557,6 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
*/
#define XFS_OPSTATE_BLOCKGC_ENABLED 6
-/* Kernel has logged a warning about pNFS being used on this fs. */
-#define XFS_OPSTATE_WARNED_PNFS 7
-/* Kernel has logged a warning about online fsck being used on this fs. */
-#define XFS_OPSTATE_WARNED_SCRUB 8
/* Kernel has logged a warning about shrink being used on this fs. */
#define XFS_OPSTATE_WARNED_SHRINK 9
/* Kernel has logged a warning about logged xattr updates being used. */
@@ -558,10 +569,6 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
#define XFS_OPSTATE_USE_LARP 13
/* Kernel has logged a warning about blocksize > pagesize on this fs. */
#define XFS_OPSTATE_WARNED_LBS 14
-/* Kernel has logged a warning about exchange-range being used on this fs. */
-#define XFS_OPSTATE_WARNED_EXCHRANGE 15
-/* Kernel has logged a warning about parent pointers being used on this fs. */
-#define XFS_OPSTATE_WARNED_PPTR 16
/* Kernel has logged a warning about metadata dirs being used on this fs. */
#define XFS_OPSTATE_WARNED_METADIR 17
/* Filesystem should use qflags to determine quotaon status */
@@ -630,7 +637,6 @@ xfs_should_warn(struct xfs_mount *mp, long nr)
{ (1UL << XFS_OPSTATE_READONLY), "read_only" }, \
{ (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" }, \
{ (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \
- { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \
{ (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \
{ (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \
{ (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" }, \
@@ -792,4 +798,24 @@ static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta)
percpu_counter_add(&mp->m_delalloc_blks, delta);
}
+int xfs_set_max_atomic_write_opt(struct xfs_mount *mp,
+ unsigned long long new_max_bytes);
+
+static inline struct xfs_buftarg *
+xfs_group_type_buftarg(
+ struct xfs_mount *mp,
+ enum xfs_group_type type)
+{
+ switch (type) { /* pick the mount's buftarg for this group type */
+ case XG_TYPE_AG:
+ return mp->m_ddev_targp;
+ case XG_TYPE_RTG:
+ return mp->m_rtdev_targp;
+ default:
+ ASSERT(0); /* any other group type is unexpected here */
+ break;
+ }
+ return NULL; /* only reached for an unhandled group type */
+}
+
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index d0f5b403bdbe..73b7e72944e4 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -293,7 +293,8 @@ int
xfs_mru_cache_init(void)
{
xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache",
- XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 1);
+ XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU),
+ 1);
if (!xfs_mru_reap_wq)
return -ENOMEM;
return 0;
@@ -320,7 +321,7 @@ xfs_mru_cache_create(
xfs_mru_cache_free_func_t free_func)
{
struct xfs_mru_cache *mru = NULL;
- int err = 0, grp;
+ int grp;
unsigned int grp_time;
if (mrup)
@@ -341,8 +342,8 @@ xfs_mru_cache_create(
mru->lists = kzalloc(mru->grp_count * sizeof(*mru->lists),
GFP_KERNEL | __GFP_NOFAIL);
if (!mru->lists) {
- err = -ENOMEM;
- goto exit;
+ kfree(mru);
+ return -ENOMEM;
}
for (grp = 0; grp < mru->grp_count; grp++)
@@ -361,14 +362,7 @@ xfs_mru_cache_create(
mru->free_func = free_func;
mru->data = data;
*mrup = mru;
-
-exit:
- if (err && mru && mru->lists)
- kfree(mru->lists);
- if (err && mru)
- kfree(mru);
-
- return err;
+ return 0;
}
/*
@@ -414,6 +408,8 @@ xfs_mru_cache_destroy(
* To insert an element, call xfs_mru_cache_insert() with the data store, the
* element's key and the client data pointer. This function returns 0 on
* success or ENOMEM if memory for the data element couldn't be allocated.
+ *
+ * The passed in elem is freed through the per-cache free_func on failure.
*/
int
xfs_mru_cache_insert(
@@ -421,14 +417,11 @@ xfs_mru_cache_insert(
unsigned long key,
struct xfs_mru_cache_elem *elem)
{
- int error;
-
- ASSERT(mru && mru->lists);
- if (!mru || !mru->lists)
- return -EINVAL;
+ int error = -EINVAL;
+ error = -ENOMEM;
if (radix_tree_preload(GFP_KERNEL))
- return -ENOMEM;
+ goto out_free;
INIT_LIST_HEAD(&elem->list_node);
elem->key = key;
@@ -440,6 +433,12 @@ xfs_mru_cache_insert(
_xfs_mru_cache_list_insert(mru, elem);
spin_unlock(&mru->lock);
+ if (error)
+ goto out_free;
+ return 0;
+
+out_free:
+ mru->free_func(mru->data, elem);
return error;
}
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index ed8d8ed42f0a..b17672889942 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -127,7 +127,7 @@ xfs_dax_notify_failure_freeze(
struct super_block *sb = mp->m_super;
int error;
- error = freeze_super(sb, FREEZE_HOLDER_KERNEL);
+ error = freeze_super(sb, FREEZE_HOLDER_KERNEL, NULL);
if (error)
xfs_emerg(mp, "already frozen by kernel, err=%d", error);
@@ -143,7 +143,7 @@ xfs_dax_notify_failure_thaw(
int error;
if (kernel_frozen) {
- error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
+ error = thaw_super(sb, FREEZE_HOLDER_KERNEL, NULL);
if (error)
xfs_emerg(mp, "still frozen after notify failure, err=%d",
error);
@@ -153,7 +153,7 @@ xfs_dax_notify_failure_thaw(
* Also thaw userspace call anyway because the device is about to be
* removed immediately.
*/
- thaw_super(sb, FREEZE_HOLDER_USERSPACE);
+ thaw_super(sb, FREEZE_HOLDER_USERSPACE, NULL);
}
static int
@@ -165,7 +165,7 @@ xfs_dax_translate_range(
uint64_t *bblen)
{
u64 dev_start = btp->bt_dax_part_off;
- u64 dev_len = bdev_nr_bytes(btp->bt_bdev);
+ u64 dev_len = BBTOB(btp->bt_nr_sectors);
u64 dev_end = dev_start + dev_len - 1;
/* Notify failure on the whole device. */
@@ -253,8 +253,7 @@ xfs_dax_notify_dev_failure(
return -EOPNOTSUPP;
}
- error = xfs_dax_translate_range(type == XG_TYPE_RTG ?
- mp->m_rtdev_targp : mp->m_ddev_targp,
+ error = xfs_dax_translate_range(xfs_group_type_buftarg(mp, type),
offset, len, &daddr, &bblen);
if (error)
return error;
@@ -280,10 +279,7 @@ xfs_dax_notify_dev_failure(
kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0;
}
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- goto out;
-
+ tp = xfs_trans_alloc_empty(mp);
start_gno = xfs_fsb_to_gno(mp, start_bno, type);
end_gno = xfs_fsb_to_gno(mp, end_bno, type);
while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
@@ -354,7 +350,6 @@ xfs_dax_notify_dev_failure(
error = -EFSCORRUPTED;
}
-out:
/* Thaw the fs if it has been frozen before. */
if (mf_flags & MF_MEM_PRE_REMOVE)
xfs_dax_notify_failure_thaw(mp, kernel_frozen);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 6f4479deac6d..afe7497012d4 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -58,8 +58,6 @@ xfs_fs_get_uuid(
{
struct xfs_mount *mp = XFS_M(sb);
- xfs_warn_experimental(mp, XFS_EXPERIMENTAL_PNFS);
-
if (*len < sizeof(uuid_t))
return -EINVAL;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 417439b58785..95be67ac6eb4 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -126,14 +126,17 @@ xfs_qm_dqpurge(
void *data)
{
struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
- int error = -EAGAIN;
- xfs_dqlock(dqp);
- if ((dqp->q_flags & XFS_DQFLAG_FREEING) || dqp->q_nrefs != 0)
- goto out_unlock;
-
- dqp->q_flags |= XFS_DQFLAG_FREEING;
+ spin_lock(&dqp->q_lockref.lock);
+ if (dqp->q_lockref.count > 0 || __lockref_is_dead(&dqp->q_lockref)) {
+ spin_unlock(&dqp->q_lockref.lock);
+ return -EAGAIN;
+ }
+ lockref_mark_dead(&dqp->q_lockref);
+ spin_unlock(&dqp->q_lockref.lock);
+ mutex_lock(&dqp->q_qlock);
+ xfs_qm_dqunpin_wait(dqp);
xfs_dqflock(dqp);
/*
@@ -143,6 +146,7 @@ xfs_qm_dqpurge(
*/
if (XFS_DQ_IS_DIRTY(dqp)) {
struct xfs_buf *bp = NULL;
+ int error;
/*
* We don't care about getting disk errors here. We need
@@ -150,9 +154,9 @@ xfs_qm_dqpurge(
*/
error = xfs_dquot_use_attached_buf(dqp, &bp);
if (error == -EAGAIN) {
- xfs_dqfunlock(dqp);
- dqp->q_flags &= ~XFS_DQFLAG_FREEING;
- goto out_unlock;
+ /* resurrect the refcount from the dead. */
+ dqp->q_lockref.count = 0;
+ goto out_funlock;
}
if (!bp)
goto out_funlock;
@@ -176,7 +180,7 @@ out_funlock:
!test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags));
xfs_dqfunlock(dqp);
- xfs_dqunlock(dqp);
+ mutex_unlock(&dqp->q_qlock);
radix_tree_delete(xfs_dquot_tree(qi, xfs_dquot_type(dqp)), dqp->q_id);
qi->qi_dquots--;
@@ -191,10 +195,6 @@ out_funlock:
xfs_qm_dqdestroy(dqp);
return 0;
-
-out_unlock:
- xfs_dqunlock(dqp);
- return error;
}
/*
@@ -287,51 +287,6 @@ xfs_qm_unmount_quotas(
xfs_qm_destroy_quotainos(mp->m_quotainfo);
}
-STATIC int
-xfs_qm_dqattach_one(
- struct xfs_inode *ip,
- xfs_dqtype_t type,
- bool doalloc,
- struct xfs_dquot **IO_idqpp)
-{
- struct xfs_dquot *dqp;
- int error;
-
- xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
- error = 0;
-
- /*
- * See if we already have it in the inode itself. IO_idqpp is &i_udquot
- * or &i_gdquot. This made the code look weird, but made the logic a lot
- * simpler.
- */
- dqp = *IO_idqpp;
- if (dqp) {
- trace_xfs_dqattach_found(dqp);
- return 0;
- }
-
- /*
- * Find the dquot from somewhere. This bumps the reference count of
- * dquot and returns it locked. This can return ENOENT if dquot didn't
- * exist on disk and we didn't ask it to allocate; ESRCH if quotas got
- * turned off suddenly.
- */
- error = xfs_qm_dqget_inode(ip, type, doalloc, &dqp);
- if (error)
- return error;
-
- trace_xfs_dqattach_get(dqp);
-
- /*
- * dqget may have dropped and re-acquired the ilock, but it guarantees
- * that the dquot returned is the one that should go in the inode.
- */
- *IO_idqpp = dqp;
- xfs_dqunlock(dqp);
- return 0;
-}
-
static bool
xfs_qm_need_dqattach(
struct xfs_inode *ip)
@@ -371,7 +326,7 @@ xfs_qm_dqattach_locked(
ASSERT(!xfs_is_metadir_inode(ip));
if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) {
- error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_USER,
+ error = xfs_qm_dqget_inode(ip, XFS_DQTYPE_USER,
doalloc, &ip->i_udquot);
if (error)
goto done;
@@ -379,7 +334,7 @@ xfs_qm_dqattach_locked(
}
if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) {
- error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_GROUP,
+ error = xfs_qm_dqget_inode(ip, XFS_DQTYPE_GROUP,
doalloc, &ip->i_gdquot);
if (error)
goto done;
@@ -387,7 +342,7 @@ xfs_qm_dqattach_locked(
}
if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) {
- error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_PROJ,
+ error = xfs_qm_dqget_inode(ip, XFS_DQTYPE_PROJ,
doalloc, &ip->i_pdquot);
if (error)
goto done;
@@ -465,8 +420,9 @@ xfs_qm_dquot_isolate(
struct xfs_dquot *dqp = container_of(item,
struct xfs_dquot, q_lru);
struct xfs_qm_isolate *isol = arg;
+ enum lru_status ret = LRU_SKIP;
- if (!xfs_dqlock_nowait(dqp))
+ if (!spin_trylock(&dqp->q_lockref.lock))
goto out_miss_busy;
/*
@@ -474,15 +430,24 @@ xfs_qm_dquot_isolate(
* from the LRU, leave it for the freeing task to complete the freeing
* process rather than risk it being free from under us here.
*/
- if (dqp->q_flags & XFS_DQFLAG_FREEING)
+ if (__lockref_is_dead(&dqp->q_lockref))
+ goto out_miss_unlock;
+
+ /*
+ * If the dquot is pinned or dirty, rotate it to the end of the LRU to
+ * give some time for it to be cleaned before we try to isolate it
+ * again.
+ */
+ ret = LRU_ROTATE;
+ if (XFS_DQ_IS_DIRTY(dqp) || atomic_read(&dqp->q_pincount) > 0)
goto out_miss_unlock;
/*
* This dquot has acquired a reference in the meantime remove it from
* the freelist and try again.
*/
- if (dqp->q_nrefs) {
- xfs_dqunlock(dqp);
+ if (dqp->q_lockref.count) {
+ spin_unlock(&dqp->q_lockref.lock);
XFS_STATS_INC(dqp->q_mount, xs_qm_dqwants);
trace_xfs_dqreclaim_want(dqp);
@@ -492,51 +457,23 @@ xfs_qm_dquot_isolate(
}
/*
- * If the dquot is dirty, flush it. If it's already being flushed, just
- * skip it so there is time for the IO to complete before we try to
- * reclaim it again on the next LRU pass.
+ * The dquot may still be under IO, in which case the flush lock will be
+ * held. If we can't get the flush lock now, just skip over the dquot as
+ * if it was dirty.
*/
if (!xfs_dqflock_nowait(dqp))
goto out_miss_unlock;
- if (XFS_DQ_IS_DIRTY(dqp)) {
- struct xfs_buf *bp = NULL;
- int error;
-
- trace_xfs_dqreclaim_dirty(dqp);
-
- /* we have to drop the LRU lock to flush the dquot */
- spin_unlock(&lru->lock);
-
- error = xfs_dquot_use_attached_buf(dqp, &bp);
- if (!bp || error == -EAGAIN) {
- xfs_dqfunlock(dqp);
- goto out_unlock_dirty;
- }
-
- /*
- * dqflush completes dqflock on error, and the delwri ioend
- * does it on success.
- */
- error = xfs_qm_dqflush(dqp, bp);
- if (error)
- goto out_unlock_dirty;
-
- xfs_buf_delwri_queue(bp, &isol->buffers);
- xfs_buf_relse(bp);
- goto out_unlock_dirty;
- }
-
+ ASSERT(!XFS_DQ_IS_DIRTY(dqp));
xfs_dquot_detach_buf(dqp);
xfs_dqfunlock(dqp);
/*
* Prevent lookups now that we are past the point of no return.
*/
- dqp->q_flags |= XFS_DQFLAG_FREEING;
- xfs_dqunlock(dqp);
+ lockref_mark_dead(&dqp->q_lockref);
+ spin_unlock(&dqp->q_lockref.lock);
- ASSERT(dqp->q_nrefs == 0);
list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose);
XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
trace_xfs_dqreclaim_done(dqp);
@@ -544,17 +481,11 @@ xfs_qm_dquot_isolate(
return LRU_REMOVED;
out_miss_unlock:
- xfs_dqunlock(dqp);
+ spin_unlock(&dqp->q_lockref.lock);
out_miss_busy:
trace_xfs_dqreclaim_busy(dqp);
XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
- return LRU_SKIP;
-
-out_unlock_dirty:
- trace_xfs_dqreclaim_busy(dqp);
- XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
- xfs_dqunlock(dqp);
- return LRU_RETRY;
+ return ret;
}
static unsigned long
@@ -681,10 +612,7 @@ xfs_qm_load_metadir_qinos(
struct xfs_trans *tp;
int error;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
-
+ tp = xfs_trans_alloc_empty(mp);
error = xfs_dqinode_load_parent(tp, &qi->qi_dirip);
if (error == -ENOENT) {
/* no quota dir directory, but we'll create one later */
@@ -1340,9 +1268,10 @@ xfs_qm_quotacheck_dqadjust(
return error;
}
+ mutex_lock(&dqp->q_qlock);
error = xfs_dquot_attach_buf(NULL, dqp);
if (error)
- return error;
+ goto out_unlock;
trace_xfs_dqadjust(dqp);
@@ -1372,8 +1301,10 @@ xfs_qm_quotacheck_dqadjust(
}
dqp->q_flags |= XFS_DQFLAG_DIRTY;
- xfs_qm_dqput(dqp);
- return 0;
+out_unlock:
+ mutex_unlock(&dqp->q_qlock);
+ xfs_qm_dqrele(dqp);
+ return error;
}
/*
@@ -1486,45 +1417,19 @@ xfs_qm_flush_one(
struct xfs_dquot *dqp,
void *data)
{
- struct xfs_mount *mp = dqp->q_mount;
struct list_head *buffer_list = data;
struct xfs_buf *bp = NULL;
int error = 0;
- xfs_dqlock(dqp);
- if (dqp->q_flags & XFS_DQFLAG_FREEING)
- goto out_unlock;
+ if (!lockref_get_not_dead(&dqp->q_lockref))
+ return 0;
+
+ mutex_lock(&dqp->q_qlock);
if (!XFS_DQ_IS_DIRTY(dqp))
goto out_unlock;
- /*
- * The only way the dquot is already flush locked by the time quotacheck
- * gets here is if reclaim flushed it before the dqadjust walk dirtied
- * it for the final time. Quotacheck collects all dquot bufs in the
- * local delwri queue before dquots are dirtied, so reclaim can't have
- * possibly queued it for I/O. The only way out is to push the buffer to
- * cycle the flush lock.
- */
- if (!xfs_dqflock_nowait(dqp)) {
- /* buf is pinned in-core by delwri list */
- error = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen, 0, &bp);
- if (error)
- goto out_unlock;
-
- if (!(bp->b_flags & _XBF_DELWRI_Q)) {
- error = -EAGAIN;
- xfs_buf_relse(bp);
- goto out_unlock;
- }
- xfs_buf_unlock(bp);
-
- xfs_buf_delwri_pushbuf(bp, buffer_list);
- xfs_buf_rele(bp);
-
- error = -EAGAIN;
- goto out_unlock;
- }
+ xfs_qm_dqunpin_wait(dqp);
+ xfs_dqflock(dqp);
error = xfs_dquot_use_attached_buf(dqp, &bp);
if (error)
@@ -1539,7 +1444,8 @@ xfs_qm_flush_one(
xfs_buf_delwri_queue(bp, buffer_list);
xfs_buf_relse(bp);
out_unlock:
- xfs_dqunlock(dqp);
+ mutex_unlock(&dqp->q_qlock);
+ xfs_qm_dqrele(dqp);
return error;
}
@@ -1803,10 +1709,7 @@ xfs_qm_qino_load(
struct xfs_inode *dp = NULL;
int error;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
-
+ tp = xfs_trans_alloc_empty(mp);
if (xfs_has_metadir(mp)) {
error = xfs_dqinode_load_parent(tp, &dp);
if (error)
@@ -1958,16 +1861,12 @@ xfs_qm_vop_dqalloc(
struct xfs_dquot *gq = NULL;
struct xfs_dquot *pq = NULL;
int error;
- uint lockflags;
if (!XFS_IS_QUOTA_ON(mp))
return 0;
ASSERT(!xfs_is_metadir_inode(ip));
- lockflags = XFS_ILOCK_EXCL;
- xfs_ilock(ip, lockflags);
-
if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip))
gid = inode->i_gid;
@@ -1976,38 +1875,22 @@ xfs_qm_vop_dqalloc(
* if necessary. The dquot(s) will not be locked.
*/
if (XFS_NOT_DQATTACHED(mp, ip)) {
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_qm_dqattach_locked(ip, true);
- if (error) {
- xfs_iunlock(ip, lockflags);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
return error;
- }
}
if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
ASSERT(O_udqpp);
if (!uid_eq(inode->i_uid, uid)) {
- /*
- * What we need is the dquot that has this uid, and
- * if we send the inode to dqget, the uid of the inode
- * takes priority over what's sent in the uid argument.
- * We must unlock inode here before calling dqget if
- * we're not sending the inode, because otherwise
- * we'll deadlock by doing trans_reserve while
- * holding ilock.
- */
- xfs_iunlock(ip, lockflags);
error = xfs_qm_dqget(mp, from_kuid(user_ns, uid),
XFS_DQTYPE_USER, true, &uq);
if (error) {
ASSERT(error != -ENOENT);
return error;
}
- /*
- * Get the ilock in the right order.
- */
- xfs_dqunlock(uq);
- lockflags = XFS_ILOCK_SHARED;
- xfs_ilock(ip, lockflags);
} else {
/*
* Take an extra reference, because we'll return
@@ -2020,16 +1903,12 @@ xfs_qm_vop_dqalloc(
if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
ASSERT(O_gdqpp);
if (!gid_eq(inode->i_gid, gid)) {
- xfs_iunlock(ip, lockflags);
error = xfs_qm_dqget(mp, from_kgid(user_ns, gid),
XFS_DQTYPE_GROUP, true, &gq);
if (error) {
ASSERT(error != -ENOENT);
goto error_rele;
}
- xfs_dqunlock(gq);
- lockflags = XFS_ILOCK_SHARED;
- xfs_ilock(ip, lockflags);
} else {
ASSERT(ip->i_gdquot);
gq = xfs_qm_dqhold(ip->i_gdquot);
@@ -2038,16 +1917,12 @@ xfs_qm_vop_dqalloc(
if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
ASSERT(O_pdqpp);
if (ip->i_projid != prid) {
- xfs_iunlock(ip, lockflags);
error = xfs_qm_dqget(mp, prid,
XFS_DQTYPE_PROJ, true, &pq);
if (error) {
ASSERT(error != -ENOENT);
goto error_rele;
}
- xfs_dqunlock(pq);
- lockflags = XFS_ILOCK_SHARED;
- xfs_ilock(ip, lockflags);
} else {
ASSERT(ip->i_pdquot);
pq = xfs_qm_dqhold(ip->i_pdquot);
@@ -2055,7 +1930,6 @@ xfs_qm_vop_dqalloc(
}
trace_xfs_dquot_dqalloc(ip);
- xfs_iunlock(ip, lockflags);
if (O_udqpp)
*O_udqpp = uq;
else
@@ -2132,7 +2006,7 @@ xfs_qm_vop_chown(
* back now.
*/
tp->t_flags |= XFS_TRANS_DIRTY;
- xfs_dqlock(prevdq);
+ mutex_lock(&prevdq->q_qlock);
if (isrt) {
ASSERT(prevdq->q_rtb.reserved >= ip->i_delayed_blks);
prevdq->q_rtb.reserved -= ip->i_delayed_blks;
@@ -2140,7 +2014,7 @@ xfs_qm_vop_chown(
ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks);
prevdq->q_blk.reserved -= ip->i_delayed_blks;
}
- xfs_dqunlock(prevdq);
+ mutex_unlock(&prevdq->q_qlock);
/*
* Take an extra reference, because the inode is going to keep
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 35b64bc3a7a8..e88ed6ad0e65 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -57,7 +57,7 @@ struct xfs_quotainfo {
struct xfs_inode *qi_pquotaip; /* project quota inode */
struct xfs_inode *qi_dirip; /* quota metadir */
struct list_lru qi_lru;
- int qi_dquots;
+ uint64_t qi_dquots;
struct mutex qi_quotaofflock;/* to serialize quotaoff */
xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */
uint qi_dqperchunk; /* # ondisk dq in above chunk */
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 245d754f382a..edc0aef3cf34 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -73,8 +73,10 @@ xfs_qm_statvfs(
struct xfs_dquot *dqp;
if (!xfs_qm_dqget(mp, ip->i_projid, XFS_DQTYPE_PROJ, false, &dqp)) {
+ mutex_lock(&dqp->q_qlock);
xfs_fill_statvfs_from_dquot(statp, ip, dqp);
- xfs_qm_dqput(dqp);
+ mutex_unlock(&dqp->q_qlock);
+ xfs_qm_dqrele(dqp);
}
}
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 0c78f30fa4a3..022e2179c06b 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -303,13 +303,12 @@ xfs_qm_scall_setqlim(
}
defq = xfs_get_defquota(q, xfs_dquot_type(dqp));
- xfs_dqunlock(dqp);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_setqlim, 0, 0, 0, &tp);
if (error)
goto out_rele;
- xfs_dqlock(dqp);
+ mutex_lock(&dqp->q_qlock);
xfs_trans_dqjoin(tp, dqp);
/*
@@ -459,6 +458,7 @@ xfs_qm_scall_getquota(
* If everything's NULL, this dquot doesn't quite exist as far as
* our utility programs are concerned.
*/
+ mutex_lock(&dqp->q_qlock);
if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
error = -ENOENT;
goto out_put;
@@ -467,7 +467,8 @@ xfs_qm_scall_getquota(
xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst);
out_put:
- xfs_qm_dqput(dqp);
+ mutex_unlock(&dqp->q_qlock);
+ xfs_qm_dqrele(dqp);
return error;
}
@@ -497,7 +498,8 @@ xfs_qm_scall_getquota_next(
*id = dqp->q_id;
xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst);
+ mutex_unlock(&dqp->q_qlock);
- xfs_qm_dqput(dqp);
+ xfs_qm_dqrele(dqp);
return error;
}
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 4c7f7ce4fd2f..94fbe3d99ec7 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -65,7 +65,7 @@ xfs_fs_get_quota_state(
memset(state, 0, sizeof(*state));
if (!XFS_IS_QUOTA_ON(mp))
return 0;
- state->s_incoredqs = q->qi_dquots;
+ state->s_incoredqs = min_t(uint64_t, q->qi_dquots, UINT_MAX);
if (XFS_IS_UQUOTA_ON(mp))
state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED;
if (XFS_IS_UQUOTA_ENFORCED(mp))
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index fe2d7aab8554..3728234699a2 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -78,6 +78,11 @@ xfs_cui_item_size(
*nbytes += xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents);
}
+unsigned int xfs_cui_log_space(unsigned int nr)
+{
+ return xlog_item_space(1, xfs_cui_log_format_sizeof(nr)); /* 1 iovec, format sized for nr extents */
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given cui log item. We use only 1 iovec, and we point that
@@ -179,6 +184,11 @@ xfs_cud_item_size(
*nbytes += sizeof(struct xfs_cud_log_format);
}
+unsigned int xfs_cud_log_space(void)
+{
+ return xlog_item_space(1, sizeof(struct xfs_cud_log_format)); /* 1 iovec, fixed-size format */
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given cud log item. We use only 1 iovec, and we point that
@@ -707,18 +717,18 @@ xlog_recover_cui_commit_pass2(
struct xfs_cui_log_format *cui_formatp;
size_t len;
- cui_formatp = item->ri_buf[0].i_addr;
+ cui_formatp = item->ri_buf[0].iov_base;
- if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) {
+ if (item->ri_buf[0].iov_len < xfs_cui_log_format_sizeof(0)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents);
- if (item->ri_buf[0].i_len != len) {
+ if (item->ri_buf[0].iov_len != len) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -749,18 +759,18 @@ xlog_recover_rtcui_commit_pass2(
struct xfs_cui_log_format *cui_formatp;
size_t len;
- cui_formatp = item->ri_buf[0].i_addr;
+ cui_formatp = item->ri_buf[0].iov_base;
- if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) {
+ if (item->ri_buf[0].iov_len < xfs_cui_log_format_sizeof(0)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents);
- if (item->ri_buf[0].i_len != len) {
+ if (item->ri_buf[0].iov_len != len) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -781,7 +791,7 @@ xlog_recover_rtcui_commit_pass2(
xfs_lsn_t lsn)
{
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
#endif
@@ -807,10 +817,10 @@ xlog_recover_cud_commit_pass2(
{
struct xfs_cud_log_format *cud_formatp;
- cud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) {
+ cud_formatp = item->ri_buf[0].iov_base;
+ if (item->ri_buf[0].iov_len != sizeof(struct xfs_cud_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -833,10 +843,10 @@ xlog_recover_rtcud_commit_pass2(
{
struct xfs_cud_log_format *cud_formatp;
- cud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) {
+ cud_formatp = item->ri_buf[0].iov_base;
+ if (item->ri_buf[0].iov_len != sizeof(struct xfs_cud_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
index bfee8f30c63c..0fc3f493342b 100644
--- a/fs/xfs/xfs_refcount_item.h
+++ b/fs/xfs/xfs_refcount_item.h
@@ -76,4 +76,7 @@ struct xfs_refcount_intent;
void xfs_refcount_defer_add(struct xfs_trans *tp,
struct xfs_refcount_intent *ri);
+unsigned int xfs_cui_log_space(unsigned int nr);
+unsigned int xfs_cud_log_space(void);
+
#endif /* __XFS_REFCOUNT_ITEM_H__ */
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index cc3b4df88110..3f177b4ec131 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -293,7 +293,7 @@ xfs_bmap_trim_cow(
return xfs_reflink_trim_around_shared(ip, imap, shared);
}
-static int
+int
xfs_reflink_convert_cow_locked(
struct xfs_inode *ip,
xfs_fileoff_t offset_fsb,
@@ -786,35 +786,19 @@ xfs_reflink_update_quota(
* requirements as low as possible.
*/
STATIC int
-xfs_reflink_end_cow_extent(
+xfs_reflink_end_cow_extent_locked(
+ struct xfs_trans *tp,
struct xfs_inode *ip,
xfs_fileoff_t *offset_fsb,
xfs_fileoff_t end_fsb)
{
struct xfs_iext_cursor icur;
struct xfs_bmbt_irec got, del, data;
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_trans *tp;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
- unsigned int resblks;
int nmaps;
bool isrt = XFS_IS_REALTIME_INODE(ip);
int error;
- resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
- XFS_TRANS_RESERVE, &tp);
- if (error)
- return error;
-
- /*
- * Lock the inode. We have to ijoin without automatic unlock because
- * the lead transaction is the refcountbt record deletion; the data
- * fork update follows as a deferred log item.
- */
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
-
/*
* In case of racing, overlapping AIO writes no COW extents might be
* left by the time I/O completes for the loser of the race. In that
@@ -823,7 +807,7 @@ xfs_reflink_end_cow_extent(
if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) ||
got.br_startoff >= end_fsb) {
*offset_fsb = end_fsb;
- goto out_cancel;
+ return 0;
}
/*
@@ -837,7 +821,7 @@ xfs_reflink_end_cow_extent(
if (!xfs_iext_next_extent(ifp, &icur, &got) ||
got.br_startoff >= end_fsb) {
*offset_fsb = end_fsb;
- goto out_cancel;
+ return 0;
}
}
del = got;
@@ -846,14 +830,14 @@ xfs_reflink_end_cow_extent(
error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
XFS_IEXT_REFLINK_END_COW_CNT);
if (error)
- goto out_cancel;
+ return error;
/* Grab the corresponding mapping in the data fork. */
nmaps = 1;
error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
&nmaps, 0);
if (error)
- goto out_cancel;
+ return error;
/* We can only remap the smaller of the two extent sizes. */
data.br_blockcount = min(data.br_blockcount, del.br_blockcount);
@@ -882,7 +866,7 @@ xfs_reflink_end_cow_extent(
error = xfs_bunmapi(NULL, ip, data.br_startoff,
data.br_blockcount, 0, 1, &done);
if (error)
- goto out_cancel;
+ return error;
ASSERT(done);
}
@@ -899,17 +883,45 @@ xfs_reflink_end_cow_extent(
/* Remove the mapping from the CoW fork. */
xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
- error = xfs_trans_commit(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- return error;
-
/* Update the caller about how much progress we made. */
*offset_fsb = del.br_startoff + del.br_blockcount;
return 0;
+}
-out_cancel:
- xfs_trans_cancel(tp);
+/*
+ * Remap part of the CoW fork into the data fork.
+ *
+ * We aim to remap the range starting at @offset_fsb and ending at @end_fsb
+ * into the data fork; this function will remap what it can and advance
+ * *@offset_fsb past the work that was done. Each remap gets its own
+ * transaction because we can end up merging and splitting bmbt blocks for
+ * every remap operation and we'd like to keep the block reservation
+ * requirements as low as possible.
+ */
+STATIC int
+xfs_reflink_end_cow_extent(
+ struct xfs_inode *ip,
+ xfs_fileoff_t *offset_fsb,
+ xfs_fileoff_t end_fsb)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ unsigned int resblks;
+ int error;
+
+ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+ XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0); /* lock_flags 0: the inode is unlocked explicitly below */
+
+ error = xfs_reflink_end_cow_extent_locked(tp, ip, offset_fsb, end_fsb);
+ if (error)
+ xfs_trans_cancel(tp);
+ else
+ error = xfs_trans_commit(tp); /* also taken when the helper found nothing to remap */
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -973,6 +985,78 @@ xfs_reflink_end_cow(
}
/*
+ * Remap the whole range from the CoW fork into the data fork in a single
+ * transaction, which is the critical part in achieving atomic behaviour.
+ * The regular CoW end path does not use this function, so as to keep the
+ * block reservation per transaction as low as possible.
+ */
+int
+xfs_reflink_end_atomic_cow(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t count)
+{
+ xfs_fileoff_t offset_fsb;
+ xfs_fileoff_t end_fsb;
+ int error = 0;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ unsigned int resblks;
+
+ trace_xfs_reflink_end_cow(ip, offset, count);
+
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ end_fsb = XFS_B_TO_FSB(mp, offset + count);
+
+ /*
+ * Each remapping operation could cause a btree split, so in the worst
+ * case that's one for each block.
+ */
+ resblks = (end_fsb - offset_fsb) *
+ XFS_NEXTENTADD_SPACE_RES(mp, 1, XFS_DATA_FORK);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_atomic_ioend, resblks, 0,
+ XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0); /* lock_flags 0: the inode is unlocked explicitly below */
+
+ while (end_fsb > offset_fsb && !error) { /* the helper advances offset_fsb on success */
+ error = xfs_reflink_end_cow_extent_locked(tp, ip, &offset_fsb,
+ end_fsb);
+ }
+ if (error) {
+ trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
+ goto out_cancel;
+ }
+ error = xfs_trans_commit(tp); /* one commit covers every extent remapped above */
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+out_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/* Compute the largest atomic write that we can complete through software. */
+xfs_extlen_t
+xfs_reflink_max_atomic_cow(
+ struct xfs_mount *mp)
+{
+ /* We cannot do any atomic writes without out of place writes. */
+ if (!xfs_can_sw_atomic_write(mp))
+ return 0; /* 0 tells the caller software atomic writes are unavailable */
+
+ /*
+ * Atomic write limits must always be a power-of-2, according to
+ * generic_atomic_write_valid.
+ */
+ return rounddown_pow_of_two(xfs_calc_max_atomic_write_fsblocks(mp)); /* presumably in fs blocks, per the helper's name */
+}
+
+/*
* Free all CoW staging blocks that are still referenced by the ondisk refcount
* metadata. The ondisk metadata does not track which inode created the
* staging extent, so callers must ensure that there are no cached inodes with
@@ -1797,7 +1881,8 @@ xfs_reflink_unshare(
&xfs_dax_write_iomap_ops);
else
error = iomap_file_unshare(inode, offset, len,
- &xfs_buffered_write_iomap_ops);
+ &xfs_buffered_write_iomap_ops,
+ &xfs_iomap_write_ops);
if (error)
goto out;
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index cc4e92278279..9d1ed9bb0bee 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -17,7 +17,7 @@ xfs_can_free_cowblocks(struct xfs_inode *ip)
{
struct inode *inode = VFS_I(ip);
- if ((inode->i_state & I_DIRTY_PAGES) ||
+ if ((inode_state_read_once(inode) & I_DIRTY_PAGES) ||
mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) ||
mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
atomic_read(&inode->i_dio_count))
@@ -35,6 +35,8 @@ int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
bool convert_now);
extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
+int xfs_reflink_convert_cow_locked(struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb, xfs_filblks_t count_fsb);
extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip,
struct xfs_trans **tpp, xfs_fileoff_t offset_fsb,
@@ -43,6 +45,8 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count, bool cancel_real);
extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
+int xfs_reflink_end_atomic_cow(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t count);
extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, loff_t len,
@@ -64,4 +68,6 @@ extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen,
bool xfs_reflink_supports_rextsize(struct xfs_mount *mp, unsigned int rextsize);
+xfs_extlen_t xfs_reflink_max_atomic_cow(struct xfs_mount *mp);
+
#endif /* __XFS_REFLINK_H */
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 89decffe76c8..15f0903f6fd4 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -77,6 +77,11 @@ xfs_rui_item_size(
*nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents);
}
+/*
+ * Return what xlog_item_space() reports for a single region holding an RUI
+ * log format structure sized for @nr extents.
+ */
+unsigned int xfs_rui_log_space(unsigned int nr)
+{
+	size_t	format_bytes = xfs_rui_log_format_sizeof(nr);
+
+	return xlog_item_space(1, format_bytes);
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given rui log item. We use only 1 iovec, and we point that
@@ -180,6 +185,11 @@ xfs_rud_item_size(
*nbytes += sizeof(struct xfs_rud_log_format);
}
+/*
+ * Return what xlog_item_space() reports for a single region holding one
+ * struct xfs_rud_log_format.
+ */
+unsigned int xfs_rud_log_space(void)
+{
+	const size_t	format_bytes = sizeof(struct xfs_rud_log_format);
+
+	return xlog_item_space(1, format_bytes);
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given rud log item. We use only 1 iovec, and we point that
@@ -736,18 +746,18 @@ xlog_recover_rui_commit_pass2(
struct xfs_rui_log_format *rui_formatp;
size_t len;
- rui_formatp = item->ri_buf[0].i_addr;
+ rui_formatp = item->ri_buf[0].iov_base;
- if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) {
+ if (item->ri_buf[0].iov_len < xfs_rui_log_format_sizeof(0)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents);
- if (item->ri_buf[0].i_len != len) {
+ if (item->ri_buf[0].iov_len != len) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -778,18 +788,18 @@ xlog_recover_rtrui_commit_pass2(
struct xfs_rui_log_format *rui_formatp;
size_t len;
- rui_formatp = item->ri_buf[0].i_addr;
+ rui_formatp = item->ri_buf[0].iov_base;
- if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) {
+ if (item->ri_buf[0].iov_len < xfs_rui_log_format_sizeof(0)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents);
- if (item->ri_buf[0].i_len != len) {
+ if (item->ri_buf[0].iov_len != len) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -810,7 +820,7 @@ xlog_recover_rtrui_commit_pass2(
xfs_lsn_t lsn)
{
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
#endif
@@ -836,10 +846,10 @@ xlog_recover_rud_commit_pass2(
{
struct xfs_rud_log_format *rud_formatp;
- rud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_rud_log_format)) {
+ rud_formatp = item->ri_buf[0].iov_base;
+ if (item->ri_buf[0].iov_len != sizeof(struct xfs_rud_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- rud_formatp, item->ri_buf[0].i_len);
+ rud_formatp, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -862,10 +872,10 @@ xlog_recover_rtrud_commit_pass2(
{
struct xfs_rud_log_format *rud_formatp;
- rud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_rud_log_format)) {
+ rud_formatp = item->ri_buf[0].iov_base;
+ if (item->ri_buf[0].iov_len != sizeof(struct xfs_rud_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- rud_formatp, item->ri_buf[0].i_len);
+ rud_formatp, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
index 40d331555675..3a99f0117f2d 100644
--- a/fs/xfs/xfs_rmap_item.h
+++ b/fs/xfs/xfs_rmap_item.h
@@ -75,4 +75,7 @@ struct xfs_rmap_intent;
void xfs_rmap_defer_add(struct xfs_trans *tp, struct xfs_rmap_intent *ri);
+unsigned int xfs_rui_log_space(unsigned int nr);
+unsigned int xfs_rud_log_space(void);
+
#endif /* __XFS_RMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 6484c596ecea..e063f4f2f2e6 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -729,9 +729,7 @@ xfs_rtginode_ensure(
if (rtg->rtg_inodes[type])
return 0;
- error = xfs_trans_alloc_empty(rtg_mount(rtg), &tp);
- if (error)
- return error;
+ tp = xfs_trans_alloc_empty(rtg_mount(rtg));
error = xfs_rtginode_load(rtg, type, tp);
xfs_trans_cancel(tp);
@@ -1257,10 +1255,10 @@ xfs_growfs_check_rtgeom(
min_logfsbs = min_t(xfs_extlen_t, xfs_log_calc_minimum_size(nmp),
nmp->m_rsumblocks * 2);
- kfree(nmp);
+ trace_xfs_growfs_check_rtgeom(mp, min_logfsbs);
if (min_logfsbs > mp->m_sb.sb_logblocks)
- return -EINVAL;
+ goto out_inval;
if (xfs_has_zoned(mp)) {
uint32_t gblocks = mp->m_groups[XG_TYPE_RTG].blocks;
@@ -1268,16 +1266,20 @@ xfs_growfs_check_rtgeom(
if (rextsize != 1)
- return -EINVAL;
+ goto out_inval; /* nmp is still allocated here; free it on exit */
- div_u64_rem(mp->m_sb.sb_rblocks, gblocks, &rem);
+ div_u64_rem(nmp->m_sb.sb_rblocks, gblocks, &rem);
if (rem) {
xfs_warn(mp,
"new RT volume size (%lld) not aligned to RT group size (%d)",
- mp->m_sb.sb_rblocks, gblocks);
- return -EINVAL;
+ nmp->m_sb.sb_rblocks, gblocks);
+ goto out_inval;
}
}
+ kfree(nmp);
return 0;
+out_inval:
+ kfree(nmp);
+ return -EINVAL;
}
/*
@@ -1303,9 +1305,7 @@ xfs_growfs_rt_prep_groups(
if (!mp->m_rtdirip) {
struct xfs_trans *tp;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
+ tp = xfs_trans_alloc_empty(mp);
error = xfs_rtginode_load_parent(tp);
xfs_trans_cancel(tp);
@@ -1672,10 +1672,7 @@ xfs_rtmount_inodes(
struct xfs_rtgroup *rtg = NULL;
int error;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
-
+ tp = xfs_trans_alloc_empty(mp);
if (xfs_has_rtgroups(mp) && mp->m_sb.sb_rgcount > 0) {
error = xfs_rtginode_load_parent(tp);
if (error)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index b2dd0c0bf509..bc71aa9dcee8 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -102,19 +102,33 @@ static const struct constant_table dax_param_enums[] = {
* Table driven mount option parser.
*/
enum {
- Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev,
+ Op_deprecated, Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev,
Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid,
Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups,
- Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, Opt_ikeep,
- Opt_noikeep, Opt_largeio, Opt_nolargeio, Opt_attr2, Opt_noattr2,
+ Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32,
+ Opt_largeio, Opt_nolargeio,
Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones,
- Opt_lifetime, Opt_nolifetime,
+ Opt_lifetime, Opt_nolifetime, Opt_max_atomic_write,
};
+#define fsparam_dead(NAME) \
+ __fsparam(NULL, (NAME), Op_deprecated, fs_param_deprecated, NULL)
+
static const struct fs_parameter_spec xfs_fs_parameters[] = {
+ /*
+ * These mount options were supposed to be deprecated in September 2025
+ * but the deprecation warning was buggy, so not all users were
+ * notified. The deprecation is now obnoxiously loud and postponed to
+ * September 2030.
+ */
+ fsparam_dead("attr2"),
+ fsparam_dead("noattr2"),
+ fsparam_dead("ikeep"),
+ fsparam_dead("noikeep"),
+
fsparam_u32("logbufs", Opt_logbufs),
fsparam_string("logbsize", Opt_logbsize),
fsparam_string("logdev", Opt_logdev),
@@ -133,12 +147,8 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
fsparam_flag("norecovery", Opt_norecovery),
fsparam_flag("inode64", Opt_inode64),
fsparam_flag("inode32", Opt_inode32),
- fsparam_flag("ikeep", Opt_ikeep),
- fsparam_flag("noikeep", Opt_noikeep),
fsparam_flag("largeio", Opt_largeio),
fsparam_flag("nolargeio", Opt_nolargeio),
- fsparam_flag("attr2", Opt_attr2),
- fsparam_flag("noattr2", Opt_noattr2),
fsparam_flag("filestreams", Opt_filestreams),
fsparam_flag("quota", Opt_quota),
fsparam_flag("noquota", Opt_noquota),
@@ -159,6 +169,7 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
fsparam_u32("max_open_zones", Opt_max_open_zones),
fsparam_flag("lifetime", Opt_lifetime),
fsparam_flag("nolifetime", Opt_nolifetime),
+ fsparam_string("max_atomic_write", Opt_max_atomic_write),
{}
};
@@ -174,13 +185,11 @@ xfs_fs_show_options(
{
static struct proc_xfs_info xfs_info_set[] = {
/* the few simple ones we can get from the mount struct */
- { XFS_FEAT_IKEEP, ",ikeep" },
{ XFS_FEAT_WSYNC, ",wsync" },
{ XFS_FEAT_NOALIGN, ",noalign" },
{ XFS_FEAT_SWALLOC, ",swalloc" },
{ XFS_FEAT_NOUUID, ",nouuid" },
{ XFS_FEAT_NORECOVERY, ",norecovery" },
- { XFS_FEAT_ATTR2, ",attr2" },
{ XFS_FEAT_FILESTREAMS, ",filestreams" },
{ XFS_FEAT_GRPID, ",grpid" },
{ XFS_FEAT_DISCARD, ",discard" },
@@ -241,6 +250,9 @@ xfs_fs_show_options(
if (mp->m_max_open_zones)
seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones);
+ if (mp->m_awu_max_bytes)
+ seq_printf(m, ",max_atomic_write=%lluk",
+ mp->m_awu_max_bytes >> 10);
return 0;
}
@@ -380,10 +392,11 @@ xfs_blkdev_get(
struct file **bdev_filep)
{
int error = 0;
+ blk_mode_t mode;
- *bdev_filep = bdev_file_open_by_path(name,
- BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES,
- mp->m_super, &fs_holder_ops);
+ mode = sb_open_mode(mp->m_super->s_flags);
+ *bdev_filep = bdev_file_open_by_path(name, mode,
+ mp->m_super, &fs_holder_ops);
if (IS_ERR(*bdev_filep)) {
error = PTR_ERR(*bdev_filep);
*bdev_filep = NULL;
@@ -481,21 +494,29 @@ xfs_open_devices(
/*
* Setup xfs_mount buffer target pointers
*/
- error = -ENOMEM;
mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_file);
- if (!mp->m_ddev_targp)
+ if (IS_ERR(mp->m_ddev_targp)) {
+ error = PTR_ERR(mp->m_ddev_targp);
+ mp->m_ddev_targp = NULL;
goto out_close_rtdev;
+ }
if (rtdev_file) {
mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_file);
- if (!mp->m_rtdev_targp)
+ if (IS_ERR(mp->m_rtdev_targp)) {
+ error = PTR_ERR(mp->m_rtdev_targp);
+ mp->m_rtdev_targp = NULL;
goto out_free_ddev_targ;
+ }
}
if (logdev_file && file_bdev(logdev_file) != ddev) {
mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_file);
- if (!mp->m_logdev_targp)
+ if (IS_ERR(mp->m_logdev_targp)) {
+ error = PTR_ERR(mp->m_logdev_targp);
+ mp->m_logdev_targp = NULL;
goto out_free_rtdev_targ;
+ }
} else {
mp->m_logdev_targp = mp->m_ddev_targp;
/* Handle won't be used, drop it */
@@ -528,7 +549,8 @@ xfs_setup_devices(
{
int error;
- error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize);
+ error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize,
+ mp->m_sb.sb_dblocks);
if (error)
return error;
@@ -537,8 +559,8 @@ xfs_setup_devices(
if (xfs_has_sector(mp))
log_sector_size = mp->m_sb.sb_logsectsize;
- error = xfs_setsize_buftarg(mp->m_logdev_targp,
- log_sector_size);
+ error = xfs_configure_buftarg(mp->m_logdev_targp,
+ log_sector_size, mp->m_sb.sb_logblocks);
if (error)
return error;
}
@@ -551,8 +573,8 @@ xfs_setup_devices(
}
mp->m_rtdev_targp = mp->m_ddev_targp;
} else if (mp->m_rtname) {
- error = xfs_setsize_buftarg(mp->m_rtdev_targp,
- mp->m_sb.sb_sectsize);
+ error = xfs_configure_buftarg(mp->m_rtdev_targp,
+ mp->m_sb.sb_sectsize, mp->m_sb.sb_rblocks);
if (error)
return error;
}
@@ -565,19 +587,19 @@ xfs_init_mount_workqueues(
struct xfs_mount *mp)
{
mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s",
- XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU),
1, mp->m_super->s_id);
if (!mp->m_buf_workqueue)
goto out;
mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
- XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU),
0, mp->m_super->s_id);
if (!mp->m_unwritten_workqueue)
goto out_destroy_buf;
mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
- XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU),
0, mp->m_super->s_id);
if (!mp->m_reclaim_workqueue)
goto out_destroy_unwritten;
@@ -589,13 +611,14 @@ xfs_init_mount_workqueues(
goto out_destroy_reclaim;
mp->m_inodegc_wq = alloc_workqueue("xfs-inodegc/%s",
- XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM),
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU),
1, mp->m_super->s_id);
if (!mp->m_inodegc_wq)
goto out_destroy_blockgc;
mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s",
- XFS_WQFLAGS(WQ_FREEZABLE), 0, mp->m_super->s_id);
+ XFS_WQFLAGS(WQ_FREEZABLE | WQ_PERCPU), 0,
+ mp->m_super->s_id);
if (!mp->m_sync_workqueue)
goto out_destroy_inodegc;
@@ -765,7 +788,7 @@ xfs_fs_drop_inode(
return 0;
}
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
}
STATIC void
@@ -777,6 +800,12 @@ xfs_fs_evict_inode(
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
+
+ if (IS_ENABLED(CONFIG_XFS_RT) &&
+ S_ISREG(inode->i_mode) && inode->i_private) {
+ xfs_open_zone_put(inode->i_private);
+ inode->i_private = NULL;
+ }
}
static void
@@ -1075,15 +1104,6 @@ xfs_finish_flags(
}
/*
- * V5 filesystems always use attr2 format for attributes.
- */
- if (xfs_has_crc(mp) && xfs_has_noattr2(mp)) {
- xfs_warn(mp, "Cannot mount a V5 filesystem as noattr2. "
- "attr2 is always enabled for V5 filesystems.");
- return -EINVAL;
- }
-
- /*
* prohibit r/w mounts of read-only filesystems
*/
if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !xfs_is_readonly(mp)) {
@@ -1149,7 +1169,7 @@ xfs_init_percpu_counters(
return 0;
free_freecounters:
- while (--i > 0)
+ while (--i >= 0)
percpu_counter_destroy(&mp->m_free[i].count);
percpu_counter_destroy(&mp->m_delalloc_rtextents);
free_delalloc:
@@ -1334,19 +1354,64 @@ suffix_kstrtoint(
return ret;
}
+/*
+ * Parse an unsigned 64-bit integer from @s in the given @base, honouring an
+ * optional trailing size suffix: K/k (<< 10), M/m (<< 20) or G/g (<< 30).
+ *
+ * On success, stores the scaled value in @res and returns 0.  Returns
+ * -ENOMEM if the working copy of the string cannot be allocated, and -EINVAL
+ * if the string is empty, is not a valid number, or the scaled value does
+ * not fit in 64 bits.  @res is left untouched on failure.
+ */
+static int
+suffix_kstrtoull(
+	const char		*s,
+	unsigned int		base,
+	unsigned long long	*res)
+{
+	unsigned int		shift_left_factor = 0;
+	unsigned long long	_res;
+	unsigned long long	scaled;
+	size_t			len;
+	char			*value;
+	int			ret = 0;
+
+	value = kstrdup(s, GFP_KERNEL);
+	if (!value)
+		return -ENOMEM;
+
+	/* An empty string has no last character to inspect. */
+	len = strlen(value);
+	if (!len) {
+		ret = -EINVAL;
+		goto out_free;
+	}
+
+	switch (value[len - 1]) {
+	case 'K':
+	case 'k':
+		shift_left_factor = 10;
+		break;
+	case 'M':
+	case 'm':
+		shift_left_factor = 20;
+		break;
+	case 'G':
+	case 'g':
+		shift_left_factor = 30;
+		break;
+	default:
+		break;
+	}
+
+	/* Strip the suffix so that kstrtoull() only sees the digits. */
+	if (shift_left_factor)
+		value[len - 1] = '\0';
+
+	if (kstrtoull(value, base, &_res)) {
+		ret = -EINVAL;
+		goto out_free;
+	}
+
+	/* Reject values whose scaled size would overflow 64 bits. */
+	scaled = _res << shift_left_factor;
+	if ((scaled >> shift_left_factor) != _res) {
+		ret = -EINVAL;
+		goto out_free;
+	}
+
+	*res = scaled;
+out_free:
+	kfree(value);
+	return ret;
+}
+
static inline void
xfs_fs_warn_deprecated(
struct fs_context *fc,
- struct fs_parameter *param,
- uint64_t flag,
- bool value)
+ struct fs_parameter *param)
{
- /* Don't print the warning if reconfiguring and current mount point
- * already had the flag set
+ /*
+ * Always warn about someone passing in a deprecated mount option.
+ * Previously we wouldn't print the warning if we were reconfiguring
+ * and current mount point already had the flag set, but that was not
+ * the right thing to do.
+ *
+ * Many distributions mount the root filesystem with no options in the
+ * initramfs and rely on mount -a to remount the root fs with the
+ * options in fstab. However, the old behavior meant that there would
+ * never be a warning about deprecated mount options for the root fs in
+ * /etc/fstab. On a single-fs system, that means no warning at all.
+ *
+ * Compounding this problem are distribution scripts that copy
+ * /proc/mounts to fstab, which means that we can't remove mount
+ * options unless we're 100% sure they have only ever been advertised
+ * in /proc/mounts in response to explicitly provided mount options.
*/
- if ((fc->purpose & FS_CONTEXT_FOR_RECONFIGURE) &&
- !!(XFS_M(fc->root->d_sb)->m_features & flag) == value)
- return;
xfs_warn(fc->s_fs_info, "%s mount option is deprecated.", param->key);
}
@@ -1372,6 +1437,9 @@ xfs_fs_parse_param(
return opt;
switch (opt) {
+ case Op_deprecated:
+ xfs_fs_warn_deprecated(fc, param);
+ return 0;
case Opt_logbufs:
parsing_mp->m_logbufs = result.uint_32;
return 0;
@@ -1492,23 +1560,6 @@ xfs_fs_parse_param(
xfs_mount_set_dax_mode(parsing_mp, result.uint_32);
return 0;
#endif
- /* Following mount options will be removed in September 2025 */
- case Opt_ikeep:
- xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, true);
- parsing_mp->m_features |= XFS_FEAT_IKEEP;
- return 0;
- case Opt_noikeep:
- xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, false);
- parsing_mp->m_features &= ~XFS_FEAT_IKEEP;
- return 0;
- case Opt_attr2:
- xfs_fs_warn_deprecated(fc, param, XFS_FEAT_ATTR2, true);
- parsing_mp->m_features |= XFS_FEAT_ATTR2;
- return 0;
- case Opt_noattr2:
- xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true);
- parsing_mp->m_features |= XFS_FEAT_NOATTR2;
- return 0;
case Opt_max_open_zones:
parsing_mp->m_max_open_zones = result.uint_32;
return 0;
@@ -1518,6 +1569,14 @@ xfs_fs_parse_param(
case Opt_nolifetime:
parsing_mp->m_features |= XFS_FEAT_NOLIFETIME;
return 0;
+ case Opt_max_atomic_write:
+ if (suffix_kstrtoull(param->string, 10,
+ &parsing_mp->m_awu_max_bytes)) {
+ xfs_warn(parsing_mp,
+ "max atomic write size must be positive integer");
+ return -EINVAL;
+ }
+ return 0;
default:
xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
return -EINVAL;
@@ -1536,16 +1595,6 @@ xfs_fs_validate_params(
return -EINVAL;
}
- /*
- * We have not read the superblock at this point, so only the attr2
- * mount option can set the attr2 feature by this stage.
- */
- if (xfs_has_attr2(mp) && xfs_has_noattr2(mp)) {
- xfs_warn(mp, "attr2 and noattr2 cannot both be specified.");
- return -EINVAL;
- }
-
-
if (xfs_has_noalign(mp) && (mp->m_dalign || mp->m_swidth)) {
xfs_warn(mp,
"sunit and swidth options incompatible with the noalign option");
@@ -1644,7 +1693,10 @@ xfs_fs_fill_super(
if (error)
return error;
- sb_min_blocksize(sb, BBSIZE);
+ if (!sb_min_blocksize(sb, BBSIZE)) {
+ xfs_err(mp, "unable to set blocksize");
+ return -EINVAL;
+ }
sb->s_xattr = xfs_xattr_handlers;
sb->s_export_op = &xfs_export_operations;
#ifdef CONFIG_XFS_QUOTA
@@ -1897,13 +1949,6 @@ xfs_fs_fill_super(
}
}
-
- if (xfs_has_exchange_range(mp))
- xfs_warn_experimental(mp, XFS_EXPERIMENTAL_EXCHRANGE);
-
- if (xfs_has_parent(mp))
- xfs_warn_experimental(mp, XFS_EXPERIMENTAL_PPTR);
-
/*
* If no quota mount options were provided, maybe we'll try to pick
* up the quota accounting and enforcement flags from the ondisk sb.
@@ -1969,6 +2014,19 @@ xfs_remount_rw(
struct xfs_sb *sbp = &mp->m_sb;
int error;
+ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp &&
+ xfs_readonly_buftarg(mp->m_logdev_targp)) {
+ xfs_warn(mp,
+ "ro->rw transition prohibited by read-only logdev");
+ return -EACCES;
+ }
+
+ if (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp)) {
+ xfs_warn(mp,
+ "ro->rw transition prohibited by read-only rtdev");
+ return -EACCES;
+ }
+
if (xfs_has_norecovery(mp)) {
xfs_warn(mp,
"ro->rw transition prohibited on norecovery mount");
@@ -2114,6 +2172,14 @@ xfs_fs_reconfigure(
if (error)
return error;
+ /* Validate new max_atomic_write option before making other changes */
+ if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) {
+ error = xfs_set_max_atomic_write_opt(mp,
+ new_mp->m_awu_max_bytes);
+ if (error)
+ return error;
+ }
+
/* inode32 -> inode64 */
if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) {
mp->m_features &= ~XFS_FEAT_SMALL_INUMS;
@@ -2126,6 +2192,17 @@ xfs_fs_reconfigure(
mp->m_maxagi = xfs_set_inode_alloc(mp, mp->m_sb.sb_agcount);
}
+ /*
+ * Now that mp has been modified according to the remount options, we
+ * do a final option validation with xfs_finish_flags(), just like it
+ * is done during mount. We cannot use xfs_finish_flags() on new_mp as
+ * it contains only the user given options.
+ */
+ error = xfs_finish_flags(mp);
+ if (error)
+ return error;
+
/* ro -> rw */
if (xfs_is_readonly(mp) && !(flags & SB_RDONLY)) {
error = xfs_remount_rw(mp);
@@ -2178,7 +2255,7 @@ xfs_init_fs_context(
struct xfs_mount *mp;
int i;
- mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL | __GFP_NOFAIL);
+ mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
if (!mp)
return -ENOMEM;
@@ -2499,8 +2576,8 @@ xfs_init_workqueues(void)
* AGs in all the filesystems mounted. Hence use the default large
* max_active value for this workqueue.
*/
- xfs_alloc_wq = alloc_workqueue("xfsalloc",
- XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 0);
+ xfs_alloc_wq = alloc_workqueue("xfsalloc", XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU),
+ 0);
if (!xfs_alloc_wq)
return -ENOMEM;
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 751dc74a3067..9918f14b4874 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -50,7 +50,7 @@ xfs_panic_mask_proc_handler(
}
#endif /* CONFIG_PROC_FS */
-STATIC int
+static inline int
xfs_deprecated_dointvec_minmax(
const struct ctl_table *ctl,
int write,
@@ -68,24 +68,6 @@ xfs_deprecated_dointvec_minmax(
static const struct ctl_table xfs_table[] = {
{
- .procname = "irix_sgid_inherit",
- .data = &xfs_params.sgid_inherit.val,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = xfs_deprecated_dointvec_minmax,
- .extra1 = &xfs_params.sgid_inherit.min,
- .extra2 = &xfs_params.sgid_inherit.max
- },
- {
- .procname = "irix_symlink_mode",
- .data = &xfs_params.symlink_mode.val,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = xfs_deprecated_dointvec_minmax,
- .extra1 = &xfs_params.symlink_mode.min,
- .extra2 = &xfs_params.symlink_mode.max
- },
- {
.procname = "panic_mask",
.data = &xfs_params.panic_mask.val,
.maxlen = sizeof(int),
@@ -185,15 +167,6 @@ static const struct ctl_table xfs_table[] = {
.extra1 = &xfs_params.blockgc_timer.min,
.extra2 = &xfs_params.blockgc_timer.max,
},
- {
- .procname = "speculative_cow_prealloc_lifetime",
- .data = &xfs_params.blockgc_timer.val,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = xfs_deprecated_dointvec_minmax,
- .extra1 = &xfs_params.blockgc_timer.min,
- .extra2 = &xfs_params.blockgc_timer.max,
- },
/* please keep this the last entry */
#ifdef CONFIG_PROC_FS
{
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 276696a07040..ed9d896079c1 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -19,9 +19,6 @@ typedef struct xfs_sysctl_val {
} xfs_sysctl_val_t;
typedef struct xfs_param {
- xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is
- * not a member of parent dir GID. */
- xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */
xfs_sysctl_val_t panic_mask; /* bitmask to cause panic on errors. */
xfs_sysctl_val_t error_level; /* Degree of reporting for problems */
xfs_sysctl_val_t syncd_timer; /* Interval between xfssyncd wakeups */
@@ -29,8 +26,6 @@ typedef struct xfs_param {
xfs_sysctl_val_t inherit_sync; /* Inherit the "sync" inode flag. */
xfs_sysctl_val_t inherit_nodump;/* Inherit the "nodump" inode flag. */
xfs_sysctl_val_t inherit_noatim;/* Inherit the "noatime" inode flag. */
- xfs_sysctl_val_t xfs_buf_timer; /* Interval between xfsbufd wakeups. */
- xfs_sysctl_val_t xfs_buf_age; /* Metadata buffer age before flush. */
xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */
xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */
xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index b7e82d85f043..7a5c5ef2db92 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -718,8 +718,40 @@ max_open_zones_show(
}
XFS_SYSFS_ATTR_RO(max_open_zones);
+/*
+ * sysfs store handler: parse @buf as an unsigned integer (base auto-detected
+ * by kstrtouint), reject anything above 100, and save the result in the
+ * mount's m_zonegc_low_space.  Returns @count on success or a negative errno.
+ */
+static ssize_t
+zonegc_low_space_store(
+	struct kobject		*kobj,
+	const char		*buf,
+	size_t			count)
+{
+	unsigned int		new_val;
+	int			error;
+
+	error = kstrtouint(buf, 0, &new_val);
+	if (error)
+		return error;
+	if (new_val > 100)
+		return -EINVAL;
+
+	zoned_to_mp(kobj)->m_zonegc_low_space = new_val;
+	return count;
+}
+
+/*
+ * sysfs show handler: emit the mount's current m_zonegc_low_space value as an
+ * unsigned decimal followed by a newline.
+ */
+static ssize_t
+zonegc_low_space_show(
+ struct kobject *kobj,
+ char *buf)
+{
+ return sysfs_emit(buf, "%u\n",
+ zoned_to_mp(kobj)->m_zonegc_low_space);
+}
+/* Presumably binds the _store/_show pair above to a read-write attribute. */
+XFS_SYSFS_ATTR_RW(zonegc_low_space);
+
static struct attribute *xfs_zoned_attrs[] = {
ATTR_LIST(max_open_zones),
+ ATTR_LIST(zonegc_low_space),
NULL,
};
ATTRIBUTE_GROUPS(xfs_zoned);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index e56ba1963160..f70afbf3cb19 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -170,6 +170,96 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list);
DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list);
+/*
+ * Records the mount's device, the group type (printed symbolically via
+ * XG_TYPE_STRINGS) and the caller-supplied max_write, max_ioend, max_gsize
+ * and awu_max values.
+ */
+TRACE_EVENT(xfs_calc_atomic_write_unit_max,
+ TP_PROTO(struct xfs_mount *mp, enum xfs_group_type type,
+ unsigned int max_write, unsigned int max_ioend,
+ unsigned int max_gsize, unsigned int awu_max),
+ TP_ARGS(mp, type, max_write, max_ioend, max_gsize, awu_max),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(enum xfs_group_type, type)
+ __field(unsigned int, max_write)
+ __field(unsigned int, max_ioend)
+ __field(unsigned int, max_gsize)
+ __field(unsigned int, awu_max)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->type = type;
+ __entry->max_write = max_write;
+ __entry->max_ioend = max_ioend;
+ __entry->max_gsize = max_gsize;
+ __entry->awu_max = awu_max;
+ ),
+ TP_printk("dev %d:%d %s max_write %u max_ioend %u max_gsize %u awu_max %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
+ __entry->max_write,
+ __entry->max_ioend,
+ __entry->max_gsize,
+ __entry->awu_max)
+);
+
+/*
+ * Records the mount's device and the caller-supplied per_intent, step_size,
+ * logres and blockcount values.
+ */
+TRACE_EVENT(xfs_calc_max_atomic_write_fsblocks,
+ TP_PROTO(struct xfs_mount *mp, unsigned int per_intent,
+ unsigned int step_size, unsigned int logres,
+ unsigned int blockcount),
+ TP_ARGS(mp, per_intent, step_size, logres, blockcount),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, per_intent)
+ __field(unsigned int, step_size)
+ __field(unsigned int, logres)
+ __field(unsigned int, blockcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->per_intent = per_intent;
+ __entry->step_size = step_size;
+ __entry->logres = logres;
+ __entry->blockcount = blockcount;
+ ),
+ TP_printk("dev %d:%d per_intent %u step_size %u logres %u blockcount %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->per_intent,
+ __entry->step_size,
+ __entry->logres,
+ __entry->blockcount)
+);
+
+/*
+ * Records the mount's device and the caller-supplied per_intent, step_size,
+ * blockcount, min_logblocks and logres values.  cur_logblocks is not an
+ * argument: it is sampled from mp->m_sb.sb_logblocks and printed under the
+ * "logblocks" label.
+ */
+TRACE_EVENT(xfs_calc_max_atomic_write_log_geometry,
+ TP_PROTO(struct xfs_mount *mp, unsigned int per_intent,
+ unsigned int step_size, unsigned int blockcount,
+ unsigned int min_logblocks, unsigned int logres),
+ TP_ARGS(mp, per_intent, step_size, blockcount, min_logblocks, logres),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, per_intent)
+ __field(unsigned int, step_size)
+ __field(unsigned int, blockcount)
+ __field(unsigned int, min_logblocks)
+ __field(unsigned int, cur_logblocks)
+ __field(unsigned int, logres)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->per_intent = per_intent;
+ __entry->step_size = step_size;
+ __entry->blockcount = blockcount;
+ __entry->min_logblocks = min_logblocks;
+ __entry->cur_logblocks = mp->m_sb.sb_logblocks;
+ __entry->logres = logres;
+ ),
+ TP_printk("dev %d:%d per_intent %u step_size %u blockcount %u min_logblocks %u logblocks %u logres %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->per_intent,
+ __entry->step_size,
+ __entry->blockcount,
+ __entry->min_logblocks,
+ __entry->cur_logblocks,
+ __entry->logres)
+);
+
TRACE_EVENT(xlog_intent_recovery_failed,
TP_PROTO(struct xfs_mount *mp, const struct xfs_defer_op_type *ops,
int error),
@@ -335,8 +425,8 @@ DECLARE_EVENT_CLASS(xfs_zone_alloc_class,
__field(dev_t, dev)
__field(xfs_rgnumber_t, rgno)
__field(xfs_rgblock_t, used)
+ __field(xfs_rgblock_t, allocated)
__field(xfs_rgblock_t, written)
- __field(xfs_rgblock_t, write_pointer)
__field(xfs_rgblock_t, rgbno)
__field(xfs_extlen_t, len)
),
@@ -344,17 +434,17 @@ DECLARE_EVENT_CLASS(xfs_zone_alloc_class,
__entry->dev = rtg_mount(oz->oz_rtg)->m_super->s_dev;
__entry->rgno = rtg_rgno(oz->oz_rtg);
__entry->used = rtg_rmap(oz->oz_rtg)->i_used_blocks;
+ __entry->allocated = oz->oz_allocated;
__entry->written = oz->oz_written;
- __entry->write_pointer = oz->oz_write_pointer;
__entry->rgbno = rgbno;
__entry->len = len;
),
- TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x rgbno 0x%x len 0x%x",
+ TP_printk("dev %d:%d rgno 0x%x used 0x%x alloced 0x%x written 0x%x rgbno 0x%x len 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->rgno,
__entry->used,
+ __entry->allocated,
__entry->written,
- __entry->write_pointer,
__entry->rgbno,
__entry->len)
);
@@ -365,6 +455,7 @@ DEFINE_EVENT(xfs_zone_alloc_class, name, \
xfs_extlen_t len), \
TP_ARGS(oz, rgbno, len))
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks);
+DEFINE_ZONE_ALLOC_EVENT(xfs_zone_skip_blocks);
DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks);
TRACE_EVENT(xfs_zone_gc_select_victim,
@@ -685,7 +776,6 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done);
DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
DEFINE_BUF_EVENT(xfs_buf_delwri_queued);
DEFINE_BUF_EVENT(xfs_buf_delwri_split);
-DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf);
DEFINE_BUF_EVENT(xfs_buf_get_uncached);
DEFINE_BUF_EVENT(xfs_buf_item_relse);
DEFINE_BUF_EVENT(xfs_buf_iodone_async);
@@ -990,7 +1080,9 @@ DEFINE_INODE_EVENT(xfs_get_acl);
#endif
DEFINE_INODE_EVENT(xfs_vm_bmap);
DEFINE_INODE_EVENT(xfs_file_ioctl);
+#ifdef CONFIG_COMPAT
DEFINE_INODE_EVENT(xfs_file_compat_ioctl);
+#endif
DEFINE_INODE_EVENT(xfs_ioctl_setattr);
DEFINE_INODE_EVENT(xfs_dir_fsync);
DEFINE_INODE_EVENT(xfs_file_fsync);
@@ -1054,20 +1146,23 @@ DECLARE_EVENT_CLASS(xfs_iref_class,
__field(xfs_ino_t, ino)
__field(int, count)
__field(int, pincount)
+ __field(unsigned long, iflags)
__field(unsigned long, caller_ip)
),
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
- __entry->count = atomic_read(&VFS_I(ip)->i_count);
+ __entry->count = icount_read(VFS_I(ip));
__entry->pincount = atomic_read(&ip->i_pincount);
+ __entry->iflags = ip->i_flags;
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pS",
+ TP_printk("dev %d:%d ino 0x%llx count %d pincount %d iflags 0x%lx caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->count,
__entry->pincount,
+ __entry->iflags,
(char *)__entry->caller_ip)
)
@@ -1157,6 +1252,8 @@ DEFINE_IREF_EVENT(xfs_irele);
DEFINE_IREF_EVENT(xfs_inode_pin);
DEFINE_IREF_EVENT(xfs_inode_unpin);
DEFINE_IREF_EVENT(xfs_inode_unpin_nowait);
+DEFINE_IREF_EVENT(xfs_inode_push_pinned);
+DEFINE_IREF_EVENT(xfs_inode_push_stale);
DECLARE_EVENT_CLASS(xfs_namespace_class,
TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name),
@@ -1253,7 +1350,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class,
__entry->id = dqp->q_id;
__entry->type = dqp->q_type;
__entry->flags = dqp->q_flags;
- __entry->nrefs = dqp->q_nrefs;
+ __entry->nrefs = data_race(dqp->q_lockref.count);
__entry->res_bcount = dqp->q_blk.reserved;
__entry->res_rtbcount = dqp->q_rtb.reserved;
@@ -1300,10 +1397,8 @@ DEFINE_EVENT(xfs_dquot_class, name, \
TP_ARGS(dqp))
DEFINE_DQUOT_EVENT(xfs_dqadjust);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
-DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_busy);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_done);
-DEFINE_DQUOT_EVENT(xfs_dqattach_found);
DEFINE_DQUOT_EVENT(xfs_dqattach_get);
DEFINE_DQUOT_EVENT(xfs_dqalloc);
DEFINE_DQUOT_EVENT(xfs_dqtobp_read);
@@ -1313,9 +1408,8 @@ DEFINE_DQUOT_EVENT(xfs_dqget_hit);
DEFINE_DQUOT_EVENT(xfs_dqget_miss);
DEFINE_DQUOT_EVENT(xfs_dqget_freeing);
DEFINE_DQUOT_EVENT(xfs_dqget_dup);
-DEFINE_DQUOT_EVENT(xfs_dqput);
-DEFINE_DQUOT_EVENT(xfs_dqput_free);
DEFINE_DQUOT_EVENT(xfs_dqrele);
+DEFINE_DQUOT_EVENT(xfs_dqrele_free);
DEFINE_DQUOT_EVENT(xfs_dqflush);
DEFINE_DQUOT_EVENT(xfs_dqflush_force);
DEFINE_DQUOT_EVENT(xfs_dqflush_done);
@@ -1505,7 +1599,6 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant);
DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_sub);
DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait);
-DEFINE_LOGGRANT_EVENT(xfs_log_cil_return);
DECLARE_EVENT_CLASS(xfs_log_item_class,
TP_PROTO(struct xfs_log_item *lip),
@@ -1561,6 +1654,8 @@ DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing);
DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_mark);
DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_skip);
DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_unpin);
+DEFINE_LOG_ITEM_EVENT(xlog_ail_insert_abort);
+DEFINE_LOG_ITEM_EVENT(xfs_trans_free_abort);
DECLARE_EVENT_CLASS(xfs_ail_class,
TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn),
@@ -1657,6 +1752,28 @@ DEFINE_RW_EVENT(xfs_file_direct_write);
DEFINE_RW_EVENT(xfs_file_dax_write);
DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write);
+TRACE_EVENT(xfs_iomap_atomic_write_cow,
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
+ TP_ARGS(ip, offset, count),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_off_t, offset)
+ __field(ssize_t, count)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->offset = offset;
+ __entry->count = count;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx pos 0x%llx bytecount 0x%zx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->offset,
+ __entry->count)
+)
+
DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
int whichfork, struct xfs_bmbt_irec *irec),
@@ -1744,8 +1861,6 @@ DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof);
DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
-DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten);
-DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append);
DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read);
DEFINE_SIMPLE_IO_EVENT(xfs_zoned_map_blocks);
@@ -1778,31 +1893,6 @@ DEFINE_EVENT(xfs_itrunc_class, name, \
DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start);
DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end);
-TRACE_EVENT(xfs_pagecache_inval,
- TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish),
- TP_ARGS(ip, start, finish),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_ino_t, ino)
- __field(xfs_fsize_t, size)
- __field(xfs_off_t, start)
- __field(xfs_off_t, finish)
- ),
- TP_fast_assign(
- __entry->dev = VFS_I(ip)->i_sb->s_dev;
- __entry->ino = ip->i_ino;
- __entry->size = ip->i_disk_size;
- __entry->start = start;
- __entry->finish = finish;
- ),
- TP_printk("dev %d:%d ino 0x%llx disize 0x%llx start 0x%llx finish 0x%llx",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
- __entry->size,
- __entry->start,
- __entry->finish)
-);
-
TRACE_EVENT(xfs_bunmap,
TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t fileoff, xfs_filblks_t len,
int flags, unsigned long caller_ip),
@@ -2148,14 +2238,12 @@ DEFINE_EVENT(xfs_alloc_class, name, \
DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
DEFINE_ALLOC_EVENT(xfs_alloc_cur);
DEFINE_ALLOC_EVENT(xfs_alloc_cur_right);
DEFINE_ALLOC_EVENT(xfs_alloc_cur_left);
DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup);
DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup_done);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_error);
DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry);
DEFINE_ALLOC_EVENT(xfs_alloc_near_busy);
DEFINE_ALLOC_EVENT(xfs_alloc_size_neither);
@@ -2350,13 +2438,8 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall);
DEFINE_ATTR_EVENT(xfs_attr_node_addname);
DEFINE_ATTR_EVENT(xfs_attr_node_get);
DEFINE_ATTR_EVENT(xfs_attr_node_replace);
-DEFINE_ATTR_EVENT(xfs_attr_node_removename);
-
-DEFINE_ATTR_EVENT(xfs_attr_fillstate);
-DEFINE_ATTR_EVENT(xfs_attr_refillstate);
DEFINE_ATTR_EVENT(xfs_attr_rmtval_get);
-DEFINE_ATTR_EVENT(xfs_attr_rmtval_set);
#define DEFINE_DA_EVENT(name) \
DEFINE_EVENT(xfs_da_class, name, \
@@ -2774,7 +2857,6 @@ DEFINE_EVENT(xfs_rtdiscard_class, name, \
TP_ARGS(mp, rtbno, len))
DEFINE_RTDISCARD_EVENT(xfs_discard_rtextent);
DEFINE_RTDISCARD_EVENT(xfs_discard_rttoosmall);
-DEFINE_RTDISCARD_EVENT(xfs_discard_rtrelax);
DECLARE_EVENT_CLASS(xfs_btree_cur_class,
TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp),
@@ -4091,36 +4173,6 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error);
DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_src);
DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_dest);
-/* dedupe tracepoints */
-DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_compare_extents_error);
-
-/* ioctl tracepoints */
-TRACE_EVENT(xfs_ioctl_clone,
- TP_PROTO(struct inode *src, struct inode *dest),
- TP_ARGS(src, dest),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(unsigned long, src_ino)
- __field(loff_t, src_isize)
- __field(unsigned long, dest_ino)
- __field(loff_t, dest_isize)
- ),
- TP_fast_assign(
- __entry->dev = src->i_sb->s_dev;
- __entry->src_ino = src->i_ino;
- __entry->src_isize = i_size_read(src);
- __entry->dest_ino = dest->i_ino;
- __entry->dest_isize = i_size_read(dest);
- ),
- TP_printk("dev %d:%d ino 0x%lx isize 0x%llx -> ino 0x%lx isize 0x%llx",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->src_ino,
- __entry->src_isize,
- __entry->dest_ino,
- __entry->dest_isize)
-);
-
/* unshare tracepoints */
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
@@ -4128,7 +4180,6 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
/* copy on write */
DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
-DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
@@ -4881,7 +4932,7 @@ DECLARE_EVENT_CLASS(xlog_iclog_class,
__entry->refcount = atomic_read(&iclog->ic_refcnt);
__entry->offset = iclog->ic_offset;
__entry->flags = iclog->ic_flags;
- __entry->lsn = be64_to_cpu(iclog->ic_header.h_lsn);
+ __entry->lsn = be64_to_cpu(iclog->ic_header->h_lsn);
__entry->caller_ip = caller_ip;
),
TP_printk("dev %d:%d state %s refcnt %d offset %u lsn 0x%llx flags %s caller %pS",
@@ -4913,7 +4964,6 @@ DEFINE_ICLOG_EVENT(xlog_iclog_switch);
DEFINE_ICLOG_EVENT(xlog_iclog_sync);
DEFINE_ICLOG_EVENT(xlog_iclog_syncing);
DEFINE_ICLOG_EVENT(xlog_iclog_sync_done);
-DEFINE_ICLOG_EVENT(xlog_iclog_want_sync);
DEFINE_ICLOG_EVENT(xlog_iclog_wait_on);
DEFINE_ICLOG_EVENT(xlog_iclog_write);
@@ -4962,7 +5012,6 @@ DEFINE_DAS_STATE_EVENT(xfs_attr_sf_addname_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_set_iter_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_leaf_addname_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_node_addname_return);
-DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc);
DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index c6657072361a..474f5a04ec63 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -134,18 +134,14 @@ xfs_trans_dup(
}
/*
- * This is called to reserve free disk blocks and log space for the
- * given transaction. This must be done before allocating any resources
- * within the transaction.
+ * This is called to reserve free disk blocks and log space for the given
+ * transaction before allocating any resources within the transaction.
*
* This will return ENOSPC if there are not enough blocks available.
* It will sleep waiting for available log space.
- * The only valid value for the flags parameter is XFS_RES_LOG_PERM, which
- * is used by long running transactions. If any one of the reservations
- * fails then they will all be backed out.
*
- * This does not do quota reservations. That typically is done by the
- * caller afterwards.
+ * This does not do quota reservations. That typically is done by the caller
+ * afterwards.
*/
static int
xfs_trans_reserve(
@@ -158,10 +154,12 @@ xfs_trans_reserve(
int error = 0;
bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+ ASSERT(resp->tr_logres > 0);
+
/*
- * Attempt to reserve the needed disk blocks by decrementing
- * the number needed from the number available. This will
- * fail if the count would go below zero.
+ * Attempt to reserve the needed disk blocks by decrementing the number
+ * needed from the number available. This will fail if the count would
+ * go below zero.
*/
if (blocks > 0) {
error = xfs_dec_fdblocks(mp, blocks, rsvd);
@@ -173,42 +171,20 @@ xfs_trans_reserve(
/*
* Reserve the log space needed for this transaction.
*/
- if (resp->tr_logres > 0) {
- bool permanent = false;
-
- ASSERT(tp->t_log_res == 0 ||
- tp->t_log_res == resp->tr_logres);
- ASSERT(tp->t_log_count == 0 ||
- tp->t_log_count == resp->tr_logcount);
-
- if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
- tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
- permanent = true;
- } else {
- ASSERT(tp->t_ticket == NULL);
- ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES));
- }
-
- if (tp->t_ticket != NULL) {
- ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
- error = xfs_log_regrant(mp, tp->t_ticket);
- } else {
- error = xfs_log_reserve(mp, resp->tr_logres,
- resp->tr_logcount,
- &tp->t_ticket, permanent);
- }
-
- if (error)
- goto undo_blocks;
+ if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES)
+ tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
+ error = xfs_log_reserve(mp, resp->tr_logres, resp->tr_logcount,
+ &tp->t_ticket, (tp->t_flags & XFS_TRANS_PERM_LOG_RES));
+ if (error)
+ goto undo_blocks;
- tp->t_log_res = resp->tr_logres;
- tp->t_log_count = resp->tr_logcount;
- }
+ tp->t_log_res = resp->tr_logres;
+ tp->t_log_count = resp->tr_logcount;
/*
- * Attempt to reserve the needed realtime extents by decrementing
- * the number needed from the number available. This will
- * fail if the count would go below zero.
+ * Attempt to reserve the needed realtime extents by decrementing the
+ * number needed from the number available. This will fail if the
+ * count would go below zero.
*/
if (rtextents > 0) {
error = xfs_dec_frextents(mp, rtextents);
@@ -221,18 +197,11 @@ xfs_trans_reserve(
return 0;
- /*
- * Error cases jump to one of these labels to undo any
- * reservations which have already been performed.
- */
undo_log:
- if (resp->tr_logres > 0) {
- xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
- tp->t_ticket = NULL;
- tp->t_log_res = 0;
- tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
- }
-
+ xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
+ tp->t_ticket = NULL;
+ tp->t_log_res = 0;
+ tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
undo_blocks:
if (blocks > 0) {
xfs_add_fdblocks(mp, blocks);
@@ -241,6 +210,28 @@ undo_blocks:
return error;
}
+static struct xfs_trans *
+__xfs_trans_alloc(
+ struct xfs_mount *mp,
+ uint flags)
+{
+ struct xfs_trans *tp;
+
+ ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) || xfs_has_lazysbcount(mp));
+
+ tp = kmem_cache_zalloc(xfs_trans_cache, GFP_KERNEL | __GFP_NOFAIL);
+ if (!(flags & XFS_TRANS_NO_WRITECOUNT))
+ sb_start_intwrite(mp->m_super);
+ xfs_trans_set_context(tp);
+ tp->t_flags = flags;
+ tp->t_mountp = mp;
+ INIT_LIST_HEAD(&tp->t_items);
+ INIT_LIST_HEAD(&tp->t_busy);
+ INIT_LIST_HEAD(&tp->t_dfops);
+ tp->t_highest_agno = NULLAGNUMBER;
+ return tp;
+}
+
int
xfs_trans_alloc(
struct xfs_mount *mp,
@@ -254,33 +245,16 @@ xfs_trans_alloc(
bool want_retry = true;
int error;
+ ASSERT(resp->tr_logres > 0);
+
/*
* Allocate the handle before we do our freeze accounting and setting up
* GFP_NOFS allocation context so that we avoid lockdep false positives
* by doing GFP_KERNEL allocations inside sb_start_intwrite().
*/
retry:
- tp = kmem_cache_zalloc(xfs_trans_cache, GFP_KERNEL | __GFP_NOFAIL);
- if (!(flags & XFS_TRANS_NO_WRITECOUNT))
- sb_start_intwrite(mp->m_super);
- xfs_trans_set_context(tp);
-
- /*
- * Zero-reservation ("empty") transactions can't modify anything, so
- * they're allowed to run while we're frozen.
- */
- WARN_ON(resp->tr_logres > 0 &&
- mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
- ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) ||
- xfs_has_lazysbcount(mp));
-
- tp->t_flags = flags;
- tp->t_mountp = mp;
- INIT_LIST_HEAD(&tp->t_items);
- INIT_LIST_HEAD(&tp->t_busy);
- INIT_LIST_HEAD(&tp->t_dfops);
- tp->t_highest_agno = NULLAGNUMBER;
-
+ tp = __xfs_trans_alloc(mp, flags);
+ WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
error = xfs_trans_reserve(tp, resp, blocks, rtextents);
if (error == -ENOSPC && want_retry) {
xfs_trans_cancel(tp);
@@ -324,14 +298,11 @@ retry:
* where we can be grabbing buffers at the same time that freeze is trying to
* drain the buffer LRU list.
*/
-int
+struct xfs_trans *
xfs_trans_alloc_empty(
- struct xfs_mount *mp,
- struct xfs_trans **tpp)
+ struct xfs_mount *mp)
{
- struct xfs_trans_res resv = {0};
-
- return xfs_trans_alloc(mp, &resv, 0, 0, XFS_TRANS_NO_WRITECOUNT, tpp);
+ return __xfs_trans_alloc(mp, XFS_TRANS_NO_WRITECOUNT);
}
/*
@@ -481,19 +452,17 @@ xfs_trans_mod_sb(
*/
STATIC void
xfs_trans_apply_sb_deltas(
- xfs_trans_t *tp)
+ struct xfs_trans *tp)
{
- struct xfs_dsb *sbp;
- struct xfs_buf *bp;
- int whole = 0;
-
- bp = xfs_trans_getsb(tp);
- sbp = bp->b_addr;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_buf *bp = xfs_trans_getsb(tp);
+ struct xfs_dsb *sbp = bp->b_addr;
+ int whole = 0;
/*
* Only update the superblock counters if we are logging them
*/
- if (!xfs_has_lazysbcount((tp->t_mountp))) {
+ if (!xfs_has_lazysbcount(mp)) {
if (tp->t_icount_delta)
be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta);
if (tp->t_ifree_delta)
@@ -520,8 +489,7 @@ xfs_trans_apply_sb_deltas(
* write the correct value ondisk.
*/
if ((tp->t_frextents_delta || tp->t_res_frextents_delta) &&
- !xfs_has_rtgroups(tp->t_mountp)) {
- struct xfs_mount *mp = tp->t_mountp;
+ !xfs_has_rtgroups(mp)) {
int64_t rtxdelta;
rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta;
@@ -534,6 +502,8 @@ xfs_trans_apply_sb_deltas(
if (tp->t_dblocks_delta) {
be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta);
+ mp->m_ddev_targp->bt_nr_sectors +=
+ XFS_FSB_TO_BB(mp, tp->t_dblocks_delta);
whole = 1;
}
if (tp->t_agcount_delta) {
@@ -553,7 +523,7 @@ xfs_trans_apply_sb_deltas(
* recompute the ondisk rtgroup block log. The incore values
* will be recomputed in xfs_trans_unreserve_and_mod_sb.
*/
- if (xfs_has_rtgroups(tp->t_mountp)) {
+ if (xfs_has_rtgroups(mp)) {
sbp->sb_rgblklog = xfs_compute_rgblklog(
be32_to_cpu(sbp->sb_rgextents),
be32_to_cpu(sbp->sb_rextsize));
@@ -566,6 +536,8 @@ xfs_trans_apply_sb_deltas(
}
if (tp->t_rblocks_delta) {
be64_add_cpu(&sbp->sb_rblocks, tp->t_rblocks_delta);
+ mp->m_rtdev_targp->bt_nr_sectors +=
+ XFS_FSB_TO_BB(mp, tp->t_rblocks_delta);
whole = 1;
}
if (tp->t_rextents_delta) {
@@ -742,8 +714,10 @@ xfs_trans_free_items(
list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
xfs_trans_del_item(lip);
- if (abort)
+ if (abort) {
+ trace_xfs_trans_free_abort(lip);
set_bit(XFS_LI_ABORTED, &lip->li_flags);
+ }
if (lip->li_ops->iop_release)
lip->li_ops->iop_release(lip);
}
@@ -1024,51 +998,57 @@ xfs_trans_cancel(
}
/*
- * Roll from one trans in the sequence of PERMANENT transactions to
- * the next: permanent transactions are only flushed out when
- * committed with xfs_trans_commit(), but we still want as soon
- * as possible to let chunks of it go to the log. So we commit the
- * chunk we've been working on and get a new transaction to continue.
+ * Roll from one trans in the sequence of PERMANENT transactions to the next:
+ * permanent transactions are only flushed out when committed with
+ * xfs_trans_commit(), but we still want as soon as possible to let chunks of it
+ * go to the log. So we commit the chunk we've been working on and get a new
+ * transaction to continue.
*/
int
xfs_trans_roll(
struct xfs_trans **tpp)
{
- struct xfs_trans *trans = *tpp;
- struct xfs_trans_res tres;
+ struct xfs_trans *tp = *tpp;
+ unsigned int log_res = tp->t_log_res;
+ unsigned int log_count = tp->t_log_count;
int error;
- trace_xfs_trans_roll(trans, _RET_IP_);
+ trace_xfs_trans_roll(tp, _RET_IP_);
+
+ ASSERT(log_res > 0);
/*
* Copy the critical parameters from one trans to the next.
*/
- tres.tr_logres = trans->t_log_res;
- tres.tr_logcount = trans->t_log_count;
-
- *tpp = xfs_trans_dup(trans);
+ *tpp = xfs_trans_dup(tp);
/*
* Commit the current transaction.
- * If this commit failed, then it'd just unlock those items that
- * are not marked ihold. That also means that a filesystem shutdown
- * is in progress. The caller takes the responsibility to cancel
- * the duplicate transaction that gets returned.
+ *
+ * If this commit failed, then it'd just unlock those items that are not
+ * marked ihold. That also means that a filesystem shutdown is in
+ * progress. The caller takes the responsibility to cancel the
+ * duplicate transaction that gets returned.
*/
- error = __xfs_trans_commit(trans, true);
+ error = __xfs_trans_commit(tp, true);
if (error)
return error;
/*
* Reserve space in the log for the next transaction.
- * This also pushes items in the "AIL", the list of logged items,
- * out to disk if they are taking up space at the tail of the log
- * that we want to use. This requires that either nothing be locked
- * across this call, or that anything that is locked be logged in
- * the prior and the next transactions.
+ *
+ * This also pushes items in the AIL out to disk if they are taking up
+ * space at the tail of the log that we want to use. This requires that
+ * either nothing be locked across this call, or that anything that is
+ * locked be logged in the prior and the next transactions.
*/
- tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
- return xfs_trans_reserve(*tpp, &tres, 0, 0);
+ tp = *tpp;
+ error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
+ if (error)
+ return error;
+ tp->t_log_res = log_res;
+ tp->t_log_count = log_count;
+ return 0;
}
/*
@@ -1144,9 +1124,18 @@ xfs_trans_reserve_more(
unsigned int blocks,
unsigned int rtextents)
{
- struct xfs_trans_res resv = { };
-
- return xfs_trans_reserve(tp, &resv, blocks, rtextents);
+ bool rsvd = tp->t_flags & XFS_TRANS_RESERVE;
+
+ if (blocks && xfs_dec_fdblocks(tp->t_mountp, blocks, rsvd))
+ return -ENOSPC;
+ if (rtextents && xfs_dec_frextents(tp->t_mountp, rtextents)) {
+ if (blocks)
+ xfs_add_fdblocks(tp->t_mountp, blocks);
+ return -ENOSPC;
+ }
+ tp->t_blk_res += blocks;
+ tp->t_rtx_res += rtextents;
+ return 0;
}
/*
@@ -1161,14 +1150,13 @@ xfs_trans_reserve_more_inode(
unsigned int rblocks,
bool force_quota)
{
- struct xfs_trans_res resv = { };
struct xfs_mount *mp = ip->i_mount;
unsigned int rtx = xfs_extlen_to_rtxlen(mp, rblocks);
int error;
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve(tp, &resv, dblocks, rtx);
+ error = xfs_trans_reserve_more(tp, dblocks, rtx);
if (error)
return error;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 2b366851e9a4..7fb860f645a3 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -15,7 +15,6 @@ struct xfs_efd_log_item;
struct xfs_efi_log_item;
struct xfs_inode;
struct xfs_item_ops;
-struct xfs_log_iovec;
struct xfs_mount;
struct xfs_trans;
struct xfs_trans_res;
@@ -168,8 +167,7 @@ int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp,
struct xfs_trans **tpp);
int xfs_trans_reserve_more(struct xfs_trans *tp,
unsigned int blocks, unsigned int rtextents);
-int xfs_trans_alloc_empty(struct xfs_mount *mp,
- struct xfs_trans **tpp);
+struct xfs_trans *xfs_trans_alloc_empty(struct xfs_mount *mp);
void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
int xfs_trans_get_buf_map(struct xfs_trans *tp, struct xfs_buftarg *target,
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 0fcb1828e598..38983c6777df 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -315,7 +315,7 @@ xfs_ail_splice(
}
/*
- * Delete the given item from the AIL. Return a pointer to the item.
+ * Delete the given item from the AIL.
*/
static void
xfs_ail_delete(
@@ -374,7 +374,7 @@ xfsaild_push_item(
* If log item pinning is enabled, skip the push and track the item as
* pinned. This can help induce head-behind-tail conditions.
*/
- if (XFS_TEST_ERROR(false, ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN))
+ if (XFS_TEST_ERROR(ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN))
return XFS_ITEM_PINNED;
/*
@@ -777,26 +777,28 @@ xfs_ail_update_finish(
}
/*
- * xfs_trans_ail_update - bulk AIL insertion operation.
+ * xfs_trans_ail_update_bulk - bulk AIL insertion operation.
*
- * @xfs_trans_ail_update takes an array of log items that all need to be
+ * @xfs_trans_ail_update_bulk takes an array of log items that all need to be
* positioned at the same LSN in the AIL. If an item is not in the AIL, it will
- * be added. Otherwise, it will be repositioned by removing it and re-adding
- * it to the AIL. If we move the first item in the AIL, update the log tail to
- * match the new minimum LSN in the AIL.
+ * be added. Otherwise, it will be repositioned by removing it and re-adding
+ * it to the AIL.
*
- * This function takes the AIL lock once to execute the update operations on
- * all the items in the array, and as such should not be called with the AIL
- * lock held. As a result, once we have the AIL lock, we need to check each log
- * item LSN to confirm it needs to be moved forward in the AIL.
+ * If we move the first item in the AIL, update the log tail to match the new
+ * minimum LSN in the AIL.
*
- * To optimise the insert operation, we delete all the items from the AIL in
- * the first pass, moving them into a temporary list, then splice the temporary
- * list into the correct position in the AIL. This avoids needing to do an
- * insert operation on every item.
+ * This function should be called with the AIL lock held.
*
- * This function must be called with the AIL lock held. The lock is dropped
- * before returning.
+ * To optimise the insert operation, we add all items to a temporary list, then
+ * splice this list into the correct position in the AIL.
+ *
+ * Items that are already in the AIL are first deleted from their current
+ * location before being added to the temporary list.
+ *
+ * This avoids needing to do an insert operation on every item.
+ *
+ * The AIL lock is dropped by xfs_ail_update_finish() before returning to
+ * the caller.
*/
void
xfs_trans_ail_update_bulk(
@@ -909,10 +911,9 @@ xfs_trans_ail_delete(
return;
}
- /* xfs_ail_update_finish() drops the AIL lock */
- xfs_clear_li_failed(lip);
+ clear_bit(XFS_LI_FAILED, &lip->li_flags);
tail_lsn = xfs_ail_delete_one(ailp, lip);
- xfs_ail_update_finish(ailp, tail_lsn);
+ xfs_ail_update_finish(ailp, tail_lsn); /* drops the AIL lock */
}
int
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 765456bf3428..c842ce06acd6 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -393,7 +393,7 @@ xfs_trans_dqlockedjoin(
unsigned int i;
ASSERT(q[0].qt_dquot != NULL);
if (q[1].qt_dquot == NULL) {
- xfs_dqlock(q[0].qt_dquot);
+ mutex_lock(&q[0].qt_dquot->q_qlock);
xfs_trans_dqjoin(tp, q[0].qt_dquot);
} else if (q[2].qt_dquot == NULL) {
xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot);
@@ -693,7 +693,7 @@ xfs_trans_unreserve_and_mod_dquots(
locked = already_locked;
if (qtrx->qt_blk_res) {
if (!locked) {
- xfs_dqlock(dqp);
+ mutex_lock(&dqp->q_qlock);
locked = true;
}
dqp->q_blk.reserved -=
@@ -701,7 +701,7 @@ xfs_trans_unreserve_and_mod_dquots(
}
if (qtrx->qt_ino_res) {
if (!locked) {
- xfs_dqlock(dqp);
+ mutex_lock(&dqp->q_qlock);
locked = true;
}
dqp->q_ino.reserved -=
@@ -710,14 +710,14 @@ xfs_trans_unreserve_and_mod_dquots(
if (qtrx->qt_rtblk_res) {
if (!locked) {
- xfs_dqlock(dqp);
+ mutex_lock(&dqp->q_qlock);
locked = true;
}
dqp->q_rtb.reserved -=
(xfs_qcnt_t)qtrx->qt_rtblk_res;
}
if (locked && !already_locked)
- xfs_dqunlock(dqp);
+ mutex_unlock(&dqp->q_qlock);
}
}
@@ -820,7 +820,7 @@ xfs_trans_dqresv(
struct xfs_dquot_res *blkres;
struct xfs_quota_limits *qlim;
- xfs_dqlock(dqp);
+ mutex_lock(&dqp->q_qlock);
defq = xfs_get_defquota(q, xfs_dquot_type(dqp));
@@ -887,16 +887,16 @@ xfs_trans_dqresv(
XFS_IS_CORRUPT(mp, dqp->q_ino.reserved < dqp->q_ino.count))
goto error_corrupt;
- xfs_dqunlock(dqp);
+ mutex_unlock(&dqp->q_qlock);
return 0;
error_return:
- xfs_dqunlock(dqp);
+ mutex_unlock(&dqp->q_qlock);
if (xfs_dquot_type(dqp) == XFS_DQTYPE_PROJ)
return -ENOSPC;
return -EDQUOT;
error_corrupt:
- xfs_dqunlock(dqp);
+ mutex_unlock(&dqp->q_qlock);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
xfs_fs_mark_sick(mp, XFS_SICK_FS_QUOTACHECK);
return -EFSCORRUPTED;
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index bd841df93021..f945f0450b16 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -167,32 +167,4 @@ xfs_trans_ail_copy_lsn(
}
#endif
-static inline void
-xfs_clear_li_failed(
- struct xfs_log_item *lip)
-{
- struct xfs_buf *bp = lip->li_buf;
-
- ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags));
- lockdep_assert_held(&lip->li_ailp->ail_lock);
-
- if (test_and_clear_bit(XFS_LI_FAILED, &lip->li_flags)) {
- lip->li_buf = NULL;
- xfs_buf_rele(bp);
- }
-}
-
-static inline void
-xfs_set_li_failed(
- struct xfs_log_item *lip,
- struct xfs_buf *bp)
-{
- lockdep_assert_held(&lip->li_ailp->ail_lock);
-
- if (!test_and_set_bit(XFS_LI_FAILED, &lip->li_flags)) {
- xfs_buf_hold(bp);
- lip->li_buf = bp;
- }
-}
-
#endif /* __XFS_TRANS_PRIV_H__ */
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 0f641a9091ec..ac5cecec9aa1 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -243,7 +243,7 @@ __xfs_xattr_put_listent(
offset = context->buffer + context->count;
memcpy(offset, prefix, prefix_len);
offset += prefix_len;
- strncpy(offset, (char *)name, namelen); /* real name */
+ memcpy(offset, (char *)name, namelen); /* real name */
offset += namelen;
*offset = '\0';
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index 52af234936a2..bbcf21704ea0 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -24,15 +24,24 @@
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
#include "xfs_trace.h"
+#include "xfs_mru_cache.h"
+
+static void
+xfs_open_zone_free_rcu(
+ struct callback_head *cb)
+{
+ struct xfs_open_zone *oz = container_of(cb, typeof(*oz), oz_rcu);
+
+ xfs_rtgroup_rele(oz->oz_rtg);
+ kfree(oz);
+}
void
xfs_open_zone_put(
struct xfs_open_zone *oz)
{
- if (atomic_dec_and_test(&oz->oz_ref)) {
- xfs_rtgroup_rele(oz->oz_rtg);
- kfree(oz);
- }
+ if (atomic_dec_and_test(&oz->oz_ref))
+ call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu);
}
static inline uint32_t
@@ -94,9 +103,6 @@ xfs_zone_account_reclaimable(
*/
trace_xfs_zone_emptied(rtg);
- if (!was_full)
- xfs_group_clear_mark(xg, XFS_RTG_RECLAIMABLE);
-
spin_lock(&zi->zi_used_buckets_lock);
if (!was_full)
xfs_zone_remove_from_bucket(zi, rgno, from_bucket);
@@ -118,7 +124,6 @@ xfs_zone_account_reclaimable(
xfs_zone_add_to_bucket(zi, rgno, to_bucket);
spin_unlock(&zi->zi_used_buckets_lock);
- xfs_group_set_mark(xg, XFS_RTG_RECLAIMABLE);
if (zi->zi_gc_thread && xfs_zoned_need_gc(mp))
wake_up_process(zi->zi_gc_thread);
} else if (to_bucket != from_bucket) {
@@ -133,6 +138,28 @@ xfs_zone_account_reclaimable(
}
}
+/*
+ * Check if we have any zones that can be reclaimed by looking at the entry
+ * counters for the zone buckets.
+ */
+bool
+xfs_zoned_have_reclaimable(
+ struct xfs_zone_info *zi)
+{
+ int i;
+
+ spin_lock(&zi->zi_used_buckets_lock);
+ for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) {
+ if (zi->zi_used_bucket_entries[i]) {
+ spin_unlock(&zi->zi_used_buckets_lock);
+ return true;
+ }
+ }
+ spin_unlock(&zi->zi_used_buckets_lock);
+
+ return false;
+}
+
static void
xfs_open_zone_mark_full(
struct xfs_open_zone *oz)
@@ -165,10 +192,9 @@ xfs_open_zone_mark_full(
static void
xfs_zone_record_blocks(
struct xfs_trans *tp,
- xfs_fsblock_t fsbno,
- xfs_filblks_t len,
struct xfs_open_zone *oz,
- bool used)
+ xfs_fsblock_t fsbno,
+ xfs_filblks_t len)
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_rtgroup *rtg = oz->oz_rtg;
@@ -178,18 +204,37 @@ xfs_zone_record_blocks(
xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
- if (used) {
- rmapip->i_used_blocks += len;
- ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg));
- } else {
- xfs_add_frextents(mp, len);
- }
+ rmapip->i_used_blocks += len;
+ ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg));
oz->oz_written += len;
if (oz->oz_written == rtg_blocks(rtg))
xfs_open_zone_mark_full(oz);
xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
}
+/*
+ * Called for blocks that have been written to disk, but not actually linked to
+ * an inode, which can happen when garbage collection races with user data
+ * writes to a file.
+ */
+static void
+xfs_zone_skip_blocks(
+ struct xfs_open_zone *oz,
+ xfs_filblks_t len)
+{
+ struct xfs_rtgroup *rtg = oz->oz_rtg;
+
+ trace_xfs_zone_skip_blocks(oz, 0, len);
+
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ oz->oz_written += len;
+ if (oz->oz_written == rtg_blocks(rtg))
+ xfs_open_zone_mark_full(oz);
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+
+ xfs_add_frextents(rtg_mount(rtg), len);
+}
+
static int
xfs_zoned_map_extent(
struct xfs_trans *tp,
@@ -219,6 +264,14 @@ xfs_zoned_map_extent(
* If a data write raced with this GC write, keep the existing data in
* the data fork, mark our newly written GC extent as reclaimable, then
* move on to the next extent.
+ *
+ * Note that this can also happen when racing with operations that do
+ * not actually invalidate the data, but just move it to a different
+ * inode (XFS_IOC_EXCHANGE_RANGE), or to a different offset inside the
+ * inode (FALLOC_FL_COLLAPSE_RANGE / FALLOC_FL_INSERT_RANGE). If the
+ * data was just moved around, GC fails to free the zone, but the zone
+ * becomes a GC candidate again as soon as all previous GC I/O has
+ * finished and these blocks will be moved out eventually.
*/
if (old_startblock != NULLFSBLOCK &&
old_startblock != data.br_startblock)
@@ -249,8 +302,7 @@ xfs_zoned_map_extent(
}
}
- xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz,
- true);
+ xfs_zone_record_blocks(tp, oz, new->br_startblock, new->br_blockcount);
/* Map the new blocks into the data fork. */
xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new);
@@ -258,8 +310,7 @@ xfs_zoned_map_extent(
skip:
trace_xfs_reflink_cow_remap_skip(ip, new);
- xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz,
- false);
+ xfs_zone_skip_blocks(oz, new->br_blockcount);
return 0;
}
@@ -357,44 +408,6 @@ xfs_zone_free_blocks(
return 0;
}
-/*
- * Check if the zone containing the data just before the offset we are
- * writing to is still open and has space.
- */
-static struct xfs_open_zone *
-xfs_last_used_zone(
- struct iomap_ioend *ioend)
-{
- struct xfs_inode *ip = XFS_I(ioend->io_inode);
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset);
- struct xfs_rtgroup *rtg = NULL;
- struct xfs_open_zone *oz = NULL;
- struct xfs_iext_cursor icur;
- struct xfs_bmbt_irec got;
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb,
- &icur, &got)) {
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- return NULL;
- }
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock));
- if (!rtg)
- return NULL;
-
- xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_SHARED);
- oz = READ_ONCE(rtg->rtg_open_zone);
- if (oz && (oz->oz_is_gc || !atomic_inc_not_zero(&oz->oz_ref)))
- oz = NULL;
- xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_SHARED);
-
- xfs_rtgroup_rele(rtg);
- return oz;
-}
-
static struct xfs_group *
xfs_find_free_zone(
struct xfs_mount *mp,
@@ -433,7 +446,7 @@ xfs_init_open_zone(
spin_lock_init(&oz->oz_alloc_lock);
atomic_set(&oz->oz_ref, 1);
oz->oz_rtg = rtg;
- oz->oz_write_pointer = write_pointer;
+ oz->oz_allocated = write_pointer;
oz->oz_written = write_pointer;
oz->oz_write_hint = write_hint;
oz->oz_is_gc = is_gc;
@@ -514,64 +527,58 @@ xfs_try_open_zone(
return oz;
}
+enum xfs_zone_alloc_score {
+ /* Any open zone will do it, we're desperate */
+ XFS_ZONE_ALLOC_ANY = 0,
+
+ /* It better fit somehow */
+ XFS_ZONE_ALLOC_OK = 1,
+
+ /* Only reuse a zone if it fits really well. */
+ XFS_ZONE_ALLOC_GOOD = 2,
+};
+
/*
- * For data with short or medium lifetime, try to colocated it into an
- * already open zone with a matching temperature.
+ * Life time hint co-location matrix. Fields not set default to 0
+ * aka XFS_ZONE_ALLOC_ANY.
*/
-static bool
-xfs_colocate_eagerly(
- enum rw_hint file_hint)
-{
- switch (file_hint) {
- case WRITE_LIFE_MEDIUM:
- case WRITE_LIFE_SHORT:
- case WRITE_LIFE_NONE:
- return true;
- default:
- return false;
- }
-}
-
-static bool
-xfs_good_hint_match(
- struct xfs_open_zone *oz,
- enum rw_hint file_hint)
-{
- switch (oz->oz_write_hint) {
- case WRITE_LIFE_LONG:
- case WRITE_LIFE_EXTREME:
- /* colocate long and extreme */
- if (file_hint == WRITE_LIFE_LONG ||
- file_hint == WRITE_LIFE_EXTREME)
- return true;
- break;
- case WRITE_LIFE_MEDIUM:
- /* colocate medium with medium */
- if (file_hint == WRITE_LIFE_MEDIUM)
- return true;
- break;
- case WRITE_LIFE_SHORT:
- case WRITE_LIFE_NONE:
- case WRITE_LIFE_NOT_SET:
- /* colocate short and none */
- if (file_hint <= WRITE_LIFE_SHORT)
- return true;
- break;
- }
- return false;
-}
+static const unsigned int
+xfs_zoned_hint_score[WRITE_LIFE_HINT_NR][WRITE_LIFE_HINT_NR] = {
+ [WRITE_LIFE_NOT_SET] = {
+ [WRITE_LIFE_NOT_SET] = XFS_ZONE_ALLOC_OK,
+ },
+ [WRITE_LIFE_NONE] = {
+ [WRITE_LIFE_NONE] = XFS_ZONE_ALLOC_OK,
+ },
+ [WRITE_LIFE_SHORT] = {
+ [WRITE_LIFE_SHORT] = XFS_ZONE_ALLOC_GOOD,
+ },
+ [WRITE_LIFE_MEDIUM] = {
+ [WRITE_LIFE_MEDIUM] = XFS_ZONE_ALLOC_GOOD,
+ },
+ [WRITE_LIFE_LONG] = {
+ [WRITE_LIFE_LONG] = XFS_ZONE_ALLOC_OK,
+ [WRITE_LIFE_EXTREME] = XFS_ZONE_ALLOC_OK,
+ },
+ [WRITE_LIFE_EXTREME] = {
+ [WRITE_LIFE_LONG] = XFS_ZONE_ALLOC_OK,
+ [WRITE_LIFE_EXTREME] = XFS_ZONE_ALLOC_OK,
+ },
+};
static bool
xfs_try_use_zone(
struct xfs_zone_info *zi,
enum rw_hint file_hint,
struct xfs_open_zone *oz,
- bool lowspace)
+ unsigned int goodness)
{
- if (oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
+ if (oz->oz_allocated == rtg_blocks(oz->oz_rtg))
return false;
- if (!lowspace && !xfs_good_hint_match(oz, file_hint))
+
+ if (xfs_zoned_hint_score[oz->oz_write_hint][file_hint] < goodness)
return false;
+
if (!atomic_inc_not_zero(&oz->oz_ref))
return false;
@@ -602,14 +609,14 @@ static struct xfs_open_zone *
xfs_select_open_zone_lru(
struct xfs_zone_info *zi,
enum rw_hint file_hint,
- bool lowspace)
+ unsigned int goodness)
{
struct xfs_open_zone *oz;
lockdep_assert_held(&zi->zi_open_zones_lock);
list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
- if (xfs_try_use_zone(zi, file_hint, oz, lowspace))
+ if (xfs_try_use_zone(zi, file_hint, oz, goodness))
return oz;
cond_resched_lock(&zi->zi_open_zones_lock);
@@ -626,7 +633,7 @@ xfs_select_open_zone_mru(
lockdep_assert_held(&zi->zi_open_zones_lock);
list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry)
- if (xfs_try_use_zone(zi, file_hint, oz, false))
+ if (xfs_try_use_zone(zi, file_hint, oz, XFS_ZONE_ALLOC_OK))
return oz;
cond_resched_lock(&zi->zi_open_zones_lock);
@@ -641,25 +648,29 @@ static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip)
}
/*
- * Try to pack inodes that are written back after they were closed tight instead
- * of trying to open new zones for them or spread them to the least recently
- * used zone. This optimizes the data layout for workloads that untar or copy
- * a lot of small files. Right now this does not separate multiple such
+ * Try to tightly pack small files that are written back after they were closed
+ * instead of trying to open new zones for them or spread them to the least
+ * recently used zone. This optimizes the data layout for workloads that untar
+ * or copy a lot of small files. Right now this does not separate multiple such
* streams.
*/
static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip)
{
+ struct xfs_mount *mp = ip->i_mount;
+ size_t zone_capacity =
+ XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].blocks);
+
+ /*
+ * Do not pack writes to files that are already using a full zone, to
+ * avoid fragmentation.
+ */
+ if (i_size_read(VFS_I(ip)) >= zone_capacity)
+ return false;
+
return !inode_is_open_for_write(VFS_I(ip)) &&
!(ip->i_diflags & XFS_DIFLAG_APPEND);
}
-/*
- * Pick a new zone for writes.
- *
- * If we aren't using up our budget of open zones just open a new one from the
- * freelist. Else try to find one that matches the expected data lifetime. If
- * we don't find one that is good pick any zone that is available.
- */
static struct xfs_open_zone *
xfs_select_zone_nowait(
struct xfs_mount *mp,
@@ -679,31 +690,34 @@ xfs_select_zone_nowait(
* data.
*/
spin_lock(&zi->zi_open_zones_lock);
- if (xfs_colocate_eagerly(write_hint))
- oz = xfs_select_open_zone_lru(zi, write_hint, false);
- else if (pack_tight)
+ oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_GOOD);
+ if (oz)
+ goto out_unlock;
+
+ if (pack_tight)
oz = xfs_select_open_zone_mru(zi, write_hint);
if (oz)
goto out_unlock;
/*
- * See if we can open a new zone and use that.
+ * See if we can open a new zone and use that so that data for different
+ * files is mixed as little as possible.
*/
oz = xfs_try_open_zone(mp, write_hint);
if (oz)
goto out_unlock;
/*
- * Try to colocate cold data with other cold data if we failed to open a
- * new zone for it.
+ * Try to find a zone that is an ok match to colocate data with.
*/
- if (write_hint != WRITE_LIFE_NOT_SET &&
- !xfs_colocate_eagerly(write_hint))
- oz = xfs_select_open_zone_lru(zi, write_hint, false);
- if (!oz)
- oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false);
- if (!oz)
- oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true);
+ oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_OK);
+ if (oz)
+ goto out_unlock;
+
+ /*
+ * Pick the least recently used zone, regardless of hint match
+ */
+ oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_ANY);
out_unlock:
spin_unlock(&zi->zi_open_zones_lock);
return oz;
@@ -726,7 +740,7 @@ xfs_select_zone(
for (;;) {
prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE);
oz = xfs_select_zone_nowait(mp, write_hint, pack_tight);
- if (oz)
+ if (oz || xfs_is_shutdown(mp))
break;
schedule();
}
@@ -743,25 +757,25 @@ xfs_zone_alloc_blocks(
{
struct xfs_rtgroup *rtg = oz->oz_rtg;
struct xfs_mount *mp = rtg_mount(rtg);
- xfs_rgblock_t rgbno;
+ xfs_rgblock_t allocated;
spin_lock(&oz->oz_alloc_lock);
count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN,
- (xfs_filblks_t)rtg_blocks(rtg) - oz->oz_write_pointer);
+ (xfs_filblks_t)rtg_blocks(rtg) - oz->oz_allocated);
if (!count_fsb) {
spin_unlock(&oz->oz_alloc_lock);
return 0;
}
- rgbno = oz->oz_write_pointer;
- oz->oz_write_pointer += count_fsb;
+ allocated = oz->oz_allocated;
+ oz->oz_allocated += count_fsb;
spin_unlock(&oz->oz_alloc_lock);
- trace_xfs_zone_alloc_blocks(oz, rgbno, count_fsb);
+ trace_xfs_zone_alloc_blocks(oz, allocated, count_fsb);
*sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *sector);
if (!*is_seq)
- *sector += XFS_FSB_TO_BB(mp, rgbno);
+ *sector += XFS_FSB_TO_BB(mp, allocated);
return XFS_FSB_TO_B(mp, count_fsb);
}
@@ -776,6 +790,57 @@ xfs_mark_rtg_boundary(
ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
}
+/*
+ * Check if we have a cached last open zone available for the inode and
+ * if yes return a reference to it.
+ */
+static struct xfs_open_zone *
+xfs_get_cached_zone(
+ struct xfs_inode *ip)
+{
+ struct xfs_open_zone *oz;
+
+ rcu_read_lock();
+ oz = VFS_I(ip)->i_private;
+ if (oz) {
+ /*
+ * GC only steals open zones at mount time, so no GC zones
+ * should end up in the cache.
+ */
+ ASSERT(!oz->oz_is_gc);
+ if (!atomic_inc_not_zero(&oz->oz_ref))
+ oz = NULL;
+ }
+ rcu_read_unlock();
+
+ return oz;
+}
+
+/*
+ * Stash our zone in the inode so that it is reused for future allocations.
+ *
+ * The open_zone structure will be pinned until either the inode is freed or
+ * until the cached open zone is replaced with a different one because the
+ * current one was full when we tried to use it. This means we keep any
+ * open zone around forever as long as any inode that used it for the last
+ * write is cached, which slightly increases the memory use of cached inodes
+ * that were ever written to, but significantly simplifies the cached zone
+ * lookup. Because the open_zone is clearly marked as full when all data
+ * in the underlying RTG was written, the caching is always safe.
+ */
+static void
+xfs_set_cached_zone(
+ struct xfs_inode *ip,
+ struct xfs_open_zone *oz)
+{
+ struct xfs_open_zone *old_oz;
+
+ atomic_inc(&oz->oz_ref);
+ old_oz = xchg(&VFS_I(ip)->i_private, oz);
+ if (old_oz)
+ xfs_open_zone_put(old_oz);
+}
+
static void
xfs_submit_zoned_bio(
struct iomap_ioend *ioend,
@@ -813,17 +878,18 @@ xfs_zone_alloc_and_submit(
goto out_error;
/*
- * If we don't have a cached zone in this write context, see if the
- * last extent before the one we are writing to points to an active
- * zone. If so, just continue writing to it.
+ * If we don't have a locally cached zone in this write context, see if
+ * the inode is still associated with a zone and use that if so.
*/
- if (!*oz && ioend->io_offset)
- *oz = xfs_last_used_zone(ioend);
+ if (!*oz)
+ *oz = xfs_get_cached_zone(ip);
+
if (!*oz) {
select_zone:
*oz = xfs_select_zone(mp, write_hint, pack_tight);
if (!*oz)
goto out_error;
+ xfs_set_cached_zone(ip, *oz);
}
alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
@@ -883,7 +949,7 @@ xfs_zone_rgbno_is_valid(
lockdep_assert_held(&rtg_rmap(rtg)->i_lock);
if (rtg->rtg_open_zone)
- return rgbno < rtg->rtg_open_zone->oz_write_pointer;
+ return rgbno < rtg->rtg_open_zone->oz_allocated;
return !xa_get_mark(&rtg_mount(rtg)->m_groups[XG_TYPE_RTG].xa,
rtg_rgno(rtg), XFS_RTG_FREE);
}
@@ -901,6 +967,12 @@ xfs_free_open_zones(
xfs_open_zone_put(oz);
}
spin_unlock(&zi->zi_open_zones_lock);
+
+ /*
+ * Wait for all open zones to be freed so that they drop the group
+ * references:
+ */
+ rcu_barrier();
}
struct xfs_init_zones {
@@ -917,7 +989,7 @@ xfs_init_zone(
{
struct xfs_mount *mp = rtg_mount(rtg);
struct xfs_zone_info *zi = mp->m_zone_info;
- uint64_t used = rtg_rmap(rtg)->i_used_blocks;
+ uint32_t used = rtg_rmap(rtg)->i_used_blocks;
xfs_rgblock_t write_pointer, highest_rgbno;
int error;
@@ -1014,24 +1086,27 @@ xfs_get_zone_info_cb(
}
/*
- * Calculate the max open zone limit based on the of number of
- * backing zones available
+ * Calculate the max open zone limit based on the number of backing zones
+ * available.
*/
static inline uint32_t
xfs_max_open_zones(
struct xfs_mount *mp)
{
unsigned int max_open, max_open_data_zones;
+
/*
- * We need two zones for every open data zone,
- * one in reserve as we don't reclaim open zones. One data zone
- * and its spare is included in XFS_MIN_ZONES.
+ * We need two zones for every open data zone, one in reserve as we
+ * don't reclaim open zones. One data zone and its spare is included
+ * in XFS_MIN_ZONES to support at least one user data writer.
*/
max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1;
max_open = max_open_data_zones + XFS_OPEN_GC_ZONES;
/*
- * Cap the max open limit to 1/4 of available space
+ * Cap the max open limit to 1/4 of available space. Without this we'd
+ * run out of easy reclaim targets too quickly and storage devices don't
+ * handle huge numbers of concurrent write streams overly well.
*/
max_open = min(max_open, mp->m_sb.sb_rgcount / 4);
@@ -1063,7 +1138,7 @@ xfs_calc_open_zones(
if (bdev_open_zones)
mp->m_max_open_zones = bdev_open_zones;
else
- mp->m_max_open_zones = xfs_max_open_zones(mp);
+ mp->m_max_open_zones = XFS_DEFAULT_MAX_OPEN_ZONES;
}
if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) {
@@ -1147,6 +1222,7 @@ xfs_mount_zones(
.mp = mp,
};
struct xfs_buftarg *bt = mp->m_rtdev_targp;
+ xfs_extlen_t zone_blocks = mp->m_groups[XG_TYPE_RTG].blocks;
int error;
if (!bt) {
@@ -1176,13 +1252,36 @@ xfs_mount_zones(
if (!mp->m_zone_info)
return -ENOMEM;
- xfs_info(mp, "%u zones of %u blocks size (%u max open)",
- mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
- mp->m_max_open_zones);
+ xfs_info(mp, "%u zones of %u blocks (%u max open zones)",
+ mp->m_sb.sb_rgcount, zone_blocks, mp->m_max_open_zones);
trace_xfs_zones_mount(mp);
+ /*
+ * The writeback code switches between inodes regularly to provide
+ * fairness. The default lower bound is 4MiB, but for zoned file
+ * systems we want to increase that, both to reduce seeks and, more
+ * importantly, so that workloads that write files in a multiple of the
+ * zone size do not get fragmented and require garbage collection when
+ * they shouldn't. Increase it to the zone size, capped by the max
+ * extent len.
+ *
+ * Note that because s_min_writeback_pages is a superblock field, this
+ * value also gets applied to non-zoned files on the data device if
+ * there are any. On a typical zoned setup all data is on the RT device
+ * because using the more efficient sequential write required zones
+ * is the reason for using the zone allocator, and either the RT device
+ * and the (meta)data device are on the same block device, or the
+ * (meta)data device is on a fast SSD while the data on the RT device
+ * is on a SMR HDD. In any combination of the above cases enforcing
+ * the higher min_writeback_pages for non-RT inodes is either a noop
+ * or beneficial.
+ */
+ mp->m_super->s_min_writeback_pages =
+ XFS_FSB_TO_B(mp, min(zone_blocks, XFS_MAX_BMBT_EXTLEN)) >>
+ PAGE_SHIFT;
+
if (bdev_is_zoned(bt->bt_bdev)) {
- error = blkdev_report_zones(bt->bt_bdev,
+ error = blkdev_report_zones_cached(bt->bt_bdev,
XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart),
mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, &iz);
if (error < 0)
@@ -1192,8 +1291,10 @@ xfs_mount_zones(
while ((rtg = xfs_rtgroup_next(mp, rtg))) {
error = xfs_init_zone(&iz, rtg, NULL);
- if (error)
+ if (error) {
+ xfs_rtgroup_rele(rtg);
goto out_free_zone_info;
+ }
}
}
@@ -1201,6 +1302,13 @@ xfs_mount_zones(
xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
iz.available + iz.reclaimable);
+ /*
+ * The user may configure GC to free up a percentage of unused blocks.
+ * By default this is 0. GC will always trigger at the minimum level
+ * for keeping max_open_zones available for data placement.
+ */
+ mp->m_zonegc_low_space = 0;
+
error = xfs_zone_gc_mount(mp);
if (error)
goto out_free_zone_info;
diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h
index ecf39106704c..4db02816d0fd 100644
--- a/fs/xfs/xfs_zone_alloc.h
+++ b/fs/xfs/xfs_zone_alloc.h
@@ -23,9 +23,9 @@ struct xfs_zone_alloc_ctx {
*/
#define XFS_ZR_RESERVED (1U << 2)
-int xfs_zoned_space_reserve(struct xfs_inode *ip, xfs_filblks_t count_fsb,
+int xfs_zoned_space_reserve(struct xfs_mount *mp, xfs_filblks_t count_fsb,
unsigned int flags, struct xfs_zone_alloc_ctx *ac);
-void xfs_zoned_space_unreserve(struct xfs_inode *ip,
+void xfs_zoned_space_unreserve(struct xfs_mount *mp,
struct xfs_zone_alloc_ctx *ac);
void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb);
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
index c5136ea9bb1d..3c52cc1497d4 100644
--- a/fs/xfs/xfs_zone_gc.c
+++ b/fs/xfs/xfs_zone_gc.c
@@ -114,8 +114,9 @@ struct xfs_gc_bio {
/* Open Zone being written to */
struct xfs_open_zone *oz;
+ struct xfs_rtgroup *victim_rtg;
+
/* Bio used for reads and writes, including the bvec used by it */
- struct bio_vec bv;
struct bio bio; /* must be last */
};
@@ -162,18 +163,35 @@ struct xfs_zone_gc_data {
/*
* We aim to keep enough zones free in stock to fully use the open zone limit
- * for data placement purposes.
+ * for data placement purposes. Additionally, the m_zonegc_low_space tunable
+ * can be set to make sure a fraction of the unused blocks are available for
+ * writing.
*/
bool
xfs_zoned_need_gc(
struct xfs_mount *mp)
{
- if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
+ s64 available, free, threshold;
+ s32 remainder;
+
+ if (!xfs_zoned_have_reclaimable(mp->m_zone_info))
return false;
- if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) <
- mp->m_groups[XG_TYPE_RTG].blocks *
- (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
+
+ available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);
+
+ if (available <
+ xfs_rtgs_to_rfsbs(mp, mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
+ return true;
+
+ free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);
+
+ threshold = div_s64_rem(free, 100, &remainder);
+ threshold = threshold * mp->m_zonegc_low_space +
+ remainder * div_s64(mp->m_zonegc_low_space, 100);
+
+ if (available < threshold)
return true;
+
return false;
}
@@ -246,6 +264,7 @@ xfs_zone_gc_iter_init(
iter->rec_count = 0;
iter->rec_idx = 0;
iter->victim_rtg = victim_rtg;
+ atomic_inc(&victim_rtg->rtg_gccount);
}
/*
@@ -272,8 +291,6 @@ xfs_zone_gc_query_cb(
return 0;
}
-#define cmp_int(l, r) ((l > r) - (l < r))
-
static int
xfs_zone_gc_rmap_rec_cmp(
const void *a,
@@ -312,10 +329,7 @@ xfs_zone_gc_query(
iter->rec_idx = 0;
iter->rec_count = 0;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
-
+ tp = xfs_trans_alloc_empty(mp);
xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
cur = xfs_rtrmapbt_init_cursor(tp, rtg);
error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
@@ -349,6 +363,7 @@ xfs_zone_gc_query(
return 0;
done:
+ atomic_dec(&iter->victim_rtg->rtg_gccount);
xfs_rtgroup_rele(iter->victim_rtg);
iter->victim_rtg = NULL;
return 0;
@@ -438,6 +453,20 @@ xfs_zone_gc_pick_victim_from(
if (!rtg)
continue;
+ /*
+ * If the zone is already undergoing GC, don't pick it again.
+ *
+ * This prevents us from picking one of the zones for which we
+ * already submitted GC I/O, but for which the remapping hasn't
+ * concluded yet. This won't cause data corruption, but
+ * increases write amplification and slows down GC, so this is
+ * a bad thing.
+ */
+ if (atomic_read(&rtg->rtg_gccount)) {
+ xfs_rtgroup_rele(rtg);
+ continue;
+ }
+
/* skip zones that are just waiting for a reset */
if (rtg_rmap(rtg)->i_used_blocks == 0 ||
rtg_rmap(rtg)->i_used_blocks >= victim_used) {
@@ -478,21 +507,6 @@ xfs_zone_gc_select_victim(
struct xfs_rtgroup *victim_rtg = NULL;
unsigned int bucket;
- if (xfs_is_shutdown(mp))
- return false;
-
- if (iter->victim_rtg)
- return true;
-
- /*
- * Don't start new work if we are asked to stop or park.
- */
- if (kthread_should_stop() || kthread_should_park())
- return false;
-
- if (!xfs_zoned_need_gc(mp))
- return false;
-
spin_lock(&zi->zi_used_buckets_lock);
for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
@@ -517,8 +531,7 @@ xfs_zone_gc_steal_open(
spin_lock(&zi->zi_open_zones_lock);
list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
- if (!found ||
- oz->oz_write_pointer < found->oz_write_pointer)
+ if (!found || oz->oz_allocated < found->oz_allocated)
found = oz;
}
@@ -568,7 +581,7 @@ xfs_zone_gc_ensure_target(
{
struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone;
- if (!oz || oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
+ if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
return xfs_zone_gc_select_target(mp);
return oz;
}
@@ -589,7 +602,7 @@ xfs_zone_gc_space_available(
oz = xfs_zone_gc_ensure_target(data->mp);
if (!oz)
return false;
- return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) &&
+ return oz->oz_allocated < rtg_blocks(oz->oz_rtg) &&
xfs_zone_gc_scratch_available(data);
}
@@ -631,7 +644,7 @@ xfs_zone_gc_alloc_blocks(
*/
spin_lock(&mp->m_sb_lock);
*count_fsb = min(*count_fsb,
- rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer);
+ rtg_blocks(oz->oz_rtg) - oz->oz_allocated);
*count_fsb = min3(*count_fsb,
mp->m_free[XC_FREE_RTEXTENTS].res_avail,
mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
@@ -645,8 +658,8 @@ xfs_zone_gc_alloc_blocks(
*daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
if (!*is_seq)
- *daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer);
- oz->oz_write_pointer += *count_fsb;
+ *daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated);
+ oz->oz_allocated += *count_fsb;
atomic_inc(&oz->oz_ref);
return oz;
}
@@ -691,6 +704,9 @@ xfs_zone_gc_start_chunk(
chunk->scratch = &data->scratch[data->scratch_idx];
chunk->data = data;
chunk->oz = oz;
+ chunk->victim_rtg = iter->victim_rtg;
+ atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
+ atomic_inc(&chunk->victim_rtg->rtg_gccount);
bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
bio->bi_end_io = xfs_zone_gc_end_io;
@@ -713,6 +729,8 @@ static void
xfs_zone_gc_free_chunk(
struct xfs_gc_bio *chunk)
{
+ atomic_dec(&chunk->victim_rtg->rtg_gccount);
+ xfs_rtgroup_rele(chunk->victim_rtg);
list_del(&chunk->entry);
xfs_open_zone_put(chunk->oz);
xfs_irele(chunk->ip);
@@ -773,6 +791,10 @@ xfs_zone_gc_split_write(
split_chunk->oz = chunk->oz;
atomic_inc(&chunk->oz->oz_ref);
+ split_chunk->victim_rtg = chunk->victim_rtg;
+ atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
+ atomic_inc(&chunk->victim_rtg->rtg_gccount);
+
chunk->offset += split_len;
chunk->len -= split_len;
chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);
@@ -789,7 +811,8 @@ xfs_zone_gc_write_chunk(
{
struct xfs_zone_gc_data *data = chunk->data;
struct xfs_mount *mp = chunk->ip->i_mount;
- unsigned int folio_offset = chunk->bio.bi_io_vec->bv_offset;
+ phys_addr_t bvec_paddr =
+ bvec_phys(bio_first_bvec_all(&chunk->bio));
struct xfs_gc_bio *split_chunk;
if (chunk->bio.bi_status)
@@ -804,7 +827,7 @@ xfs_zone_gc_write_chunk(
bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
- folio_offset);
+ offset_in_folio(chunk->scratch->folio, bvec_paddr));
while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
xfs_zone_gc_submit_write(data, split_chunk);
@@ -962,6 +985,27 @@ xfs_zone_gc_reset_zones(
} while (next);
}
+static bool
+xfs_zone_gc_should_start_new_work(
+ struct xfs_zone_gc_data *data)
+{
+ if (xfs_is_shutdown(data->mp))
+ return false;
+ if (!xfs_zone_gc_space_available(data))
+ return false;
+
+ if (!data->iter.victim_rtg) {
+ if (kthread_should_stop() || kthread_should_park())
+ return false;
+ if (!xfs_zoned_need_gc(data->mp))
+ return false;
+ if (!xfs_zone_gc_select_victim(data))
+ return false;
+ }
+
+ return true;
+}
+
/*
* Handle the work to read and write data for GC and to reset the zones,
* including handling all completions.
@@ -969,7 +1013,7 @@ xfs_zone_gc_reset_zones(
* Note that the order of the chunks is preserved so that we don't undo the
* optimal order established by xfs_zone_gc_query().
*/
-static bool
+static void
xfs_zone_gc_handle_work(
struct xfs_zone_gc_data *data)
{
@@ -983,30 +1027,22 @@ xfs_zone_gc_handle_work(
zi->zi_reset_list = NULL;
spin_unlock(&zi->zi_reset_list_lock);
- if (!xfs_zone_gc_select_victim(data) ||
- !xfs_zone_gc_space_available(data)) {
- if (list_empty(&data->reading) &&
- list_empty(&data->writing) &&
- list_empty(&data->resetting) &&
- !reset_list)
- return false;
- }
-
- __set_current_state(TASK_RUNNING);
- try_to_freeze();
-
- if (reset_list)
+ if (reset_list) {
+ set_current_state(TASK_RUNNING);
xfs_zone_gc_reset_zones(data, reset_list);
+ }
list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
break;
+ set_current_state(TASK_RUNNING);
xfs_zone_gc_finish_reset(chunk);
}
list_for_each_entry_safe(chunk, next, &data->writing, entry) {
if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
break;
+ set_current_state(TASK_RUNNING);
xfs_zone_gc_finish_chunk(chunk);
}
@@ -1014,15 +1050,18 @@ xfs_zone_gc_handle_work(
list_for_each_entry_safe(chunk, next, &data->reading, entry) {
if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
break;
+ set_current_state(TASK_RUNNING);
xfs_zone_gc_write_chunk(chunk);
}
blk_finish_plug(&plug);
- blk_start_plug(&plug);
- while (xfs_zone_gc_start_chunk(data))
- ;
- blk_finish_plug(&plug);
- return true;
+ if (xfs_zone_gc_should_start_new_work(data)) {
+ set_current_state(TASK_RUNNING);
+ blk_start_plug(&plug);
+ while (xfs_zone_gc_start_chunk(data))
+ ;
+ blk_finish_plug(&plug);
+ }
}
/*
@@ -1046,8 +1085,18 @@ xfs_zoned_gcd(
for (;;) {
set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
xfs_set_zonegc_running(mp);
- if (xfs_zone_gc_handle_work(data))
+
+ xfs_zone_gc_handle_work(data);
+
+ /*
+ * Only sleep if nothing set the state to running. Else check for
+ * work again as someone might have queued up more work and woken
+ * us in the meantime.
+ */
+ if (get_current_state() == TASK_RUNNING) {
+ try_to_freeze();
continue;
+ }
if (list_empty(&data->reading) &&
list_empty(&data->writing) &&
@@ -1133,16 +1182,16 @@ xfs_zone_gc_mount(
goto out_put_gc_zone;
}
- mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
+ zi->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
"xfs-zone-gc/%s", mp->m_super->s_id);
- if (IS_ERR(mp->m_zone_info->zi_gc_thread)) {
+ if (IS_ERR(zi->zi_gc_thread)) {
xfs_warn(mp, "unable to create zone gc thread");
- error = PTR_ERR(mp->m_zone_info->zi_gc_thread);
+ error = PTR_ERR(zi->zi_gc_thread);
goto out_free_gc_data;
}
/* xfs_zone_gc_start will unpark for rw mounts */
- kthread_park(mp->m_zone_info->zi_gc_thread);
+ kthread_park(zi->zi_gc_thread);
return 0;
out_free_gc_data:
diff --git a/fs/xfs/xfs_zone_info.c b/fs/xfs/xfs_zone_info.c
index 733bcc2f8645..07e30c596975 100644
--- a/fs/xfs/xfs_zone_info.c
+++ b/fs/xfs/xfs_zone_info.c
@@ -32,7 +32,7 @@ xfs_show_open_zone(
{
seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s\n",
rtg_rgno(oz->oz_rtg),
- oz->oz_write_pointer, oz->oz_written,
+ oz->oz_allocated, oz->oz_written,
rtg_rmap(oz->oz_rtg)->i_used_blocks,
xfs_write_hint_to_str(oz->oz_write_hint));
}
diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
index ab696975a993..ce7f0e2f4598 100644
--- a/fs/xfs/xfs_zone_priv.h
+++ b/fs/xfs/xfs_zone_priv.h
@@ -11,18 +11,18 @@ struct xfs_open_zone {
atomic_t oz_ref;
/*
- * oz_write_pointer is the write pointer at which space is handed out
- * for conventional zones, or simple the count of blocks handed out
- * so far for sequential write required zones and is protected by
- * oz_alloc_lock/
+ * oz_allocated is the amount of space already allocated out of the zone
+ * and is protected by oz_alloc_lock.
+ *
+ * For conventional zones it also is the offset of the next write.
*/
spinlock_t oz_alloc_lock;
- xfs_rgblock_t oz_write_pointer;
+ xfs_rgblock_t oz_allocated;
/*
- * oz_written is the number of blocks for which we've received a
- * write completion. oz_written must always be <= oz_write_pointer
- * and is protected by the ILOCK of the rmap inode.
+ * oz_written is the number of blocks for which we've received a write
+ * completion. oz_written must always be <= oz_allocated and is
+ * protected by the ILOCK of the rmap inode.
*/
xfs_rgblock_t oz_written;
@@ -44,6 +44,8 @@ struct xfs_open_zone {
* the life time of an open zone.
*/
struct xfs_rtgroup *oz_rtg;
+
+ struct rcu_head oz_rcu;
};
/*
@@ -111,6 +113,7 @@ struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp,
int xfs_zone_gc_reset_sync(struct xfs_rtgroup *rtg);
bool xfs_zoned_need_gc(struct xfs_mount *mp);
+bool xfs_zoned_have_reclaimable(struct xfs_zone_info *zi);
int xfs_zone_gc_mount(struct xfs_mount *mp);
void xfs_zone_gc_unmount(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c
index 93c9a7721139..fc1a4d1ce10c 100644
--- a/fs/xfs/xfs_zone_space_resv.c
+++ b/fs/xfs/xfs_zone_space_resv.c
@@ -10,6 +10,7 @@
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_rtbitmap.h"
+#include "xfs_icache.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
@@ -53,12 +54,10 @@ xfs_zoned_default_resblks(
{
switch (ctr) {
case XC_FREE_RTEXTENTS:
- return (uint64_t)XFS_RESERVED_ZONES *
- mp->m_groups[XG_TYPE_RTG].blocks +
- mp->m_sb.sb_rtreserved;
+ return xfs_rtgs_to_rfsbs(mp, XFS_RESERVED_ZONES) +
+ mp->m_sb.sb_rtreserved;
case XC_FREE_RTAVAILABLE:
- return (uint64_t)XFS_GC_ZONES *
- mp->m_groups[XG_TYPE_RTG].blocks;
+ return xfs_rtgs_to_rfsbs(mp, XFS_GC_ZONES);
default:
ASSERT(0);
return 0;
@@ -117,11 +116,10 @@ xfs_zoned_space_wait_error(
static int
xfs_zoned_reserve_available(
- struct xfs_inode *ip,
+ struct xfs_mount *mp,
xfs_filblks_t count_fsb,
unsigned int flags)
{
- struct xfs_mount *mp = ip->i_mount;
struct xfs_zone_info *zi = mp->m_zone_info;
struct xfs_zone_reservation reservation = {
.task = current,
@@ -174,7 +172,7 @@ xfs_zoned_reserve_available(
* processing a pending GC request give up as we're fully out
* of space.
*/
- if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) &&
+ if (!xfs_zoned_have_reclaimable(mp->m_zone_info) &&
!xfs_is_zonegc_running(mp))
break;
@@ -198,11 +196,10 @@ xfs_zoned_reserve_available(
*/
static int
xfs_zoned_reserve_extents_greedy(
- struct xfs_inode *ip,
+ struct xfs_mount *mp,
xfs_filblks_t *count_fsb,
unsigned int flags)
{
- struct xfs_mount *mp = ip->i_mount;
struct xfs_zone_info *zi = mp->m_zone_info;
s64 len = *count_fsb;
int error = -ENOSPC;
@@ -220,12 +217,11 @@ xfs_zoned_reserve_extents_greedy(
int
xfs_zoned_space_reserve(
- struct xfs_inode *ip,
+ struct xfs_mount *mp,
xfs_filblks_t count_fsb,
unsigned int flags,
struct xfs_zone_alloc_ctx *ac)
{
- struct xfs_mount *mp = ip->i_mount;
int error;
ASSERT(ac->reserved_blocks == 0);
@@ -233,12 +229,17 @@ xfs_zoned_space_reserve(
error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
flags & XFS_ZR_RESERVED);
+ if (error == -ENOSPC && !(flags & XFS_ZR_NOWAIT)) {
+ xfs_inodegc_flush(mp);
+ error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
+ flags & XFS_ZR_RESERVED);
+ }
if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
- error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags);
+ error = xfs_zoned_reserve_extents_greedy(mp, &count_fsb, flags);
if (error)
return error;
- error = xfs_zoned_reserve_available(ip, count_fsb, flags);
+ error = xfs_zoned_reserve_available(mp, count_fsb, flags);
if (error) {
xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb);
return error;
@@ -249,12 +250,10 @@ xfs_zoned_space_reserve(
void
xfs_zoned_space_unreserve(
- struct xfs_inode *ip,
+ struct xfs_mount *mp,
struct xfs_zone_alloc_ctx *ac)
{
if (ac->reserved_blocks > 0) {
- struct xfs_mount *mp = ip->i_mount;
-
xfs_zoned_add_available(mp, ac->reserved_blocks);
xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks);
}