diff options
Diffstat (limited to 'fs/xfs')
161 files changed, 4899 insertions, 3456 deletions
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index fffd6fffdce0..b99da294e9a3 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -3,7 +3,7 @@ config XFS_FS tristate "XFS filesystem support" depends on BLOCK select EXPORTFS - select LIBCRC32C + select CRC32 select FS_IOMAP help XFS is a high performance journaling filesystem which originated @@ -25,7 +25,7 @@ config XFS_FS config XFS_SUPPORT_V4 bool "Support deprecated V4 (crc=0) format" depends on XFS_FS - default y + default n help The V4 filesystem format lacks certain features that are supported by the V5 format, such as metadata checksumming, strengthened @@ -40,7 +40,7 @@ config XFS_SUPPORT_V4 filesystem is a V4 filesystem. If no such string is found, please upgrade xfsprogs to the latest version and try again. - This option will become default N in September 2025. Support for the + This option became default N in September 2025. Support for the V4 format will be removed entirely in September 2030. Distributors can say N here to withdraw support earlier. @@ -50,7 +50,7 @@ config XFS_SUPPORT_V4 config XFS_SUPPORT_ASCII_CI bool "Support deprecated case-insensitive ascii (ascii-ci=1) format" depends on XFS_FS - default y + default n help The ASCII case insensitivity filesystem feature only works correctly on systems that have been coerced into using ISO 8859-1, and it does @@ -67,7 +67,7 @@ config XFS_SUPPORT_ASCII_CI filesystem is a case-insensitive filesystem. If no such string is found, please upgrade xfsprogs to the latest version and try again. - This option will become default N in September 2025. Support for the + This option became default N in September 2025. Support for the feature will be removed entirely in September 2030. Distributors can say N here to withdraw support earlier. @@ -105,6 +105,7 @@ config XFS_POSIX_ACL config XFS_RT bool "XFS Realtime subvolume support" depends on XFS_FS + default BLK_DEV_ZONED help If you say Y here you will be able to mount and use XFS filesystems which contain a realtime subvolume. The realtime subvolume is a @@ -118,6 +119,15 @@ config XFS_RT See the xfs man page in section 5 for additional information. + This option is mandatory to support zoned block devices. For these + devices, the realtime subvolume must be backed by a zoned block + device and a regular block device used as the main device (for + metadata). If the zoned block device is a host-managed SMR hard-disk + containing conventional zones at the beginning of its address space, + XFS will use the disk conventional zones as the main device and the + remaining sequential write required zones as the backing storage for + the realtime subvolume. + If unsure, say N. config XFS_DRAIN_INTENTS @@ -136,7 +146,7 @@ config XFS_BTREE_IN_MEM config XFS_ONLINE_SCRUB bool "XFS online metadata check support" - default n + default y depends on XFS_FS depends on TMPFS && SHMEM select XFS_LIVE_HOOKS @@ -149,17 +159,13 @@ config XFS_ONLINE_SCRUB advantage here is to look for problems proactively so that they can be dealt with in a controlled manner. - This feature is considered EXPERIMENTAL. Use with caution! - See the xfs_scrub man page in section 8 for additional information. - If unsure, say N. - config XFS_ONLINE_SCRUB_STATS bool "XFS online metadata check usage data collection" default y depends on XFS_ONLINE_SCRUB - select DEBUG_FS + depends on DEBUG_FS help If you say Y here, the kernel will gather usage data about the online metadata check subsystem. This includes the number @@ -170,11 +176,9 @@ config XFS_ONLINE_SCRUB_STATS Usage data are collected in /sys/kernel/debug/xfs/scrub. - If unsure, say N. - config XFS_ONLINE_REPAIR bool "XFS online metadata repair support" - default n + default y depends on XFS_FS && XFS_ONLINE_SCRUB select XFS_BTREE_IN_MEM help @@ -185,12 +189,8 @@ config XFS_ONLINE_REPAIR formatted with secondary metadata, such as reverse mappings and inode parent pointers. - This feature is considered EXPERIMENTAL. Use with caution! - See the xfs_scrub man page in section 8 for additional information. - If unsure, say N. - config XFS_WARN bool "XFS Verbose Warnings" depends on XFS_FS && !XFS_DEBUG diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index fb79215a509d..8ac8230c3d3c 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -92,9 +92,8 @@ xfs_ag_resv_critical( trace_xfs_ag_resv_critical(pag, type, avail); /* Critically low if less than 10% or max btree height remains. */ - return XFS_TEST_ERROR(avail < orig / 10 || - avail < mp->m_agbtree_maxlevels, - mp, XFS_ERRTAG_AG_RESV_CRITICAL); + return avail < orig / 10 || avail < mp->m_agbtree_maxlevels || + XFS_TEST_ERROR(mp, XFS_ERRTAG_AG_RESV_CRITICAL); } /* @@ -203,7 +202,7 @@ __xfs_ag_resv_init( return -EINVAL; } - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_AG_RESV_FAIL)) error = -ENOSPC; else error = xfs_dec_fdblocks(mp, hidden_space, true); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 7839efe050bf..ad381c73abc4 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -3321,7 +3321,7 @@ xfs_agf_read_verify( xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_agf_verify(bp); - if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF)) + if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_ALLOC_READ_AGF)) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } @@ -3444,16 +3444,41 @@ xfs_alloc_read_agf( set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); } + #ifdef DEBUG - else if (!xfs_is_shutdown(mp)) { - ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); - ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); - ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); - ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); - ASSERT(pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level)); - ASSERT(pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level)); + /* + * It's possible for the AGF to be out of sync if the block device is + * silently dropping writes. This can happen in fstests with dmflakey + * enabled, which allows the buffer to be cleaned and reclaimed by + * memory pressure and then re-read from disk here. We will get a + * stale version of the AGF from disk, and nothing good can happen from + * here. Hence if we detect this situation, immediately shut down the + * filesystem. + * + * This can also happen if we are already in the middle of a forced + * shutdown, so don't bother checking if we are already shut down. + */ + if (!xfs_is_shutdown(pag_mount(pag))) { + bool ok = true; + + ok &= pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks); + ok &= pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks); + ok &= pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks); + ok &= pag->pagf_flcount == be32_to_cpu(agf->agf_flcount); + ok &= pag->pagf_longest == be32_to_cpu(agf->agf_longest); + ok &= pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level); + ok &= pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level); + + if (XFS_IS_CORRUPT(pag_mount(pag), !ok)) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF); + xfs_trans_brelse(tp, agfbp); + xfs_force_shutdown(pag_mount(pag), + SHUTDOWN_CORRUPT_ONDISK); + return -EFSCORRUPTED; + } } -#endif +#endif /* DEBUG */ + if (agfbpp) *agfbpp = agfbp; else @@ -3994,8 +4019,7 @@ __xfs_free_extent( ASSERT(len != 0); ASSERT(type != XFS_AG_RESV_AGFL); - if (XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_FREE_EXTENT)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_FREE_EXTENT)) return -EIO; error = xfs_free_extent_fix_freelist(tp, pag, &agbp); diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index a4ac37ba5d51..fa1f03c1331e 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -186,35 +186,32 @@ xfs_allocbt_init_ptr_from_cur( ptr->s = agf->agf_cnt_root; } -STATIC int64_t -xfs_bnobt_key_diff( +STATIC int +xfs_bnobt_cmp_key_with_cur( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a; const struct xfs_alloc_rec *kp = &key->alloc; - return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; + return cmp_int(be32_to_cpu(kp->ar_startblock), + rec->ar_startblock); } -STATIC int64_t -xfs_cntbt_key_diff( +STATIC int +xfs_cntbt_cmp_key_with_cur( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a; const struct xfs_alloc_rec *kp = &key->alloc; - int64_t diff; - diff = (int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount; - if (diff) - return diff; - - return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; + return cmp_int(be32_to_cpu(kp->ar_blockcount), rec->ar_blockcount) ?: + cmp_int(be32_to_cpu(kp->ar_startblock), rec->ar_startblock); } -STATIC int64_t -xfs_bnobt_diff_two_keys( +STATIC int +xfs_bnobt_cmp_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2, @@ -222,29 +219,24 @@ xfs_bnobt_diff_two_keys( { ASSERT(!mask || mask->alloc.ar_startblock); - return (int64_t)be32_to_cpu(k1->alloc.ar_startblock) - - be32_to_cpu(k2->alloc.ar_startblock); + return cmp_int(be32_to_cpu(k1->alloc.ar_startblock), + be32_to_cpu(k2->alloc.ar_startblock)); } -STATIC int64_t -xfs_cntbt_diff_two_keys( +STATIC int +xfs_cntbt_cmp_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2, const union xfs_btree_key *mask) { - int64_t diff; - ASSERT(!mask || (mask->alloc.ar_blockcount && mask->alloc.ar_startblock)); - diff = be32_to_cpu(k1->alloc.ar_blockcount) - - be32_to_cpu(k2->alloc.ar_blockcount); - if (diff) - return diff; - - return be32_to_cpu(k1->alloc.ar_startblock) - - be32_to_cpu(k2->alloc.ar_startblock); + return cmp_int(be32_to_cpu(k1->alloc.ar_blockcount), + be32_to_cpu(k2->alloc.ar_blockcount)) ?: + cmp_int(be32_to_cpu(k1->alloc.ar_startblock), + be32_to_cpu(k2->alloc.ar_startblock)); } static xfs_failaddr_t @@ -438,9 +430,9 @@ const struct xfs_btree_ops xfs_bnobt_ops = { .init_high_key_from_rec = xfs_bnobt_init_high_key_from_rec, .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, - .key_diff = xfs_bnobt_key_diff, + .cmp_key_with_cur = xfs_bnobt_cmp_key_with_cur, .buf_ops = &xfs_bnobt_buf_ops, - .diff_two_keys = xfs_bnobt_diff_two_keys, + .cmp_two_keys = xfs_bnobt_cmp_two_keys, .keys_inorder = xfs_bnobt_keys_inorder, .recs_inorder = xfs_bnobt_recs_inorder, .keys_contiguous = xfs_allocbt_keys_contiguous, @@ -468,9 +460,9 @@ const struct xfs_btree_ops xfs_cntbt_ops = { .init_high_key_from_rec = xfs_cntbt_init_high_key_from_rec, .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, - .key_diff = xfs_cntbt_key_diff, + .cmp_key_with_cur = xfs_cntbt_cmp_key_with_cur, .buf_ops = &xfs_cntbt_buf_ops, - .diff_two_keys = xfs_cntbt_diff_two_keys, + .cmp_two_keys = xfs_cntbt_cmp_two_keys, .keys_inorder = xfs_cntbt_keys_inorder, .recs_inorder = xfs_cntbt_recs_inorder, .keys_contiguous = NULL, /* not needed right now */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index fddb55605e0c..91c1b30ebaab 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -667,12 +667,8 @@ xfs_attr_shortform_bytesfit( /* * For attr2 we can try to move the forkoff if there is space in the - * literal area, but for the old format we are done if there is no - * space in the fixed attribute fork. + * literal area */ - if (!xfs_has_attr2(mp)) - return 0; - dsize = dp->i_df.if_bytes; switch (dp->i_df.if_format) { @@ -723,22 +719,16 @@ xfs_attr_shortform_bytesfit( } /* - * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless: - * - noattr2 mount option is set, - * - on-disk version bit says it is already set, or - * - the attr2 mount option is not set to enable automatic upgrade from attr1. + * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless + * on-disk version bit says it is already set */ STATIC void xfs_sbversion_add_attr2( struct xfs_mount *mp, struct xfs_trans *tp) { - if (xfs_has_noattr2(mp)) - return; if (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT) return; - if (!xfs_has_attr2(mp)) - return; spin_lock(&mp->m_sb_lock); xfs_add_attr2(mp); @@ -889,7 +879,7 @@ xfs_attr_sf_removename( /* * Fix up the start offset of the attribute fork */ - if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) && + if (totsize == sizeof(struct xfs_attr_sf_hdr) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE)) && !xfs_has_parent(mp)) { @@ -900,7 +890,6 @@ xfs_attr_sf_removename( ASSERT(dp->i_forkoff); ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) || (args->op_flags & XFS_DA_OP_ADDNAME) || - !xfs_has_attr2(mp) || dp->i_df.if_format == XFS_DINODE_FMT_BTREE || xfs_has_parent(mp)); xfs_trans_log_inode(args->trans, dp, @@ -1040,8 +1029,7 @@ xfs_attr_shortform_allfit( bytes += xfs_attr_sf_entsize_byname(name_loc->namelen, be16_to_cpu(name_loc->valuelen)); } - if (xfs_has_attr2(dp->i_mount) && - (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && + if ((dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && (bytes == sizeof(struct xfs_attr_sf_hdr))) return -1; return xfs_attr_shortform_bytesfit(dp, bytes); @@ -1161,7 +1149,6 @@ xfs_attr3_leaf_to_shortform( * this case. */ if (!(args->op_flags & XFS_DA_OP_REPLACE)) { - ASSERT(xfs_has_attr2(dp->i_mount)); ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); xfs_attr_fork_remove(dp, args->trans); } @@ -1225,7 +1212,7 @@ xfs_attr3_leaf_to_node( trace_xfs_attr_leaf_to_node(args); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) { + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) { error = -EIO; goto out; } diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 4c44ce1c8a64..bff3dc226f81 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -435,6 +435,13 @@ xfs_attr_rmtval_get( 0, &bp, &xfs_attr3_rmt_buf_ops); if (xfs_metadata_is_sick(error)) xfs_dirattr_mark_sick(args->dp, XFS_ATTR_FORK); + /* + * ENODATA from disk implies a disk medium failure; + * ENODATA for xattrs means attribute not found, so + * disambiguate that here. + */ + if (error == -ENODATA) + error = -EIO; if (error) return error; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 63255820b58a..53ef4b7e504d 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -997,8 +997,7 @@ xfs_bmap_add_attrfork_local( static int xfs_bmap_set_attrforkoff( struct xfs_inode *ip, - int size, - int *version) + int size) { int default_size = xfs_default_attroffset(ip) >> 3; @@ -1012,8 +1011,6 @@ xfs_bmap_set_attrforkoff( ip->i_forkoff = xfs_attr_shortform_bytesfit(ip, size); if (!ip->i_forkoff) ip->i_forkoff = default_size; - else if (xfs_has_attr2(ip->i_mount) && version) - *version = 2; break; default: ASSERT(0); @@ -1035,7 +1032,6 @@ xfs_bmap_add_attrfork( int rsvd) /* xact may use reserved blks */ { struct xfs_mount *mp = tp->t_mountp; - int version = 1; /* superblock attr version */ int logflags; /* logging flags */ int error; /* error return value */ @@ -1045,7 +1041,7 @@ xfs_bmap_add_attrfork( ASSERT(!xfs_inode_has_attr_fork(ip)); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_bmap_set_attrforkoff(ip, size, &version); + error = xfs_bmap_set_attrforkoff(ip, size); if (error) return error; @@ -1069,16 +1065,12 @@ xfs_bmap_add_attrfork( xfs_trans_log_inode(tp, ip, logflags); if (error) return error; - if (!xfs_has_attr(mp) || - (!xfs_has_attr2(mp) && version == 2)) { + if (!xfs_has_attr(mp)) { bool log_sb = false; spin_lock(&mp->m_sb_lock); if (!xfs_has_attr(mp)) { xfs_add_attr(mp); - log_sb = true; - } - if (!xfs_has_attr2(mp) && version == 2) { xfs_add_attr2(mp); log_sb = true; } @@ -3312,6 +3304,11 @@ xfs_bmap_compute_alignments( align = xfs_get_cowextsz_hint(ap->ip); else if (ap->datatype & XFS_ALLOC_USERDATA) align = xfs_get_extsz_hint(ap->ip); + + /* Try to align start block to any minimum allocation alignment */ + if (align > 1 && (ap->flags & XFS_BMAPI_EXTSZALIGN)) + args->alignment = align; + if (align) { if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, ap->eof, 0, ap->conv, &ap->offset, @@ -3657,8 +3654,7 @@ xfs_bmap_btalloc( /* Trim the allocation back to the maximum an AG can fit. */ args.maxlen = min(ap->length, mp->m_ag_max_usable); - if (unlikely(XFS_TEST_ERROR(false, mp, - XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) + if (unlikely(XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) error = xfs_bmap_exact_minlen_extent_alloc(ap, &args); else if ((ap->datatype & XFS_ALLOC_USERDATA) && xfs_inode_is_filestream(ap->ip)) @@ -3844,7 +3840,7 @@ xfs_bmapi_read( } if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -4195,7 +4191,7 @@ xfs_bmapi_write( (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)); if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -4540,7 +4536,7 @@ xfs_bmapi_remap( (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)); if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -5674,7 +5670,7 @@ xfs_bmap_collapse_extents( int logflags = 0; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -5790,7 +5786,7 @@ xfs_bmap_insert_extents( int logflags = 0; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -5895,7 +5891,7 @@ xfs_bmap_split_extent( int i = 0; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -6060,7 +6056,7 @@ xfs_bmap_finish_one( trace_xfs_bmap_deferred(bi); - if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE)) + if (XFS_TEST_ERROR(tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE)) return -EIO; switch (bi->bi_type) { diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index b4d9c6e0f3f9..d5f2729305fa 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -87,6 +87,9 @@ struct xfs_bmalloca { /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */ #define XFS_BMAPI_NORMAP (1u << 10) +/* Try to align allocations to the extent size hint */ +#define XFS_BMAPI_EXTSZALIGN (1u << 11) + #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ @@ -98,7 +101,8 @@ struct xfs_bmalloca { { XFS_BMAPI_REMAP, "REMAP" }, \ { XFS_BMAPI_COWFORK, "COWFORK" }, \ { XFS_BMAPI_NODISCARD, "NODISCARD" }, \ - { XFS_BMAPI_NORMAP, "NORMAP" } + { XFS_BMAPI_NORMAP, "NORMAP" },\ + { XFS_BMAPI_EXTSZALIGN, "EXTSZALIGN" } static inline int xfs_bmapi_aflag(int w) diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 908d7b050e9c..188feac04b60 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -369,38 +369,26 @@ xfs_bmbt_init_rec_from_cur( xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b); } -STATIC int64_t -xfs_bmbt_key_diff( +STATIC int +xfs_bmbt_cmp_key_with_cur( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { - return (int64_t)be64_to_cpu(key->bmbt.br_startoff) - - cur->bc_rec.b.br_startoff; + return cmp_int(be64_to_cpu(key->bmbt.br_startoff), + cur->bc_rec.b.br_startoff); } -STATIC int64_t -xfs_bmbt_diff_two_keys( +STATIC int +xfs_bmbt_cmp_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2, const union xfs_btree_key *mask) { - uint64_t a = be64_to_cpu(k1->bmbt.br_startoff); - uint64_t b = be64_to_cpu(k2->bmbt.br_startoff); - ASSERT(!mask || mask->bmbt.br_startoff); - /* - * Note: This routine previously casted a and b to int64 and subtracted - * them to generate a result. This lead to problems if b was the - * "maximum" key value (all ones) being signed incorrectly, hence this - * somewhat less efficient version. - */ - if (a > b) - return 1; - if (b > a) - return -1; - return 0; + return cmp_int(be64_to_cpu(k1->bmbt.br_startoff), + be64_to_cpu(k2->bmbt.br_startoff)); } static xfs_failaddr_t @@ -647,8 +635,8 @@ const struct xfs_btree_ops xfs_bmbt_ops = { .init_key_from_rec = xfs_bmbt_init_key_from_rec, .init_high_key_from_rec = xfs_bmbt_init_high_key_from_rec, .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, - .key_diff = xfs_bmbt_key_diff, - .diff_two_keys = xfs_bmbt_diff_two_keys, + .cmp_key_with_cur = xfs_bmbt_cmp_key_with_cur, + .cmp_two_keys = xfs_bmbt_cmp_two_keys, .buf_ops = &xfs_bmbt_buf_ops, .keys_inorder = xfs_bmbt_keys_inorder, .recs_inorder = xfs_bmbt_recs_inorder, diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 299ce7fd11b0..dbe9df8c3300 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -306,7 +306,7 @@ xfs_btree_check_block( fa = __xfs_btree_check_block(cur, block, level, bp); if (XFS_IS_CORRUPT(mp, fa != NULL) || - XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) { + XFS_TEST_ERROR(mp, xfs_btree_block_errtag(cur))) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); xfs_btree_mark_sick(cur); @@ -1985,7 +1985,7 @@ xfs_btree_lookup( int *stat) /* success/failure */ { struct xfs_btree_block *block; /* current btree block */ - int64_t diff; /* difference for the current key */ + int cmp_r; /* current key comparison result */ int error; /* error return value */ int keyno; /* current key number */ int level; /* level in the btree */ @@ -2013,13 +2013,13 @@ xfs_btree_lookup( * on the lookup record, then follow the corresponding block * pointer down to the next level. */ - for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { + for (level = cur->bc_nlevels - 1, cmp_r = 1; level >= 0; level--) { /* Get the block we need to do the lookup on. */ error = xfs_btree_lookup_get_block(cur, level, pp, &block); if (error) goto error0; - if (diff == 0) { + if (cmp_r == 0) { /* * If we already had a key match at a higher level, we * know we need to use the first entry in this block. @@ -2065,15 +2065,16 @@ xfs_btree_lookup( keyno, block, &key); /* - * Compute difference to get next direction: + * Compute comparison result to get next + * direction: * - less than, move right * - greater than, move left * - equal, we're done */ - diff = cur->bc_ops->key_diff(cur, kp); - if (diff < 0) + cmp_r = cur->bc_ops->cmp_key_with_cur(cur, kp); + if (cmp_r < 0) low = keyno + 1; - else if (diff > 0) + else if (cmp_r > 0) high = keyno - 1; else break; @@ -2089,7 +2090,7 @@ xfs_btree_lookup( * If we moved left, need the previous key number, * unless there isn't one. */ - if (diff > 0 && --keyno < 1) + if (cmp_r > 0 && --keyno < 1) keyno = 1; pp = xfs_btree_ptr_addr(cur, keyno, block); @@ -2102,7 +2103,7 @@ xfs_btree_lookup( } /* Done with the search. See if we need to adjust the results. */ - if (dir != XFS_LOOKUP_LE && diff < 0) { + if (dir != XFS_LOOKUP_LE && cmp_r < 0) { keyno++; /* * If ge search and we went off the end of the block, but it's @@ -2125,14 +2126,14 @@ xfs_btree_lookup( *stat = 1; return 0; } - } else if (dir == XFS_LOOKUP_LE && diff > 0) + } else if (dir == XFS_LOOKUP_LE && cmp_r > 0) keyno--; cur->bc_levels[0].ptr = keyno; /* Return if we succeeded or not. */ if (keyno == 0 || keyno > xfs_btree_get_numrecs(block)) *stat = 0; - else if (dir != XFS_LOOKUP_EQ || diff == 0) + else if (dir != XFS_LOOKUP_EQ || cmp_r == 0) *stat = 1; else *stat = 0; @@ -5058,7 +5059,7 @@ xfs_btree_simple_query_range( int error; ASSERT(cur->bc_ops->init_high_key_from_rec); - ASSERT(cur->bc_ops->diff_two_keys); + ASSERT(cur->bc_ops->cmp_two_keys); /* * Find the leftmost record. The btree cursor must be set @@ -5352,15 +5353,15 @@ xfs_btree_count_blocks( } /* Compare two btree pointers. */ -int64_t -xfs_btree_diff_two_ptrs( +int +xfs_btree_cmp_two_ptrs( struct xfs_btree_cur *cur, const union xfs_btree_ptr *a, const union xfs_btree_ptr *b) { if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) - return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l); - return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s); + return cmp_int(be64_to_cpu(a->l), be64_to_cpu(b->l)); + return cmp_int(be32_to_cpu(a->s), be32_to_cpu(b->s)); } struct xfs_btree_has_records { diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 355b304696e6..60e78572e725 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -171,20 +171,23 @@ struct xfs_btree_ops { void (*init_high_key_from_rec)(union xfs_btree_key *key, const union xfs_btree_rec *rec); - /* difference between key value and cursor value */ - int64_t (*key_diff)(struct xfs_btree_cur *cur, - const union xfs_btree_key *key); + /* + * Compare key value and cursor value -- positive if key > cur, + * negative if key < cur, and zero if equal. + */ + int (*cmp_key_with_cur)(struct xfs_btree_cur *cur, + const union xfs_btree_key *key); /* - * Difference between key2 and key1 -- positive if key1 > key2, - * negative if key1 < key2, and zero if equal. If the @mask parameter - * is non NULL, each key field to be used in the comparison must - * contain a nonzero value. + * Compare key1 and key2 -- positive if key1 > key2, negative if + * key1 < key2, and zero if equal. If the @mask parameter is non NULL, + * each key field to be used in the comparison must contain a nonzero + * value. */ - int64_t (*diff_two_keys)(struct xfs_btree_cur *cur, - const union xfs_btree_key *key1, - const union xfs_btree_key *key2, - const union xfs_btree_key *mask); + int (*cmp_two_keys)(struct xfs_btree_cur *cur, + const union xfs_btree_key *key1, + const union xfs_btree_key *key2, + const union xfs_btree_key *mask); const struct xfs_buf_ops *buf_ops; @@ -516,9 +519,9 @@ struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur, int level, struct xfs_buf **bpp); bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr); -int64_t xfs_btree_diff_two_ptrs(struct xfs_btree_cur *cur, - const union xfs_btree_ptr *a, - const union xfs_btree_ptr *b); +int xfs_btree_cmp_two_ptrs(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *a, + const union xfs_btree_ptr *b); void xfs_btree_get_sibling(struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_ptr *ptr, int lr); @@ -546,7 +549,7 @@ xfs_btree_keycmp_lt( const union xfs_btree_key *key1, const union xfs_btree_key *key2) { - return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) < 0; + return cur->bc_ops->cmp_two_keys(cur, key1, key2, NULL) < 0; } static inline bool @@ -555,7 +558,7 @@ xfs_btree_keycmp_gt( const union xfs_btree_key *key1, const union xfs_btree_key *key2) { - return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) > 0; + return cur->bc_ops->cmp_two_keys(cur, key1, key2, NULL) > 0; } static inline bool @@ -564,7 +567,7 @@ xfs_btree_keycmp_eq( const union xfs_btree_key *key1, const union xfs_btree_key *key2) { - return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) == 0; + return cur->bc_ops->cmp_two_keys(cur, key1, key2, NULL) == 0; } static inline bool @@ -602,7 +605,7 @@ xfs_btree_masked_keycmp_lt( const union xfs_btree_key *key2, const union xfs_btree_key *mask) { - return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) < 0; + return cur->bc_ops->cmp_two_keys(cur, key1, key2, mask) < 0; } static inline bool @@ -612,7 +615,7 @@ xfs_btree_masked_keycmp_gt( const union xfs_btree_key *key2, const union xfs_btree_key *mask) { - return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) > 0; + return cur->bc_ops->cmp_two_keys(cur, key1, key2, mask) > 0; } static inline bool diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 17d9e6154f19..90f7fc219fcc 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -565,7 +565,7 @@ xfs_da3_split( trace_xfs_da_split(state->args); - if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT)) + if (XFS_TEST_ERROR(state->mp, XFS_ERRTAG_DA_LEAF_SPLIT)) return -EIO; /* @@ -2833,6 +2833,12 @@ xfs_da_read_buf( &bp, ops); if (xfs_metadata_is_sick(error)) xfs_dirattr_mark_sick(dp, whichfork); + /* + * ENODATA from disk implies a disk medium failure; ENODATA for + * xattrs means attribute not found, so disambiguate that here. + */ + if (error == -ENODATA && whichfork == XFS_ATTR_FORK) + error = -EIO; if (error) goto out_free; diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 1775abcfa04d..82a338458a51 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -223,7 +223,7 @@ xfs_dir_ino_validate( bool ino_ok = xfs_verify_dir_ino(mp, ino); if (XFS_IS_CORRUPT(mp, !ino_ok) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DIR_INO_VALIDATE)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_DIR_INO_VALIDATE)) { xfs_warn(mp, "Invalid inode number 0x%Lx", (unsigned long long) ino); return -EFSCORRUPTED; diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index a53c5d40e084..57e47077c75a 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -4,14 +4,22 @@ * Copyright (C) 2017 Oracle. * All Rights Reserved. */ -#ifndef __XFS_ERRORTAG_H_ +#if !defined(__XFS_ERRORTAG_H_) || defined(XFS_ERRTAG) #define __XFS_ERRORTAG_H_ /* - * error injection tags - the labels can be anything you want - * but each tag should have its own unique number + * There are two ways to use this header file. The first way is to #include it + * bare, which will define all the XFS_ERRTAG_* error injection knobs for use + * with the XFS_TEST_ERROR macro. The second way is to enclose the #include + * with a #define for an XFS_ERRTAG macro, in which case the header will define + " an XFS_ERRTAGS macro that expands to invoke that XFS_ERRTAG macro for each + * defined error injection knob. */ +/* + * These are the actual error injection tags. The numbers should be consecutive + * because arrays are sized based on the maximum. + */ #define XFS_ERRTAG_NOERROR 0 #define XFS_ERRTAG_IFLUSH_1 1 #define XFS_ERRTAG_IFLUSH_2 2 @@ -65,55 +73,69 @@ #define XFS_ERRTAG_WRITE_DELAY_MS 43 #define XFS_ERRTAG_EXCHMAPS_FINISH_ONE 44 #define XFS_ERRTAG_METAFILE_RESV_CRITICAL 45 -#define XFS_ERRTAG_MAX 46 +#define XFS_ERRTAG_FORCE_ZERO_RANGE 46 +#define XFS_ERRTAG_MAX 47 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. */ #define XFS_RANDOM_DEFAULT 100 -#define XFS_RANDOM_IFLUSH_1 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_2 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_3 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_4 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_5 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IFLUSH_6 XFS_RANDOM_DEFAULT -#define XFS_RANDOM_DA_READ_BUF XFS_RANDOM_DEFAULT -#define XFS_RANDOM_BTREE_CHECK_LBLOCK (XFS_RANDOM_DEFAULT/4) -#define XFS_RANDOM_BTREE_CHECK_SBLOCK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_ALLOC_READ_AGF XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IALLOC_READ_AGI XFS_RANDOM_DEFAULT -#define XFS_RANDOM_ITOBP_INOTOBP XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IUNLINK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IUNLINK_REMOVE XFS_RANDOM_DEFAULT -#define XFS_RANDOM_DIR_INO_VALIDATE XFS_RANDOM_DEFAULT -#define XFS_RANDOM_BULKSTAT_READ_CHUNK XFS_RANDOM_DEFAULT -#define XFS_RANDOM_IODONE_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_STRATREAD_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_STRATCMPL_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT -#define XFS_RANDOM_FREE_EXTENT 1 -#define XFS_RANDOM_RMAP_FINISH_ONE 1 -#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE 1 -#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 -#define XFS_RANDOM_BMAP_FINISH_ONE 1 -#define XFS_RANDOM_AG_RESV_CRITICAL 4 -#define XFS_RANDOM_LOG_BAD_CRC 1 -#define XFS_RANDOM_LOG_ITEM_PIN 1 -#define XFS_RANDOM_BUF_LRU_REF 2 -#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1 -#define XFS_RANDOM_FORCE_SUMMARY_RECALC 1 -#define XFS_RANDOM_IUNLINK_FALLBACK (XFS_RANDOM_DEFAULT/10) -#define XFS_RANDOM_BUF_IOERROR XFS_RANDOM_DEFAULT -#define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1 -#define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1 -#define XFS_RANDOM_AG_RESV_FAIL 1 -#define XFS_RANDOM_LARP 1 -#define XFS_RANDOM_DA_LEAF_SPLIT 1 -#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1 -#define XFS_RANDOM_WB_DELAY_MS 3000 -#define XFS_RANDOM_WRITE_DELAY_MS 3000 -#define XFS_RANDOM_EXCHMAPS_FINISH_ONE 1 -#define XFS_RANDOM_METAFILE_RESV_CRITICAL 4 + +/* + * Table of errror injection knobs. The parameters to the XFS_ERRTAG macro are: + * 1. The XFS_ERRTAG_ flag but without the prefix; + * 2. The name of the sysfs knob; and + * 3. The default value for the knob. + */ +#ifdef XFS_ERRTAG +# undef XFS_ERRTAGS +# define XFS_ERRTAGS \ +XFS_ERRTAG(NOERROR, noerror, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_1, iflush1, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_2, iflush2, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_3, iflush3, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_4, iflush4, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_5, iflush5, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IFLUSH_6, iflush6, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(DA_READ_BUF, dareadbuf, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(BTREE_CHECK_LBLOCK, btree_chk_lblk, XFS_RANDOM_DEFAULT/4) \ +XFS_ERRTAG(BTREE_CHECK_SBLOCK, btree_chk_sblk, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(ALLOC_READ_AGF, readagf, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IALLOC_READ_AGI, readagi, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(ITOBP_INOTOBP, itobp, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IUNLINK, iunlink, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IUNLINK_REMOVE, iunlinkrm, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(DIR_INO_VALIDATE, dirinovalid, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(BULKSTAT_READ_CHUNK, bulkstat, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(IODONE_IOERR, logiodone, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(STRATREAD_IOERR, stratread, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(STRATCMPL_IOERR, stratcmpl, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(DIOWRITE_IOERR, diowrite, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(BMAPIFORMAT, bmapifmt, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(FREE_EXTENT, free_extent, 1) \ +XFS_ERRTAG(RMAP_FINISH_ONE, rmap_finish_one, 1) \ +XFS_ERRTAG(REFCOUNT_CONTINUE_UPDATE, refcount_continue_update, 1) \ +XFS_ERRTAG(REFCOUNT_FINISH_ONE, refcount_finish_one, 1) \ +XFS_ERRTAG(BMAP_FINISH_ONE, bmap_finish_one, 1) \ +XFS_ERRTAG(AG_RESV_CRITICAL, ag_resv_critical, 4) \ +XFS_ERRTAG(LOG_BAD_CRC, log_bad_crc, 1) \ +XFS_ERRTAG(LOG_ITEM_PIN, log_item_pin, 1) \ +XFS_ERRTAG(BUF_LRU_REF, buf_lru_ref, 2) \ +XFS_ERRTAG(FORCE_SCRUB_REPAIR, force_repair, 1) \ +XFS_ERRTAG(FORCE_SUMMARY_RECALC, bad_summary, 1) \ +XFS_ERRTAG(IUNLINK_FALLBACK, iunlink_fallback, XFS_RANDOM_DEFAULT/10) \ +XFS_ERRTAG(BUF_IOERROR, buf_ioerror, XFS_RANDOM_DEFAULT) \ +XFS_ERRTAG(REDUCE_MAX_IEXTENTS, reduce_max_iextents, 1) \ +XFS_ERRTAG(BMAP_ALLOC_MINLEN_EXTENT, bmap_alloc_minlen_extent, 1) \ +XFS_ERRTAG(AG_RESV_FAIL, ag_resv_fail, 1) \ +XFS_ERRTAG(LARP, larp, 1) \ +XFS_ERRTAG(DA_LEAF_SPLIT, da_leaf_split, 1) \ +XFS_ERRTAG(ATTR_LEAF_TO_NODE, attr_leaf_to_node, 1) \ +XFS_ERRTAG(WB_DELAY_MS, wb_delay_ms, 3000) \ +XFS_ERRTAG(WRITE_DELAY_MS, write_delay_ms, 3000) \ +XFS_ERRTAG(EXCHMAPS_FINISH_ONE, exchmaps_finish_one, 1) \ +XFS_ERRTAG(METAFILE_RESV_CRITICAL, metafile_resv_crit, 4) \ +XFS_ERRTAG(FORCE_ZERO_RANGE, force_zero_range, 4) +#endif /* XFS_ERRTAG */ #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_exchmaps.c b/fs/xfs/libxfs/xfs_exchmaps.c index 3f1d6a98c118..932ee4619e9e 100644 --- a/fs/xfs/libxfs/xfs_exchmaps.c +++ b/fs/xfs/libxfs/xfs_exchmaps.c @@ -616,7 +616,7 @@ xfs_exchmaps_finish_one( return error; } - if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE)) + if (XFS_TEST_ERROR(tp->t_mountp, XFS_ERRTAG_EXCHMAPS_FINISH_ONE)) return -EIO; /* If we still have work to do, ask for a new transaction. */ @@ -882,7 +882,7 @@ xmi_ensure_delta_nextents( &new_nextents)) return -EFBIG; - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && new_nextents > 10) return -EFBIG; diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 9566a7623365..779dac59b1f3 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -112,7 +112,7 @@ typedef struct xfs_sb { uint16_t sb_sectsize; /* volume sector size, bytes */ uint16_t sb_inodesize; /* inode size, bytes */ uint16_t sb_inopblock; /* inodes per block */ - char sb_fname[XFSLABEL_MAX]; /* file system name */ + char sb_fname[XFSLABEL_MAX] __nonstring; /* file system name */ uint8_t sb_blocklog; /* log2 of sb_blocksize */ uint8_t sb_sectlog; /* log2 of sb_sectsize */ uint8_t sb_inodelog; /* log2 of sb_inodesize */ diff --git a/fs/xfs/libxfs/xfs_group.c b/fs/xfs/libxfs/xfs_group.c index e9d76bcdc820..792f76d2e2a0 100644 --- a/fs/xfs/libxfs/xfs_group.c +++ b/fs/xfs/libxfs/xfs_group.c @@ -163,7 +163,8 @@ xfs_group_free( xfs_defer_drain_free(&xg->xg_intents_drain); #ifdef __KERNEL__ - kfree(xg->xg_busy_extents); + if (xfs_group_has_extent_busy(xg->xg_mount, xg->xg_type)) + kfree(xg->xg_busy_extents); #endif if (uninit) @@ -171,7 +172,8 @@ xfs_group_free( /* drop the mount's active reference */ xfs_group_rele(xg); - XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_active_ref) != 0); + XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_active_ref) > 0); + XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_active_ref) < 0); kfree_rcu_mightsleep(xg); } @@ -189,9 +191,11 @@ xfs_group_insert( xg->xg_type = type; #ifdef __KERNEL__ - xg->xg_busy_extents = xfs_extent_busy_alloc(); - if (!xg->xg_busy_extents) - return -ENOMEM; + if (xfs_group_has_extent_busy(mp, type)) { + xg->xg_busy_extents = xfs_extent_busy_alloc(); + if (!xg->xg_busy_extents) + return -ENOMEM; + } spin_lock_init(&xg->xg_state_lock); xfs_hooks_init(&xg->xg_rmap_update_hooks); #endif @@ -210,7 +214,8 @@ xfs_group_insert( out_drain: xfs_defer_drain_free(&xg->xg_intents_drain); #ifdef __KERNEL__ - kfree(xg->xg_busy_extents); + if (xfs_group_has_extent_busy(xg->xg_mount, xg->xg_type)) + kfree(xg->xg_busy_extents); #endif return error; } diff --git a/fs/xfs/libxfs/xfs_group.h b/fs/xfs/libxfs/xfs_group.h index 4423932a2313..4ae638f1c2c5 100644 --- a/fs/xfs/libxfs/xfs_group.h +++ b/fs/xfs/libxfs/xfs_group.h @@ -98,6 +98,15 @@ xfs_group_max_blocks( return xg->xg_mount->m_groups[xg->xg_type].blocks; } +static inline xfs_rfsblock_t +xfs_groups_to_rfsbs( + struct xfs_mount *mp, + uint32_t nr_groups, + enum xfs_group_type type) +{ + return (xfs_rfsblock_t)mp->m_groups[type].blocks * nr_groups; +} + static inline xfs_fsblock_t xfs_group_start_fsb( struct xfs_group *xg) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 0c47b5c6ca7d..d97295eaebe6 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2140,7 +2140,7 @@ xfs_difree_inobt( * remove the chunk if the block size is large enough for multiple inode * chunks (that might not be free). */ - if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && + if (rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { xic->deleted = true; xic->first_ino = xfs_agino_to_ino(pag, rec.ir_startino); @@ -2286,7 +2286,7 @@ xfs_difree_finobt( * enough for multiple chunks. Leave the finobt record to remain in sync * with the inobt. */ - if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && + if (rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { error = xfs_btree_delete(cur, &i); if (error) @@ -2706,7 +2706,7 @@ xfs_agi_read_verify( xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_agi_verify(bp); - if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI)) + if (fa || XFS_TEST_ERROR(mp, XFS_ERRTAG_IALLOC_READ_AGI)) xfs_verifier_error(bp, -EFSCORRUPTED, fa); } } @@ -2801,12 +2801,35 @@ xfs_ialloc_read_agi( set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); } +#ifdef DEBUG /* - * It's possible for these to be out of sync if - * we are in the middle of a forced shutdown. + * It's possible for the AGF to be out of sync if the block device is + * silently dropping writes. This can happen in fstests with dmflakey + * enabled, which allows the buffer to be cleaned and reclaimed by + * memory pressure and then re-read from disk here. We will get a + * stale version of the AGF from disk, and nothing good can happen from + * here. Hence if we detect this situation, immediately shut down the + * filesystem. + * + * This can also happen if we are already in the middle of a forced + * shutdown, so don't bother checking if we are already shut down. */ - ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || - xfs_is_shutdown(pag_mount(pag))); + if (!xfs_is_shutdown(pag_mount(pag))) { + bool ok = true; + + ok &= pag->pagi_freecount == be32_to_cpu(agi->agi_freecount); + ok &= pag->pagi_count == be32_to_cpu(agi->agi_count); + + if (XFS_IS_CORRUPT(pag_mount(pag), !ok)) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); + xfs_trans_brelse(tp, agibp); + xfs_force_shutdown(pag_mount(pag), + SHUTDOWN_CORRUPT_ONDISK); + return -EFSCORRUPTED; + } + } +#endif /* DEBUG */ + if (agibpp) *agibpp = agibp; else diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 6f270d8f4270..100afdd66cdd 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -265,17 +265,17 @@ xfs_finobt_init_ptr_from_cur( ptr->s = agi->agi_free_root; } -STATIC int64_t -xfs_inobt_key_diff( +STATIC int +xfs_inobt_cmp_key_with_cur( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { - return (int64_t)be32_to_cpu(key->inobt.ir_startino) - - cur->bc_rec.i.ir_startino; + return cmp_int(be32_to_cpu(key->inobt.ir_startino), + cur->bc_rec.i.ir_startino); } -STATIC int64_t -xfs_inobt_diff_two_keys( +STATIC int +xfs_inobt_cmp_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2, @@ -283,8 +283,8 @@ xfs_inobt_diff_two_keys( { ASSERT(!mask || mask->inobt.ir_startino); - return (int64_t)be32_to_cpu(k1->inobt.ir_startino) - - be32_to_cpu(k2->inobt.ir_startino); + return cmp_int(be32_to_cpu(k1->inobt.ir_startino), + be32_to_cpu(k2->inobt.ir_startino)); } static xfs_failaddr_t @@ -430,9 +430,9 @@ const struct xfs_btree_ops xfs_inobt_ops = { .init_high_key_from_rec = xfs_inobt_init_high_key_from_rec, .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, - .key_diff = xfs_inobt_key_diff, + .cmp_key_with_cur = xfs_inobt_cmp_key_with_cur, .buf_ops = &xfs_inobt_buf_ops, - .diff_two_keys = xfs_inobt_diff_two_keys, + .cmp_two_keys = xfs_inobt_cmp_two_keys, .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, .keys_contiguous = xfs_inobt_keys_contiguous, @@ -460,9 +460,9 @@ const struct xfs_btree_ops xfs_finobt_ops = { .init_high_key_from_rec = xfs_inobt_init_high_key_from_rec, .init_rec_from_cur = xfs_inobt_init_rec_from_cur, .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur, - .key_diff = xfs_inobt_key_diff, + .cmp_key_with_cur = xfs_inobt_cmp_key_with_cur, .buf_ops = &xfs_finobt_buf_ops, - .diff_two_keys = xfs_inobt_diff_two_keys, + .cmp_two_keys = xfs_inobt_cmp_two_keys, .keys_inorder = xfs_inobt_keys_inorder, .recs_inorder = xfs_inobt_recs_inorder, .keys_contiguous = xfs_inobt_keys_contiguous, diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index aa13fc00afd7..b1812b2c3cce 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -61,8 +61,8 @@ xfs_inode_buf_verify( di_ok = xfs_verify_magic16(bp, dip->di_magic) && xfs_dinode_good_version(mp, dip->di_version) && xfs_verify_agino_or_null(bp->b_pag, unlinked_ino); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, - XFS_ERRTAG_ITOBP_INOTOBP))) { + if (unlikely(!di_ok || + XFS_TEST_ERROR(mp, XFS_ERRTAG_ITOBP_INOTOBP))) { if (readahead) { bp->b_flags &= ~XBF_DONE; xfs_buf_ioerror(bp, -EIO); diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 4f99b90add55..1772d82f2d68 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -756,8 +756,7 @@ xfs_iext_count_extend( if (nr_exts < ifp->if_nextents) return -EFBIG; - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && - nr_exts > 10) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REDUCE_MAX_IEXTENTS) && nr_exts > 10) return -EFBIG; if (nr_exts > xfs_iext_max_nextents(has_large, whichfork)) { diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c index 48fe49a5f050..309ce6dd5553 100644 --- a/fs/xfs/libxfs/xfs_inode_util.c +++ b/fs/xfs/libxfs/xfs_inode_util.c @@ -299,17 +299,6 @@ xfs_inode_init( } else { inode_init_owner(args->idmap, inode, dir, args->mode); } - - /* - * If the group ID of the new file does not match the effective - * group ID or one of the supplementary group IDs, the S_ISGID - * bit is cleared (and only if the irix_sgid_inherit - * compatibility variable is set). - */ - if (irix_sgid_inherit && (inode->i_mode & S_ISGID) && - !vfsgid_in_group_p(i_gid_into_vfsgid(args->idmap, inode))) - inode->i_mode &= ~S_ISGID; - ip->i_projid = xfs_get_initial_prid(pip); } diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 0d637c276db0..908e7060428c 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -31,6 +31,7 @@ typedef uint32_t xlog_tid_t; #define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */ #define XLOG_MAX_RECORD_BSIZE (256*1024) #define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */ +#define XLOG_CYCLE_DATA_SIZE (XLOG_HEADER_CYCLE_SIZE / BBSIZE) #define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */ #define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */ #define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */ @@ -86,43 +87,6 @@ struct xfs_unmount_log_format { uint32_t pad2; /* may as well make it 64 bits */ }; -/* Region types for iovec's i_type */ -#define XLOG_REG_TYPE_BFORMAT 1 -#define XLOG_REG_TYPE_BCHUNK 2 -#define XLOG_REG_TYPE_EFI_FORMAT 3 -#define XLOG_REG_TYPE_EFD_FORMAT 4 -#define XLOG_REG_TYPE_IFORMAT 5 -#define XLOG_REG_TYPE_ICORE 6 -#define XLOG_REG_TYPE_IEXT 7 -#define XLOG_REG_TYPE_IBROOT 8 -#define XLOG_REG_TYPE_ILOCAL 9 -#define XLOG_REG_TYPE_IATTR_EXT 10 -#define XLOG_REG_TYPE_IATTR_BROOT 11 -#define XLOG_REG_TYPE_IATTR_LOCAL 12 -#define XLOG_REG_TYPE_QFORMAT 13 -#define XLOG_REG_TYPE_DQUOT 14 -#define XLOG_REG_TYPE_QUOTAOFF 15 -#define XLOG_REG_TYPE_LRHEADER 16 -#define XLOG_REG_TYPE_UNMOUNT 17 -#define XLOG_REG_TYPE_COMMIT 18 -#define XLOG_REG_TYPE_TRANSHDR 19 -#define XLOG_REG_TYPE_ICREATE 20 -#define XLOG_REG_TYPE_RUI_FORMAT 21 -#define XLOG_REG_TYPE_RUD_FORMAT 22 -#define XLOG_REG_TYPE_CUI_FORMAT 23 -#define XLOG_REG_TYPE_CUD_FORMAT 24 -#define XLOG_REG_TYPE_BUI_FORMAT 25 -#define XLOG_REG_TYPE_BUD_FORMAT 26 -#define XLOG_REG_TYPE_ATTRI_FORMAT 27 -#define XLOG_REG_TYPE_ATTRD_FORMAT 28 -#define XLOG_REG_TYPE_ATTR_NAME 29 -#define XLOG_REG_TYPE_ATTR_VALUE 30 -#define XLOG_REG_TYPE_XMI_FORMAT 31 -#define XLOG_REG_TYPE_XMD_FORMAT 32 -#define XLOG_REG_TYPE_ATTR_NEWNAME 33 -#define XLOG_REG_TYPE_ATTR_NEWVALUE 34 -#define XLOG_REG_TYPE_MAX 34 - /* * Flags to log operation header * @@ -141,14 +105,13 @@ struct xfs_unmount_log_format { #define XLOG_END_TRANS 0x10 /* End a continued transaction */ #define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */ - -typedef struct xlog_op_header { +struct xlog_op_header { __be32 oh_tid; /* transaction id of operation : 4 b */ __be32 oh_len; /* bytes in data region : 4 b */ __u8 oh_clientid; /* who sent me this : 1 b */ __u8 oh_flags; /* : 1 b */ __u16 oh_res2; /* 32 bit align : 2 b */ -} xlog_op_header_t; +}; /* valid values for h_fmt */ #define XLOG_FMT_UNKNOWN 0 @@ -163,7 +126,17 @@ typedef struct xlog_op_header { #define XLOG_FMT XLOG_FMT_LINUX_LE #endif -typedef struct xlog_rec_header { +struct xlog_rec_ext_header { + __be32 xh_cycle; /* write cycle of log */ + __be32 xh_cycle_data[XLOG_CYCLE_DATA_SIZE]; + __u8 xh_reserved[252]; +}; + +/* actual ext header payload size for checksumming */ +#define XLOG_REC_EXT_SIZE \ + offsetofend(struct xlog_rec_ext_header, xh_cycle_data) + +struct xlog_rec_header { __be32 h_magicno; /* log record (LR) identifier : 4 */ __be32 h_cycle; /* write cycle of log : 4 */ __be32 h_version; /* LR version : 4 */ @@ -173,34 +146,50 @@ typedef struct xlog_rec_header { __le32 h_crc; /* crc of log record : 4 */ __be32 h_prev_block; /* block number to previous LR : 4 */ __be32 h_num_logops; /* number of log operations in this LR : 4 */ - __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; - /* new fields */ + __be32 h_cycle_data[XLOG_CYCLE_DATA_SIZE]; + + /* fields added by the Linux port: */ __be32 h_fmt; /* format of log record : 4 */ uuid_t h_fs_uuid; /* uuid of FS : 16 */ + + /* fields added for log v2: */ __be32 h_size; /* iclog size : 4 */ -} xlog_rec_header_t; -typedef struct xlog_rec_ext_header { - __be32 xh_cycle; /* write cycle of log : 4 */ - __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */ -} xlog_rec_ext_header_t; + /* + * When h_size added for log v2 support, it caused structure to have + * a different size on i386 vs all other architectures because the + * sum of the size ofthe member is not aligned by that of the largest + * __be64-sized member, and i386 has really odd struct alignment rules. + * + * Due to the way the log headers are placed out on-disk that alone is + * not a problem becaue the xlog_rec_header always sits alone in a + * BBSIZEs area, and the rest of that area is padded with zeroes. + * But xlog_cksum used to calculate the checksum based on the structure + * size, and thus gives different checksums for i386 vs the rest. + * We now do two checksum validation passes for both sizes to allow + * moving v5 file systems with unclean logs between i386 and other + * (little-endian) architectures. + */ + __u32 h_pad0; -/* - * Quite misnamed, because this union lays out the actual on-disk log buffer. - */ -typedef union xlog_in_core2 { - xlog_rec_header_t hic_header; - xlog_rec_ext_header_t hic_xheader; - char hic_sector[XLOG_HEADER_SIZE]; -} xlog_in_core_2_t; + __u8 h_reserved[184]; + struct xlog_rec_ext_header h_ext[]; +}; + +#ifdef __i386__ +#define XLOG_REC_SIZE offsetofend(struct xlog_rec_header, h_size) +#define XLOG_REC_SIZE_OTHER offsetofend(struct xlog_rec_header, h_pad0) +#else +#define XLOG_REC_SIZE offsetofend(struct xlog_rec_header, h_pad0) +#define XLOG_REC_SIZE_OTHER offsetofend(struct xlog_rec_header, h_size) +#endif /* __i386__ */ /* not an on-disk structure, but needed by log recovery in userspace */ -typedef struct xfs_log_iovec { +struct xfs_log_iovec { void *i_addr; /* beginning address of region */ int i_len; /* length in bytes of region */ uint i_type; /* type of region */ -} xfs_log_iovec_t; - +}; /* * Transaction Header definitions. @@ -213,12 +202,12 @@ typedef struct xfs_log_iovec { * Do not change the below structure without redoing the code in * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans(). */ -typedef struct xfs_trans_header { +struct xfs_trans_header { uint th_magic; /* magic number */ uint th_type; /* transaction type */ int32_t th_tid; /* transaction id (unused) */ uint th_num_items; /* num items logged by trans */ -} xfs_trans_header_t; +}; #define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */ @@ -542,7 +531,7 @@ struct xfs_log_dinode { #define __XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) #define XFS_BLF_DATAMAP_SIZE (__XFS_BLF_DATAMAP_SIZE + 1) -typedef struct xfs_buf_log_format { +struct xfs_buf_log_format { unsigned short blf_type; /* buf log item type indicator */ unsigned short blf_size; /* size of this item */ unsigned short blf_flags; /* misc state */ @@ -550,7 +539,7 @@ typedef struct xfs_buf_log_format { int64_t blf_blkno; /* starting blkno of this buf */ unsigned int blf_map_size; /* used size of data bitmap in words */ unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ -} xfs_buf_log_format_t; +}; /* * All buffers now need to tell recovery where the magic number @@ -606,40 +595,41 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf) /* * EFI/EFD log format definitions */ -typedef struct xfs_extent { +struct xfs_extent { xfs_fsblock_t ext_start; xfs_extlen_t ext_len; -} xfs_extent_t; +}; /* - * Since an xfs_extent_t has types (start:64, len: 32) - * there are different alignments on 32 bit and 64 bit kernels. - * So we provide the different variants for use by a - * conversion routine. + * Since the structures in struct xfs_extent add up to 96 bytes, it has + * different alignments on i386 vs all other architectures, because i386 + * does not pad structures to their natural alignment. + * + * Provide the different variants for use by a conversion routine. */ -typedef struct xfs_extent_32 { +struct xfs_extent_32 { uint64_t ext_start; uint32_t ext_len; -} __attribute__((packed)) xfs_extent_32_t; +} __attribute__((packed)); -typedef struct xfs_extent_64 { +struct xfs_extent_64 { uint64_t ext_start; uint32_t ext_len; uint32_t ext_pad; -} xfs_extent_64_t; +}; /* * This is the structure used to lay out an efi log item in the * log. The efi_extents field is a variable size array whose * size is given by efi_nextents. */ -typedef struct xfs_efi_log_format { +struct xfs_efi_log_format { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_t efi_extents[]; /* array of extents to free */ -} xfs_efi_log_format_t; + struct xfs_extent efi_extents[]; /* array of extents to free */ +}; static inline size_t xfs_efi_log_format_sizeof( @@ -649,13 +639,13 @@ xfs_efi_log_format_sizeof( nr * sizeof(struct xfs_extent); } -typedef struct xfs_efi_log_format_32 { +struct xfs_efi_log_format_32 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_32_t efi_extents[]; /* array of extents to free */ -} __attribute__((packed)) xfs_efi_log_format_32_t; + struct xfs_extent_32 efi_extents[]; /* array of extents to free */ +} __attribute__((packed)); static inline size_t xfs_efi_log_format32_sizeof( @@ -665,13 +655,13 @@ xfs_efi_log_format32_sizeof( nr * sizeof(struct xfs_extent_32); } -typedef struct xfs_efi_log_format_64 { +struct xfs_efi_log_format_64 { uint16_t efi_type; /* efi log item type */ uint16_t efi_size; /* size of this item */ uint32_t efi_nextents; /* # extents to free */ uint64_t efi_id; /* efi identifier */ - xfs_extent_64_t efi_extents[]; /* array of extents to free */ -} xfs_efi_log_format_64_t; + struct xfs_extent_64 efi_extents[]; /* array of extents to free */ +}; static inline size_t xfs_efi_log_format64_sizeof( @@ -686,13 +676,13 @@ xfs_efi_log_format64_sizeof( * log. The efd_extents array is a variable size array whose * size is given by efd_nextents; */ -typedef struct xfs_efd_log_format { +struct xfs_efd_log_format { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_t efd_extents[]; /* array of extents freed */ -} xfs_efd_log_format_t; + struct xfs_extent efd_extents[]; /* array of extents freed */ +}; static inline size_t xfs_efd_log_format_sizeof( @@ -702,13 +692,13 @@ xfs_efd_log_format_sizeof( nr * sizeof(struct xfs_extent); } -typedef struct xfs_efd_log_format_32 { +struct xfs_efd_log_format_32 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_32_t efd_extents[]; /* array of extents freed */ -} __attribute__((packed)) xfs_efd_log_format_32_t; + struct xfs_extent_32 efd_extents[]; /* array of extents freed */ +} __attribute__((packed)); static inline size_t xfs_efd_log_format32_sizeof( @@ -718,13 +708,13 @@ xfs_efd_log_format32_sizeof( nr * sizeof(struct xfs_extent_32); } -typedef struct xfs_efd_log_format_64 { +struct xfs_efd_log_format_64 { uint16_t efd_type; /* efd log item type */ uint16_t efd_size; /* size of this item */ uint32_t efd_nextents; /* # of extents freed */ uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_64_t efd_extents[]; /* array of extents freed */ -} xfs_efd_log_format_64_t; + struct xfs_extent_64 efd_extents[]; /* array of extents freed */ +}; static inline size_t xfs_efd_log_format64_sizeof( @@ -957,14 +947,14 @@ struct xfs_xmd_log_format { * The first two fields must be the type and size fitting into * 32 bits : log_recovery code assumes that. */ -typedef struct xfs_dq_logformat { +struct xfs_dq_logformat { uint16_t qlf_type; /* dquot log item type */ uint16_t qlf_size; /* size of this item */ xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */ int64_t qlf_blkno; /* blkno of dquot buffer */ int32_t qlf_len; /* len of dquot buffer */ uint32_t qlf_boffset; /* off of dquot in buffer */ -} xfs_dq_logformat_t; +}; /* * log format struct for QUOTAOFF records. @@ -974,12 +964,12 @@ typedef struct xfs_dq_logformat { * to the first and ensures that the first logitem is taken out of the AIL * only when the last one is securely committed. */ -typedef struct xfs_qoff_logformat { +struct xfs_qoff_logformat { unsigned short qf_type; /* quotaoff log item type */ unsigned short qf_size; /* size of this item */ unsigned int qf_flags; /* USR and/or GRP */ char qf_pad[12]; /* padding for future */ -} xfs_qoff_logformat_t; +}; /* * Disk quotas status in m_qflags, and also sb_qflags. 16 bits. diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 66c7916fb5cd..9e712e62369c 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -104,20 +104,20 @@ struct xlog_recover_item { struct list_head ri_list; int ri_cnt; /* count of regions found */ int ri_total; /* total regions */ - struct xfs_log_iovec *ri_buf; /* ptr to regions buffer */ + struct kvec *ri_buf; /* ptr to regions buffer */ const struct xlog_recover_item_ops *ri_ops; }; struct xlog_recover { struct hlist_node r_list; xlog_tid_t r_log_tid; /* log's transaction id */ - xfs_trans_header_t r_theader; /* trans header for partial */ + struct xfs_trans_header r_theader; /* trans header for partial */ int r_state; /* not needed */ xfs_lsn_t r_lsn; /* xact lsn */ struct list_head r_itemq; /* q for items */ }; -#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr) +#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].iov_base) #define XLOG_RECOVER_CRCPASS 0 #define XLOG_RECOVER_PASS1 1 diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index d3bd6a86c8fe..34bba96d30ca 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -91,6 +91,7 @@ xfs_log_calc_trans_resv_for_minlogblocks( */ if (xfs_want_minlogsize_fixes(&mp->m_sb)) { xfs_trans_resv_calc(mp, resv); + resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend; return; } @@ -107,6 +108,9 @@ xfs_log_calc_trans_resv_for_minlogblocks( xfs_trans_resv_calc(mp, resv); + /* Copy the dynamic transaction reservation types from the running fs */ + resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend; + if (xfs_has_reflink(mp)) { /* * In the early days of reflink, typical log operation counts diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c index 225923e463c4..b02e3d6c0868 100644 --- a/fs/xfs/libxfs/xfs_metafile.c +++ b/fs/xfs/libxfs/xfs_metafile.c @@ -121,7 +121,7 @@ xfs_metafile_resv_critical( div_u64(mp->m_metafile_resv_target, 10))) return true; - return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL); + return XFS_TEST_ERROR(mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL); } /* Allocate a block from the metadata file's reservation. */ diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index 5ed44fdf7491..2e9715cc1641 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -174,7 +174,11 @@ xfs_check_ondisk_structs(void) XFS_CHECK_STRUCT_SIZE(struct xfs_rud_log_format, 16); XFS_CHECK_STRUCT_SIZE(struct xfs_map_extent, 32); XFS_CHECK_STRUCT_SIZE(struct xfs_phys_extent, 16); + XFS_CHECK_STRUCT_SIZE(struct xlog_rec_header, 512); + XFS_CHECK_STRUCT_SIZE(struct xlog_rec_ext_header, 512); + XFS_CHECK_OFFSET(struct xlog_rec_header, h_reserved, 328); + XFS_CHECK_OFFSET(struct xlog_rec_ext_header, xh_reserved, 260); XFS_CHECK_OFFSET(struct xfs_bui_log_format, bui_extents, 16); XFS_CHECK_OFFSET(struct xfs_cui_log_format, cui_extents, 16); XFS_CHECK_OFFSET(struct xfs_rui_log_format, rui_extents, 16); diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 763d941a8420..551d7ae46c5c 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -29,11 +29,9 @@ typedef uint8_t xfs_dqtype_t; * flags for q_flags field in the dquot. */ #define XFS_DQFLAG_DIRTY (1u << 0) /* dquot is dirty */ -#define XFS_DQFLAG_FREEING (1u << 1) /* dquot is being torn down */ #define XFS_DQFLAG_STRINGS \ - { XFS_DQFLAG_DIRTY, "DIRTY" }, \ - { XFS_DQFLAG_FREEING, "FREEING" } + { XFS_DQFLAG_DIRTY, "DIRTY" } /* * We have the possibility of all three quota types being active at once, and diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index cebe83f7842a..2484dc9f6d7e 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1113,8 +1113,7 @@ xfs_refcount_still_have_space( * refcount continue update "error" has been injected. */ if (cur->bc_refc.nr_ops > 2 && - XFS_TEST_ERROR(false, cur->bc_mp, - XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE)) + XFS_TEST_ERROR(cur->bc_mp, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE)) return false; if (cur->bc_refc.nr_ops == 0) @@ -1398,7 +1397,7 @@ xfs_refcount_finish_one( trace_xfs_refcount_deferred(mp, ri); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) return -EIO; /* @@ -1511,7 +1510,7 @@ xfs_rtrefcount_finish_one( trace_xfs_refcount_deferred(mp, ri); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_REFCOUNT_FINISH_ONE)) return -EIO; /* @@ -2099,9 +2098,7 @@ xfs_refcount_recover_cow_leftovers( * recording the CoW debris we cancel the (empty) transaction * and everything goes away cleanly. */ - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - return error; + tp = xfs_trans_alloc_empty(mp); if (isrt) { xfs_rtgroup_lock(to_rtg(xg), XFS_RTGLOCK_REFCOUNT); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 54505fee1852..06da3ca14727 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -174,8 +174,8 @@ xfs_refcountbt_init_ptr_from_cur( ptr->s = agf->agf_refcount_root; } -STATIC int64_t -xfs_refcountbt_key_diff( +STATIC int +xfs_refcountbt_cmp_key_with_cur( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { @@ -185,11 +185,11 @@ xfs_refcountbt_key_diff( start = xfs_refcount_encode_startblock(irec->rc_startblock, irec->rc_domain); - return (int64_t)be32_to_cpu(kp->rc_startblock) - start; + return cmp_int(be32_to_cpu(kp->rc_startblock), start); } -STATIC int64_t -xfs_refcountbt_diff_two_keys( +STATIC int +xfs_refcountbt_cmp_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2, @@ -197,8 +197,8 @@ xfs_refcountbt_diff_two_keys( { ASSERT(!mask || mask->refc.rc_startblock); - return (int64_t)be32_to_cpu(k1->refc.rc_startblock) - - be32_to_cpu(k2->refc.rc_startblock); + return cmp_int(be32_to_cpu(k1->refc.rc_startblock), + be32_to_cpu(k2->refc.rc_startblock)); } STATIC xfs_failaddr_t @@ -339,9 +339,9 @@ const struct xfs_btree_ops xfs_refcountbt_ops = { .init_high_key_from_rec = xfs_refcountbt_init_high_key_from_rec, .init_rec_from_cur = xfs_refcountbt_init_rec_from_cur, .init_ptr_from_cur = xfs_refcountbt_init_ptr_from_cur, - .key_diff = xfs_refcountbt_key_diff, + .cmp_key_with_cur = xfs_refcountbt_cmp_key_with_cur, .buf_ops = &xfs_refcountbt_buf_ops, - .diff_two_keys = xfs_refcountbt_diff_two_keys, + .cmp_two_keys = xfs_refcountbt_cmp_two_keys, .keys_inorder = xfs_refcountbt_keys_inorder, .recs_inorder = xfs_refcountbt_recs_inorder, .keys_contiguous = xfs_refcountbt_keys_contiguous, diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 3cdf50563fec..83e0488ff773 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2690,7 +2690,7 @@ xfs_rmap_finish_one( trace_xfs_rmap_deferred(mp, ri); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_RMAP_FINISH_ONE)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_RMAP_FINISH_ONE)) return -EIO; /* diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 2cab694ac58a..bf16aee50d73 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -243,38 +243,22 @@ static inline uint64_t offset_keymask(uint64_t offset) return offset & ~XFS_RMAP_OFF_UNWRITTEN; } -STATIC int64_t -xfs_rmapbt_key_diff( +STATIC int +xfs_rmapbt_cmp_key_with_cur( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { struct xfs_rmap_irec *rec = &cur->bc_rec.r; const struct xfs_rmap_key *kp = &key->rmap; - __u64 x, y; - int64_t d; - d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; - if (d) - return d; - - x = be64_to_cpu(kp->rm_owner); - y = rec->rm_owner; - if (x > y) - return 1; - else if (y > x) - return -1; - - x = offset_keymask(be64_to_cpu(kp->rm_offset)); - y = offset_keymask(xfs_rmap_irec_offset_pack(rec)); - if (x > y) - return 1; - else if (y > x) - return -1; - return 0; + return cmp_int(be32_to_cpu(kp->rm_startblock), rec->rm_startblock) ?: + cmp_int(be64_to_cpu(kp->rm_owner), rec->rm_owner) ?: + cmp_int(offset_keymask(be64_to_cpu(kp->rm_offset)), + offset_keymask(xfs_rmap_irec_offset_pack(rec))); } -STATIC int64_t -xfs_rmapbt_diff_two_keys( +STATIC int +xfs_rmapbt_cmp_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2, @@ -282,36 +266,31 @@ xfs_rmapbt_diff_two_keys( { const struct xfs_rmap_key *kp1 = &k1->rmap; const struct xfs_rmap_key *kp2 = &k2->rmap; - int64_t d; - __u64 x, y; + int d; /* Doesn't make sense to mask off the physical space part */ ASSERT(!mask || mask->rmap.rm_startblock); - d = (int64_t)be32_to_cpu(kp1->rm_startblock) - - be32_to_cpu(kp2->rm_startblock); + d = cmp_int(be32_to_cpu(kp1->rm_startblock), + be32_to_cpu(kp2->rm_startblock)); if (d) return d; if (!mask || mask->rmap.rm_owner) { - x = be64_to_cpu(kp1->rm_owner); - y = be64_to_cpu(kp2->rm_owner); - if (x > y) - return 1; - else if (y > x) - return -1; + d = cmp_int(be64_to_cpu(kp1->rm_owner), + be64_to_cpu(kp2->rm_owner)); + if (d) + return d; } if (!mask || mask->rmap.rm_offset) { /* Doesn't make sense to allow offset but not owner */ ASSERT(!mask || mask->rmap.rm_owner); - x = offset_keymask(be64_to_cpu(kp1->rm_offset)); - y = offset_keymask(be64_to_cpu(kp2->rm_offset)); - if (x > y) - return 1; - else if (y > x) - return -1; + d = cmp_int(offset_keymask(be64_to_cpu(kp1->rm_offset)), + offset_keymask(be64_to_cpu(kp2->rm_offset))); + if (d) + return d; } return 0; @@ -515,9 +494,9 @@ const struct xfs_btree_ops xfs_rmapbt_ops = { .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec, .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur, .init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur, - .key_diff = xfs_rmapbt_key_diff, + .cmp_key_with_cur = xfs_rmapbt_cmp_key_with_cur, .buf_ops = &xfs_rmapbt_buf_ops, - .diff_two_keys = xfs_rmapbt_diff_two_keys, + .cmp_two_keys = xfs_rmapbt_cmp_two_keys, .keys_inorder = xfs_rmapbt_keys_inorder, .recs_inorder = xfs_rmapbt_recs_inorder, .keys_contiguous = xfs_rmapbt_keys_contiguous, @@ -632,9 +611,9 @@ const struct xfs_btree_ops xfs_rmapbt_mem_ops = { .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec, .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur, .init_ptr_from_cur = xfbtree_init_ptr_from_cur, - .key_diff = xfs_rmapbt_key_diff, + .cmp_key_with_cur = xfs_rmapbt_cmp_key_with_cur, .buf_ops = &xfs_rmapbt_mem_buf_ops, - .diff_two_keys = xfs_rmapbt_diff_two_keys, + .cmp_two_keys = xfs_rmapbt_cmp_two_keys, .keys_inorder = xfs_rmapbt_keys_inorder, .recs_inorder = xfs_rmapbt_recs_inorder, .keys_contiguous = xfs_rmapbt_keys_contiguous, diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 5057536e586c..618061d898d4 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1067,7 +1067,7 @@ xfs_rtfree_extent( ASSERT(rbmip->i_itemp != NULL); xfs_assert_ilocked(rbmip, XFS_ILOCK_EXCL); - if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT)) + if (XFS_TEST_ERROR(mp, XFS_ERRTAG_FREE_EXTENT)) return -EIO; error = xfs_rtcheck_alloc_range(&args, start, len); diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h index d36a6ae0abe5..03f1e2493334 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.h +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -50,6 +50,12 @@ struct xfs_rtgroup { uint8_t *rtg_rsum_cache; struct xfs_open_zone *rtg_open_zone; }; + + /* + * Count of outstanding GC operations for zoned XFS. Any RTG with a + * non-zero rtg_gccount will not be picked as new GC victim. + */ + atomic_t rtg_gccount; }; /* @@ -58,12 +64,6 @@ struct xfs_rtgroup { */ #define XFS_RTG_FREE XA_MARK_0 -/* - * For zoned RT devices this is set on groups that are fully written and that - * have unused blocks. Used by the garbage collection to pick targets. - */ -#define XFS_RTG_RECLAIMABLE XA_MARK_1 - static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg) { return container_of(xg, struct xfs_rtgroup, rtg_group); @@ -365,4 +365,12 @@ static inline int xfs_initialize_rtgroups(struct xfs_mount *mp, # define xfs_rtgroup_get_geometry(rtg, rgeo) (-EOPNOTSUPP) #endif /* CONFIG_XFS_RT */ +static inline xfs_rfsblock_t +xfs_rtgs_to_rfsbs( + struct xfs_mount *mp, + uint32_t nr_groups) +{ + return xfs_groups_to_rfsbs(mp, nr_groups, XG_TYPE_RTG); +} + #endif /* __LIBXFS_RTGROUP_H */ diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c index 3db5e7a4a945..ac11e94b42ae 100644 --- a/fs/xfs/libxfs/xfs_rtrefcount_btree.c +++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c @@ -156,8 +156,8 @@ xfs_rtrefcountbt_init_ptr_from_cur( ptr->l = 0; } -STATIC int64_t -xfs_rtrefcountbt_key_diff( +STATIC int +xfs_rtrefcountbt_cmp_key_with_cur( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { @@ -167,11 +167,11 @@ xfs_rtrefcountbt_key_diff( start = xfs_refcount_encode_startblock(irec->rc_startblock, irec->rc_domain); - return (int64_t)be32_to_cpu(kp->rc_startblock) - start; + return cmp_int(be32_to_cpu(kp->rc_startblock), start); } -STATIC int64_t -xfs_rtrefcountbt_diff_two_keys( +STATIC int +xfs_rtrefcountbt_cmp_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2, @@ -179,8 +179,8 @@ xfs_rtrefcountbt_diff_two_keys( { ASSERT(!mask || mask->refc.rc_startblock); - return (int64_t)be32_to_cpu(k1->refc.rc_startblock) - - be32_to_cpu(k2->refc.rc_startblock); + return cmp_int(be32_to_cpu(k1->refc.rc_startblock), + be32_to_cpu(k2->refc.rc_startblock)); } static xfs_failaddr_t @@ -387,9 +387,9 @@ const struct xfs_btree_ops xfs_rtrefcountbt_ops = { .init_high_key_from_rec = xfs_rtrefcountbt_init_high_key_from_rec, .init_rec_from_cur = xfs_rtrefcountbt_init_rec_from_cur, .init_ptr_from_cur = xfs_rtrefcountbt_init_ptr_from_cur, - .key_diff = xfs_rtrefcountbt_key_diff, + .cmp_key_with_cur = xfs_rtrefcountbt_cmp_key_with_cur, .buf_ops = &xfs_rtrefcountbt_buf_ops, - .diff_two_keys = xfs_rtrefcountbt_diff_two_keys, + .cmp_two_keys = xfs_rtrefcountbt_cmp_two_keys, .keys_inorder = xfs_rtrefcountbt_keys_inorder, .recs_inorder = xfs_rtrefcountbt_recs_inorder, .keys_contiguous = xfs_rtrefcountbt_keys_contiguous, diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c index 9bdc2cbfc113..55f903165769 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.c +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c @@ -185,38 +185,22 @@ static inline uint64_t offset_keymask(uint64_t offset) return offset & ~XFS_RMAP_OFF_UNWRITTEN; } -STATIC int64_t -xfs_rtrmapbt_key_diff( +STATIC int +xfs_rtrmapbt_cmp_key_with_cur( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { struct xfs_rmap_irec *rec = &cur->bc_rec.r; const struct xfs_rmap_key *kp = &key->rmap; - __u64 x, y; - int64_t d; - d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; - if (d) - return d; - - x = be64_to_cpu(kp->rm_owner); - y = rec->rm_owner; - if (x > y) - return 1; - else if (y > x) - return -1; - - x = offset_keymask(be64_to_cpu(kp->rm_offset)); - y = offset_keymask(xfs_rmap_irec_offset_pack(rec)); - if (x > y) - return 1; - else if (y > x) - return -1; - return 0; + return cmp_int(be32_to_cpu(kp->rm_startblock), rec->rm_startblock) ?: + cmp_int(be64_to_cpu(kp->rm_owner), rec->rm_owner) ?: + cmp_int(offset_keymask(be64_to_cpu(kp->rm_offset)), + offset_keymask(xfs_rmap_irec_offset_pack(rec))); } -STATIC int64_t -xfs_rtrmapbt_diff_two_keys( +STATIC int +xfs_rtrmapbt_cmp_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2, @@ -224,36 +208,31 @@ xfs_rtrmapbt_diff_two_keys( { const struct xfs_rmap_key *kp1 = &k1->rmap; const struct xfs_rmap_key *kp2 = &k2->rmap; - int64_t d; - __u64 x, y; + int d; /* Doesn't make sense to mask off the physical space part */ ASSERT(!mask || mask->rmap.rm_startblock); - d = (int64_t)be32_to_cpu(kp1->rm_startblock) - - be32_to_cpu(kp2->rm_startblock); + d = cmp_int(be32_to_cpu(kp1->rm_startblock), + be32_to_cpu(kp2->rm_startblock)); if (d) return d; if (!mask || mask->rmap.rm_owner) { - x = be64_to_cpu(kp1->rm_owner); - y = be64_to_cpu(kp2->rm_owner); - if (x > y) - return 1; - else if (y > x) - return -1; + d = cmp_int(be64_to_cpu(kp1->rm_owner), + be64_to_cpu(kp2->rm_owner)); + if (d) + return d; } if (!mask || mask->rmap.rm_offset) { /* Doesn't make sense to allow offset but not owner */ ASSERT(!mask || mask->rmap.rm_owner); - x = offset_keymask(be64_to_cpu(kp1->rm_offset)); - y = offset_keymask(be64_to_cpu(kp2->rm_offset)); - if (x > y) - return 1; - else if (y > x) - return -1; + d = cmp_int(offset_keymask(be64_to_cpu(kp1->rm_offset)), + offset_keymask(be64_to_cpu(kp2->rm_offset))); + if (d) + return d; } return 0; @@ -511,9 +490,9 @@ const struct xfs_btree_ops xfs_rtrmapbt_ops = { .init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec, .init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur, .init_ptr_from_cur = xfs_rtrmapbt_init_ptr_from_cur, - .key_diff = xfs_rtrmapbt_key_diff, + .cmp_key_with_cur = xfs_rtrmapbt_cmp_key_with_cur, .buf_ops = &xfs_rtrmapbt_buf_ops, - .diff_two_keys = xfs_rtrmapbt_diff_two_keys, + .cmp_two_keys = xfs_rtrmapbt_cmp_two_keys, .keys_inorder = xfs_rtrmapbt_keys_inorder, .recs_inorder = xfs_rtrmapbt_recs_inorder, .keys_contiguous = xfs_rtrmapbt_keys_contiguous, @@ -620,9 +599,9 @@ const struct xfs_btree_ops xfs_rtrmapbt_mem_ops = { .init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec, .init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur, .init_ptr_from_cur = xfbtree_init_ptr_from_cur, - .key_diff = xfs_rtrmapbt_key_diff, + .cmp_key_with_cur = xfs_rtrmapbt_cmp_key_with_cur, .buf_ops = &xfs_rtrmapbt_mem_buf_ops, - .diff_two_keys = xfs_rtrmapbt_diff_two_keys, + .cmp_two_keys = xfs_rtrmapbt_cmp_two_keys, .keys_inorder = xfs_rtrmapbt_keys_inorder, .recs_inorder = xfs_rtrmapbt_recs_inorder, .keys_contiguous = xfs_rtrmapbt_keys_contiguous, diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 711e180f9ebb..94c272a2ae26 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -142,8 +142,6 @@ xfs_sb_version_to_features( if (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) { if (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT) features |= XFS_FEAT_LAZYSBCOUNT; - if (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT) - features |= XFS_FEAT_ATTR2; if (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT) features |= XFS_FEAT_PROJID32; if (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE) @@ -155,7 +153,7 @@ xfs_sb_version_to_features( /* Always on V5 features */ features |= XFS_FEAT_ALIGN | XFS_FEAT_LOGV2 | XFS_FEAT_EXTFLG | - XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_ATTR2 | XFS_FEAT_PROJID32 | + XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_PROJID32 | XFS_FEAT_V3INODES | XFS_FEAT_CRC | XFS_FEAT_PQUOTINO; /* Optional V5 features */ @@ -303,6 +301,21 @@ xfs_validate_rt_geometry( sbp->sb_rbmblocks != xfs_expected_rbmblocks(sbp)) return false; + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) { + uint32_t mod; + + /* + * Zoned RT devices must be aligned to the RT group size, + * because garbage collection assumes that all zones have the + * same size to avoid insane complexity if that weren't the + * case. + */ + div_u64_rem(sbp->sb_rextents, sbp->sb_rgextents, &mod); + if (mod) + return false; + } + return true; } @@ -1524,7 +1537,8 @@ xfs_fs_geometry( geo->version = XFS_FSOP_GEOM_VERSION; geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | XFS_FSOP_GEOM_FLAGS_DIRV2 | - XFS_FSOP_GEOM_FLAGS_EXTFLG; + XFS_FSOP_GEOM_FLAGS_EXTFLG | + XFS_FSOP_GEOM_FLAGS_ATTR2; if (xfs_has_attr(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR; if (xfs_has_quota(mp)) @@ -1537,8 +1551,6 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_DIRV2CI; if (xfs_has_lazysbcount(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_LAZYSB; - if (xfs_has_attr2(mp)) - geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR2; if (xfs_has_projid32(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_PROJID32; if (xfs_has_crc(mp)) diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 13d00c7166e1..86a111d0f2fc 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -22,6 +22,12 @@ #include "xfs_rtbitmap.h" #include "xfs_attr_item.h" #include "xfs_log.h" +#include "xfs_defer.h" +#include "xfs_bmap_item.h" +#include "xfs_extfree_item.h" +#include "xfs_rmap_item.h" +#include "xfs_refcount_item.h" +#include "xfs_trace.h" #define _ALLOC true #define _FREE false @@ -264,6 +270,42 @@ xfs_rtalloc_block_count( */ /* + * Finishing a data device refcount updates (t1): + * the agfs of the ags containing the blocks: nr_ops * sector size + * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_cui_reservation( + struct xfs_mount *mp, + unsigned int nr_ops) +{ + if (!xfs_has_reflink(mp)) + return 0; + + return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), + mp->m_sb.sb_blocksize); +} + +/* + * Realtime refcount updates (t2); + * the rt refcount inode + * the rtrefcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_rt_cui_reservation( + struct xfs_mount *mp, + unsigned int nr_ops) +{ + if (!xfs_has_rtreflink(mp)) + return 0; + + return xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops), + mp->m_sb.sb_blocksize); +} + +/* * Compute the log reservation required to handle the refcount update * transaction. Refcount updates are always done via deferred log items. * @@ -280,19 +322,10 @@ xfs_calc_refcountbt_reservation( struct xfs_mount *mp, unsigned int nr_ops) { - unsigned int blksz = XFS_FSB_TO_B(mp, 1); - unsigned int t1, t2 = 0; + unsigned int t1, t2; - if (!xfs_has_reflink(mp)) - return 0; - - t1 = xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz); - - if (xfs_has_realtime(mp)) - t2 = xfs_calc_inode_res(mp, 1) + - xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops), - blksz); + t1 = xfs_calc_finish_cui_reservation(mp, nr_ops); + t2 = xfs_calc_finish_rt_cui_reservation(mp, nr_ops); return max(t1, t2); } @@ -380,6 +413,96 @@ xfs_calc_write_reservation_minlogsize( } /* + * Finishing an EFI can free the blocks and bmap blocks (t2): + * the agf for each of the ags: nr * sector size + * the agfl for each of the ags: nr * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming nr extents: + * nr exts * 2 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_efi_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr), + mp->m_sb.sb_blocksize); +} + +/* + * Or, if it's a realtime file (t3): + * the agf for each of the ags: 2 * sector size + * the agfl for each of the ags: 2 * sector size + * the super block to reflect the freed blocks: sector size + * the realtime bitmap: + * 2 exts * ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes + * the realtime summary: 2 exts * 1 block + * worst case split in allocation btrees per extent assuming 2 extents: + * 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_rt_efi_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + if (!xfs_has_realtime(mp)) + return 0; + + return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_rtalloc_block_count(mp, nr), + mp->m_sb.sb_blocksize) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr), + mp->m_sb.sb_blocksize); +} + +/* + * Finishing an RUI is the same as an EFI. We can split the rmap btree twice + * on each end of the record, and that can cause the AGFL to be refilled or + * emptied out. + */ +inline unsigned int +xfs_calc_finish_rui_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + if (!xfs_has_rmapbt(mp)) + return 0; + return xfs_calc_finish_efi_reservation(mp, nr); +} + +/* + * Finishing an RUI is the same as an EFI. We can split the rmap btree twice + * on each end of the record, and that can cause the AGFL to be refilled or + * emptied out. + */ +inline unsigned int +xfs_calc_finish_rt_rui_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + if (!xfs_has_rtrmapbt(mp)) + return 0; + return xfs_calc_finish_rt_efi_reservation(mp, nr); +} + +/* + * In finishing a BUI, we can modify: + * the inode being truncated: inode size + * dquots + * the inode's bmap btree: (max depth + 1) * block size + */ +inline unsigned int +xfs_calc_finish_bui_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + return xfs_calc_inode_res(mp, 1) + XFS_DQUOT_LOGRES + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, + mp->m_sb.sb_blocksize); +} + +/* * In truncating a file we free up to two extents at once. We can modify (t1): * the inode being truncated: inode size * the inode's bmap btree: (max depth + 1) * block size @@ -411,16 +534,8 @@ xfs_calc_itruncate_reservation( t1 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz); - t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz); - - if (xfs_has_realtime(mp)) { - t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); - } else { - t3 = 0; - } + t2 = xfs_calc_finish_efi_reservation(mp, 4); + t3 = xfs_calc_finish_rt_efi_reservation(mp, 2); /* * In the early days of reflink, we included enough reservation to log @@ -501,9 +616,7 @@ xfs_calc_rename_reservation( xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); - t2 = xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3), - XFS_FSB_TO_B(mp, 1)); + t2 = xfs_calc_finish_efi_reservation(mp, 3); if (xfs_has_parent(mp)) { unsigned int rename_overhead, exchange_overhead; @@ -611,9 +724,7 @@ xfs_calc_link_reservation( overhead += xfs_calc_iunlink_remove_reservation(mp); t1 = xfs_calc_inode_res(mp, 2) + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); - t2 = xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), - XFS_FSB_TO_B(mp, 1)); + t2 = xfs_calc_finish_efi_reservation(mp, 1); if (xfs_has_parent(mp)) { t3 = resp->tr_attrsetm.tr_logres; @@ -676,9 +787,7 @@ xfs_calc_remove_reservation( t1 = xfs_calc_inode_res(mp, 2) + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); - t2 = xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), - XFS_FSB_TO_B(mp, 1)); + t2 = xfs_calc_finish_efi_reservation(mp, 2); if (xfs_has_parent(mp)) { t3 = resp->tr_attrrm.tr_logres; @@ -1181,6 +1290,15 @@ xfs_calc_namespace_reservations( resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES; } +STATIC void +xfs_calc_default_atomic_ioend_reservation( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + /* Pick a default that will scale reasonably for the log size. */ + resp->tr_atomic_ioend = resp->tr_itruncate; +} + void xfs_trans_resv_calc( struct xfs_mount *mp, @@ -1275,4 +1393,167 @@ xfs_trans_resv_calc( resp->tr_itruncate.tr_logcount += logcount_adj; resp->tr_write.tr_logcount += logcount_adj; resp->tr_qm_dqalloc.tr_logcount += logcount_adj; + + /* + * Now that we've finished computing the static reservations, we can + * compute the dynamic reservation for atomic writes. + */ + xfs_calc_default_atomic_ioend_reservation(mp, resp); +} + +/* + * Return the per-extent and fixed transaction reservation sizes needed to + * complete an atomic write. + */ +STATIC unsigned int +xfs_calc_atomic_write_ioend_geometry( + struct xfs_mount *mp, + unsigned int *step_size) +{ + const unsigned int efi = xfs_efi_log_space(1); + const unsigned int efd = xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1); + const unsigned int rud = xfs_rud_log_space(); + const unsigned int cui = xfs_cui_log_space(1); + const unsigned int cud = xfs_cud_log_space(); + const unsigned int bui = xfs_bui_log_space(1); + const unsigned int bud = xfs_bud_log_space(); + + /* + * Maximum overhead to complete an atomic write ioend in software: + * remove data fork extent + remove cow fork extent + map extent into + * data fork. + * + * tx0: Creates a BUI and a CUI and that's all it needs. + * + * tx1: Roll to finish the BUI. Need space for the BUD, an RUI, and + * enough space to relog the CUI (== CUI + CUD). + * + * tx2: Roll again to finish the RUI. Need space for the RUD and space + * to relog the CUI. + * + * tx3: Roll again, need space for the CUD and possibly a new EFI. + * + * tx4: Roll again, need space for an EFD. + * + * If the extent referenced by the pair of BUI/CUI items is not the one + * being currently processed, then we need to reserve space to relog + * both items. + */ + const unsigned int tx0 = bui + cui; + const unsigned int tx1 = bud + rui + cui + cud; + const unsigned int tx2 = rud + cui + cud; + const unsigned int tx3 = cud + efi; + const unsigned int tx4 = efd; + const unsigned int relog = bui + bud + cui + cud; + + const unsigned int per_intent = max(max3(tx0, tx1, tx2), + max3(tx3, tx4, relog)); + + /* Overhead to finish one step of each intent item type */ + const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1); + const unsigned int f3 = xfs_calc_finish_cui_reservation(mp, 1); + const unsigned int f4 = xfs_calc_finish_bui_reservation(mp, 1); + + /* We only finish one item per transaction in a chain */ + *step_size = max(f4, max3(f1, f2, f3)); + + return per_intent; +} + +/* + * Compute the maximum size (in fsblocks) of atomic writes that we can complete + * given the existing log reservations. + */ +xfs_extlen_t +xfs_calc_max_atomic_write_fsblocks( + struct xfs_mount *mp) +{ + const struct xfs_trans_res *resv = &M_RES(mp)->tr_atomic_ioend; + unsigned int per_intent = 0; + unsigned int step_size = 0; + unsigned int ret = 0; + + if (resv->tr_logres > 0) { + per_intent = xfs_calc_atomic_write_ioend_geometry(mp, + &step_size); + + if (resv->tr_logres >= step_size) + ret = (resv->tr_logres - step_size) / per_intent; + } + + trace_xfs_calc_max_atomic_write_fsblocks(mp, per_intent, step_size, + resv->tr_logres, ret); + + return ret; +} + +/* + * Compute the log blocks and transaction reservation needed to complete an + * atomic write of a given number of blocks. Worst case, each block requires + * separate handling. A return value of 0 means something went wrong. + */ +xfs_extlen_t +xfs_calc_atomic_write_log_geometry( + struct xfs_mount *mp, + xfs_extlen_t blockcount, + unsigned int *new_logres) +{ + struct xfs_trans_res *curr_res = &M_RES(mp)->tr_atomic_ioend; + uint old_logres = curr_res->tr_logres; + unsigned int per_intent, step_size; + unsigned int logres; + xfs_extlen_t min_logblocks; + + ASSERT(blockcount > 0); + + xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp)); + + per_intent = xfs_calc_atomic_write_ioend_geometry(mp, &step_size); + + /* Check for overflows */ + if (check_mul_overflow(blockcount, per_intent, &logres) || + check_add_overflow(logres, step_size, &logres)) + return 0; + + curr_res->tr_logres = logres; + min_logblocks = xfs_log_calc_minimum_size(mp); + curr_res->tr_logres = old_logres; + + trace_xfs_calc_max_atomic_write_log_geometry(mp, per_intent, step_size, + blockcount, min_logblocks, logres); + + *new_logres = logres; + return min_logblocks; +} + +/* + * Compute the transaction reservation needed to complete an out of place + * atomic write of a given number of blocks. + */ +int +xfs_calc_atomic_write_reservation( + struct xfs_mount *mp, + xfs_extlen_t blockcount) +{ + unsigned int new_logres; + xfs_extlen_t min_logblocks; + + /* + * If the caller doesn't ask for a specific atomic write size, then + * use the defaults. + */ + if (blockcount == 0) { + xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp)); + return 0; + } + + min_logblocks = xfs_calc_atomic_write_log_geometry(mp, blockcount, + &new_logres); + if (!min_logblocks || min_logblocks > mp->m_sb.sb_logblocks) + return -EINVAL; + + M_RES(mp)->tr_atomic_ioend.tr_logres = new_logres; + return 0; } diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 0554b9d775d2..336279e0fc61 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -48,6 +48,7 @@ struct xfs_trans_resv { struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */ struct xfs_trans_res tr_sb; /* modify superblock */ struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */ + struct xfs_trans_res tr_atomic_ioend; /* untorn write completion */ }; /* shorthand way of accessing reservation structure */ @@ -98,8 +99,32 @@ struct xfs_trans_resv { void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops); +unsigned int xfs_calc_finish_bui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + +unsigned int xfs_calc_finish_efi_reservation(struct xfs_mount *mp, + unsigned int nr_ops); +unsigned int xfs_calc_finish_rt_efi_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + +unsigned int xfs_calc_finish_rui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); +unsigned int xfs_calc_finish_rt_rui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + +unsigned int xfs_calc_finish_cui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); +unsigned int xfs_calc_finish_rt_cui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp); unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp); unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp); +xfs_extlen_t xfs_calc_max_atomic_write_fsblocks(struct xfs_mount *mp); +xfs_extlen_t xfs_calc_atomic_write_log_geometry(struct xfs_mount *mp, + xfs_extlen_t blockcount, unsigned int *new_logres); +int xfs_calc_atomic_write_reservation(struct xfs_mount *mp, + xfs_extlen_t blockcount); + #endif /* __XFS_TRANS_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_zones.c b/fs/xfs/libxfs/xfs_zones.c index b0791a71931c..b40f71f878b5 100644 --- a/fs/xfs/libxfs/xfs_zones.c +++ b/fs/xfs/libxfs/xfs_zones.c @@ -95,6 +95,7 @@ xfs_zone_validate_seq( case BLK_ZONE_COND_IMP_OPEN: case BLK_ZONE_COND_EXP_OPEN: case BLK_ZONE_COND_CLOSED: + case BLK_ZONE_COND_ACTIVE: return xfs_zone_validate_wp(zone, rtg, write_pointer); case BLK_ZONE_COND_FULL: return xfs_zone_validate_full(zone, rtg, write_pointer); diff --git a/fs/xfs/libxfs/xfs_zones.h b/fs/xfs/libxfs/xfs_zones.h index c4f1367b2cca..5fefd132e002 100644 --- a/fs/xfs/libxfs/xfs_zones.h +++ b/fs/xfs/libxfs/xfs_zones.h @@ -29,6 +29,13 @@ struct xfs_rtgroup; #define XFS_OPEN_GC_ZONES 1U #define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U) +/* + * For zoned devices that do not have a limit on the number of open zones, and + * for regular devices using the zoned allocator, use the most common SMR disks + * limit (128) as the default limit on the number of open zones. + */ +#define XFS_DEFAULT_MAX_OPEN_ZONES 128 + bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg, xfs_rgblock_t *write_pointer); diff --git a/fs/xfs/scrub/attr_repair.c b/fs/xfs/scrub/attr_repair.c index c7eb94069caf..09d63aa10314 100644 --- a/fs/xfs/scrub/attr_repair.c +++ b/fs/xfs/scrub/attr_repair.c @@ -333,7 +333,6 @@ xrep_xattr_salvage_remote_attr( .attr_filter = ent->flags & XFS_ATTR_NSP_ONDISK_MASK, .namelen = rentry->namelen, .name = rentry->name, - .value = ab->value, .valuelen = be32_to_cpu(rentry->valuelen), }; unsigned int namesize; @@ -363,6 +362,7 @@ xrep_xattr_salvage_remote_attr( error = -EDEADLOCK; if (error) return error; + args.value = ab->value; /* Look up the remote value and stash it for reconstruction. */ error = xfs_attr3_leaf_getvalue(leaf_bp, &args); diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index fe678a0438bc..cd6f0ff382a7 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -306,7 +306,7 @@ xchk_btree_block_check_sibling( if (pbp) xchk_buffer_recheck(bs->sc, pbp); - if (xfs_btree_diff_two_ptrs(cur, pp, sibling)) + if (xfs_btree_cmp_two_ptrs(cur, pp, sibling)) xchk_btree_set_corrupt(bs->sc, cur, level); out: xfs_btree_del_cursor(ncur, XFS_BTREE_ERROR); diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 28ad341df8ee..7bfa37c99480 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -866,11 +866,11 @@ xchk_trans_cancel( sc->tp = NULL; } -int +void xchk_trans_alloc_empty( struct xfs_scrub *sc) { - return xfs_trans_alloc_empty(sc->mp, &sc->tp); + sc->tp = xfs_trans_alloc_empty(sc->mp); } /* @@ -892,7 +892,8 @@ xchk_trans_alloc( return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate, resblks, 0, 0, &sc->tp); - return xchk_trans_alloc_empty(sc); + xchk_trans_alloc_empty(sc); + return 0; } /* Set us up with a transaction and an empty context. */ @@ -1248,7 +1249,7 @@ xchk_irele( * hits do not clear DONTCACHE, so we must do it here. */ spin_lock(&VFS_I(ip)->i_lock); - VFS_I(ip)->i_state &= ~I_DONTCACHE; + inode_state_clear(VFS_I(ip), I_DONTCACHE); spin_unlock(&VFS_I(ip)->i_lock); } diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 19877d99f255..ddbc065c798c 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -7,7 +7,7 @@ #define __XFS_SCRUB_COMMON_H__ int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks); -int xchk_trans_alloc_empty(struct xfs_scrub *sc); +void xchk_trans_alloc_empty(struct xfs_scrub *sc); void xchk_trans_cancel(struct xfs_scrub *sc); bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno, diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c index 38a246b8bf11..b2a83801412e 100644 --- a/fs/xfs/scrub/cow_repair.c +++ b/fs/xfs/scrub/cow_repair.c @@ -300,7 +300,7 @@ xrep_cow_find_bad( * on the debugging knob, replace everything in the CoW fork. */ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || - XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { + XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock, xc->irec.br_blockcount); if (error) @@ -385,7 +385,7 @@ xrep_cow_find_bad_rt( * CoW fork and then scan for staging extents in the refcountbt. */ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) || - XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { + XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) { error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock, xc->irec.br_blockcount); if (error) diff --git a/fs/xfs/scrub/dir_repair.c b/fs/xfs/scrub/dir_repair.c index 249313882108..8d3b550990b5 100644 --- a/fs/xfs/scrub/dir_repair.c +++ b/fs/xfs/scrub/dir_repair.c @@ -1289,9 +1289,7 @@ xrep_dir_scan_dirtree( if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - error = xchk_trans_alloc_empty(sc); - if (error) - return error; + xchk_trans_alloc_empty(sc); while ((error = xchk_iscan_iter(&rd->pscan.iscan, &ip)) == 1) { bool flush; @@ -1317,9 +1315,7 @@ xrep_dir_scan_dirtree( if (error) break; - error = xchk_trans_alloc_empty(sc); - if (error) - break; + xchk_trans_alloc_empty(sc); } if (xchk_should_terminate(sc, &error)) diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index e629663e460a..cebd0d526926 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -123,7 +123,7 @@ xchk_fsfreeze( { int error; - error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); + error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL, NULL); trace_xchk_fsfreeze(sc, error); return error; } @@ -135,7 +135,7 @@ xchk_fsthaw( int error; /* This should always succeed, we have a kernel freeze */ - error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); + error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL, NULL); trace_xchk_fsthaw(sc, error); return error; } @@ -237,7 +237,8 @@ xchk_setup_fscounters( return error; } - return xchk_trans_alloc_empty(sc); + xchk_trans_alloc_empty(sc); + return 0; } /* diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index a90a011c7e5f..4f7040c9ddf0 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -1933,7 +1933,7 @@ xrep_inode_pptr( * Unlinked inodes that cannot be added to the directory tree will not * have a parent pointer. */ - if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) + if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE)) return 0; /* Children of the superblock do not have parent pointers. */ diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c index e21c16fbd15d..378ec7c8d38e 100644 --- a/fs/xfs/scrub/metapath.c +++ b/fs/xfs/scrub/metapath.c @@ -79,7 +79,7 @@ xchk_metapath_cleanup( if (mpath->dp_ilock_flags) xfs_iunlock(mpath->dp, mpath->dp_ilock_flags); - kfree(mpath->path); + kfree_const(mpath->path); } /* Set up a metadir path scan. @path must be dynamically allocated. */ @@ -98,13 +98,13 @@ xchk_setup_metapath_scan( error = xchk_install_live_inode(sc, ip); if (error) { - kfree(path); + kfree_const(path); return error; } mpath = kzalloc(sizeof(struct xchk_metapath), XCHK_GFP_FLAGS); if (!mpath) { - kfree(path); + kfree_const(path); return -ENOMEM; } @@ -132,7 +132,7 @@ xchk_setup_metapath_rtdir( return -ENOENT; return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip, - kasprintf(GFP_KERNEL, "rtgroups"), sc->mp->m_rtdirip); + kstrdup_const("rtgroups", GFP_KERNEL), sc->mp->m_rtdirip); } /* Scan a rtgroup inode under the /rtgroups directory. */ @@ -179,7 +179,7 @@ xchk_setup_metapath_quotadir( return -ENOENT; return xchk_setup_metapath_scan(sc, sc->mp->m_metadirip, - kstrdup("quota", GFP_KERNEL), qi->qi_dirip); + kstrdup_const("quota", GFP_KERNEL), qi->qi_dirip); } /* Scan a quota inode under the /quota directory. */ @@ -212,7 +212,7 @@ xchk_setup_metapath_dqinode( return -ENOENT; return xchk_setup_metapath_scan(sc, qi->qi_dirip, - kstrdup(xfs_dqinode_path(type), GFP_KERNEL), ip); + kstrdup_const(xfs_dqinode_path(type), GFP_KERNEL), ip); } #else # define xchk_setup_metapath_quotadir(...) (-ENOENT) @@ -318,9 +318,7 @@ xchk_metapath( return 0; } - error = xchk_trans_alloc_empty(sc); - if (error) - return error; + xchk_trans_alloc_empty(sc); error = xchk_metapath_ilock_both(mpath); if (error) diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index 1588ce971cb8..951ae8b71566 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -28,6 +28,15 @@ #include "scrub/newbt.h" /* + * This is the maximum number of deferred extent freeing item extents (EFIs) + * that we'll attach to a transaction without rolling the transaction to avoid + * overrunning a tr_itruncate reservation. The newbt code should reserve + * exactly the correct number of blocks to rebuild the btree, so there should + * not be any excess blocks to free when committing a new btree. + */ +#define XREP_MAX_ITRUNCATE_EFIS (128) + +/* * Estimate proper slack values for a btree that's being reloaded. * * Under most circumstances, we'll take whatever default loading value the diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c index 4a47d0aabf73..091c79e432e5 100644 --- a/fs/xfs/scrub/nlinks.c +++ b/fs/xfs/scrub/nlinks.c @@ -376,6 +376,36 @@ out_incomplete: return error; } +static uint +xchk_nlinks_ilock_dir( + struct xfs_inode *ip) +{ + uint lock_mode = XFS_ILOCK_SHARED; + + /* + * We're going to scan the directory entries, so we must be ready to + * pull the data fork mappings into memory if they aren't already. + */ + if (xfs_need_iread_extents(&ip->i_df)) + lock_mode = XFS_ILOCK_EXCL; + + /* + * We're going to scan the parent pointers, so we must be ready to + * pull the attr fork mappings into memory if they aren't already. + */ + if (xfs_has_parent(ip->i_mount) && xfs_inode_has_attr_fork(ip) && + xfs_need_iread_extents(&ip->i_af)) + lock_mode = XFS_ILOCK_EXCL; + + /* + * Take the IOLOCK so that other threads cannot start a directory + * update while we're scanning. + */ + lock_mode |= XFS_IOLOCK_SHARED; + xfs_ilock(ip, lock_mode); + return lock_mode; +} + /* Walk a directory to bump the observed link counts of the children. */ STATIC int xchk_nlinks_collect_dir( @@ -394,8 +424,7 @@ xchk_nlinks_collect_dir( return 0; /* Prevent anyone from changing this directory while we walk it. */ - xfs_ilock(dp, XFS_IOLOCK_SHARED); - lock_mode = xfs_ilock_data_map_shared(dp); + lock_mode = xchk_nlinks_ilock_dir(dp); /* * The dotdot entry of an unlinked directory still points to the last @@ -452,7 +481,6 @@ out_abort: xchk_iscan_abort(&xnc->collect_iscan); out_unlock: xfs_iunlock(dp, lock_mode); - xfs_iunlock(dp, XFS_IOLOCK_SHARED); return error; } @@ -555,9 +583,7 @@ xchk_nlinks_collect( * do not take sb_internal. */ xchk_trans_cancel(sc); - error = xchk_trans_alloc_empty(sc); - if (error) - return error; + xchk_trans_alloc_empty(sc); while ((error = xchk_iscan_iter(&xnc->collect_iscan, &ip)) == 1) { if (S_ISDIR(VFS_I(ip)->i_mode)) @@ -880,9 +906,7 @@ xchk_nlinks_compare( * inactivation workqueue. */ xchk_trans_cancel(sc); - error = xchk_trans_alloc_empty(sc); - if (error) - return error; + xchk_trans_alloc_empty(sc); /* * Use the inobt to walk all allocated inodes to compare the link diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c index 4ebdee095428..6ef2ee9c3814 100644 --- a/fs/xfs/scrub/nlinks_repair.c +++ b/fs/xfs/scrub/nlinks_repair.c @@ -340,9 +340,7 @@ xrep_nlinks( * We can only push the inactivation workqueues with an empty * transaction. */ - error = xchk_trans_alloc_empty(sc); - if (error) - break; + xchk_trans_alloc_empty(sc); } xchk_iscan_iter_finish(&xnc->compare_iscan); xchk_iscan_teardown(&xnc->compare_iscan); diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c index 3537f3cca6d5..4e550a1d5353 100644 --- a/fs/xfs/scrub/orphanage.c +++ b/fs/xfs/scrub/orphanage.c @@ -152,12 +152,10 @@ xrep_orphanage_create( } /* Try to find the orphanage directory. */ - inode_lock_nested(root_inode, I_MUTEX_PARENT); - orphanage_dentry = lookup_one_len(ORPHANAGE, root_dentry, - strlen(ORPHANAGE)); + orphanage_dentry = start_creating_noperm(root_dentry, &QSTR(ORPHANAGE)); if (IS_ERR(orphanage_dentry)) { error = PTR_ERR(orphanage_dentry); - goto out_unlock_root; + goto out_dput_root; } /* @@ -168,10 +166,10 @@ xrep_orphanage_create( */ if (d_really_is_negative(orphanage_dentry)) { orphanage_dentry = vfs_mkdir(&nop_mnt_idmap, root_inode, - orphanage_dentry, 0750); + orphanage_dentry, 0750, NULL); error = PTR_ERR(orphanage_dentry); if (IS_ERR(orphanage_dentry)) - goto out_unlock_root; + goto out_dput_orphanage; } /* Not a directory? Bail out. */ @@ -201,9 +199,7 @@ xrep_orphanage_create( sc->orphanage_ilock_flags = 0; out_dput_orphanage: - dput(orphanage_dentry); -out_unlock_root: - inode_unlock(VFS_I(sc->mp->m_rootip)); + end_creating(orphanage_dentry); out_dput_root: dput(root_dentry); out: @@ -445,7 +441,7 @@ xrep_adoption_check_dcache( if (!d_orphanage) return 0; - d_child = d_hash_and_lookup(d_orphanage, &qname); + d_child = try_lookup_noperm(&qname, d_orphanage); if (d_child) { trace_xrep_adoption_check_child(sc->mp, d_child); @@ -482,7 +478,7 @@ xrep_adoption_zap_dcache( if (!d_orphanage) return; - d_child = d_hash_and_lookup(d_orphanage, &qname); + d_child = try_lookup_noperm(&qname, d_orphanage); while (d_child != NULL) { trace_xrep_adoption_invalidate_child(sc->mp, d_child); diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 3b692c4acc1e..11d5de10fd56 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -915,7 +915,7 @@ xchk_pptr_looks_zapped( * Temporary files that cannot be linked into the directory tree do not * have attr forks because they cannot ever have parents. */ - if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) + if (inode->i_nlink == 0 && !(inode_state_read_once(inode) & I_LINKABLE)) return false; /* diff --git a/fs/xfs/scrub/parent_repair.c b/fs/xfs/scrub/parent_repair.c index 31bfe10be22a..2949feda6271 100644 --- a/fs/xfs/scrub/parent_repair.c +++ b/fs/xfs/scrub/parent_repair.c @@ -569,9 +569,7 @@ xrep_parent_scan_dirtree( if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - error = xchk_trans_alloc_empty(sc); - if (error) - return error; + xchk_trans_alloc_empty(sc); while ((error = xchk_iscan_iter(&rp->pscan.iscan, &ip)) == 1) { bool flush; @@ -597,9 +595,7 @@ xrep_parent_scan_dirtree( if (error) break; - error = xchk_trans_alloc_empty(sc); - if (error) - break; + xchk_trans_alloc_empty(sc); } if (xchk_should_terminate(sc, &error)) @@ -1099,9 +1095,7 @@ xrep_parent_flush_xattrs( xrep_tempfile_iounlock(rp->sc); /* Recreate the empty transaction and relock the inode. */ - error = xchk_trans_alloc_empty(rp->sc); - if (error) - return error; + xchk_trans_alloc_empty(rp->sc); xchk_ilock(rp->sc, XFS_ILOCK_EXCL); return 0; } diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 58d6d4ed2853..5c5374c44c5a 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -155,12 +155,9 @@ xchk_quota_item( * We want to validate the bmap record for the storage backing this * dquot, so we need to lock the dquot and the quota file. For quota * operations, the locking order is first the ILOCK and then the dquot. - * However, dqiterate gave us a locked dquot, so drop the dquot lock to - * get the ILOCK. */ - xfs_dqunlock(dq); xchk_ilock(sc, XFS_ILOCK_SHARED); - xfs_dqlock(dq); + mutex_lock(&dq->q_qlock); /* * Except for the root dquot, the actual dquot we got must either have @@ -251,6 +248,7 @@ xchk_quota_item( xchk_quota_item_timer(sc, offset, &dq->q_rtb); out: + mutex_unlock(&dq->q_qlock); if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return -ECANCELED; @@ -330,7 +328,7 @@ xchk_quota( xchk_dqiter_init(&cursor, sc, dqtype); while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { error = xchk_quota_item(&sqi, dq); - xfs_qm_dqput(dq); + xfs_qm_dqrele(dq); if (error) break; } diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c index 8f4c8d41f308..b1d661aa5f06 100644 --- a/fs/xfs/scrub/quota_repair.c +++ b/fs/xfs/scrub/quota_repair.c @@ -184,17 +184,13 @@ xrep_quota_item( /* * We might need to fix holes in the bmap record for the storage * backing this dquot, so we need to lock the dquot and the quota file. - * dqiterate gave us a locked dquot, so drop the dquot lock to get the - * ILOCK_EXCL. */ - xfs_dqunlock(dq); xchk_ilock(sc, XFS_ILOCK_EXCL); - xfs_dqlock(dq); - + mutex_lock(&dq->q_qlock); error = xrep_quota_item_bmap(sc, dq, &dirty); xchk_iunlock(sc, XFS_ILOCK_EXCL); if (error) - return error; + goto out_unlock_dquot; /* Check the limits. */ if (dq->q_blk.softlimit > dq->q_blk.hardlimit) { @@ -246,7 +242,7 @@ xrep_quota_item( xrep_quota_item_timer(sc, &dq->q_rtb, &dirty); if (!dirty) - return 0; + goto out_unlock_dquot; trace_xrep_dquot_item(sc->mp, dq->q_type, dq->q_id); @@ -257,8 +253,10 @@ xrep_quota_item( xfs_qm_adjust_dqtimers(dq); } xfs_trans_log_dquot(sc->tp, dq); - error = xfs_trans_roll(&sc->tp); - xfs_dqlock(dq); + return xfs_trans_roll(&sc->tp); + +out_unlock_dquot: + mutex_unlock(&dq->q_qlock); return error; } @@ -513,7 +511,7 @@ xrep_quota_problems( xchk_dqiter_init(&cursor, sc, dqtype); while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { error = xrep_quota_item(&rqi, dq); - xfs_qm_dqput(dq); + xfs_qm_dqrele(dq); if (error) break; } diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c index dc4033b91e44..d412a8359784 100644 --- a/fs/xfs/scrub/quotacheck.c +++ b/fs/xfs/scrub/quotacheck.c @@ -505,9 +505,7 @@ xqcheck_collect_counts( * transactions do not take sb_internal. */ xchk_trans_cancel(sc); - error = xchk_trans_alloc_empty(sc); - if (error) - return error; + xchk_trans_alloc_empty(sc); while ((error = xchk_iscan_iter(&xqc->iscan, &ip)) == 1) { error = xqcheck_collect_inode(xqc, ip); @@ -565,6 +563,7 @@ xqcheck_compare_dquot( return -ECANCELED; } + mutex_lock(&dq->q_qlock); mutex_lock(&xqc->lock); error = xfarray_load_sparse(counts, dq->q_id, &xcdq); if (error) @@ -591,7 +590,9 @@ xqcheck_compare_dquot( xchk_set_incomplete(xqc->sc); error = -ECANCELED; } +out_unlock: mutex_unlock(&xqc->lock); + mutex_unlock(&dq->q_qlock); if (error) return error; @@ -599,10 +600,6 @@ xqcheck_compare_dquot( return -ECANCELED; return 0; - -out_unlock: - mutex_unlock(&xqc->lock); - return error; } /* @@ -638,7 +635,7 @@ xqcheck_walk_observations( return error; error = xqcheck_compare_dquot(xqc, dqtype, dq); - xfs_qm_dqput(dq); + xfs_qm_dqrele(dq); if (error) return error; @@ -676,7 +673,7 @@ xqcheck_compare_dqtype( xchk_dqiter_init(&cursor, sc, dqtype); while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { error = xqcheck_compare_dquot(xqc, dqtype, dq); - xfs_qm_dqput(dq); + xfs_qm_dqrele(dq); if (error) break; } diff --git a/fs/xfs/scrub/quotacheck_repair.c b/fs/xfs/scrub/quotacheck_repair.c index dd8554c755b5..51be8d8d261b 100644 --- a/fs/xfs/scrub/quotacheck_repair.c +++ b/fs/xfs/scrub/quotacheck_repair.c @@ -52,13 +52,11 @@ xqcheck_commit_dquot( bool dirty = false; int error = 0; - /* Unlock the dquot just long enough to allocate a transaction. */ - xfs_dqunlock(dq); error = xchk_trans_alloc(xqc->sc, 0); - xfs_dqlock(dq); if (error) return error; + mutex_lock(&dq->q_qlock); xfs_trans_dqjoin(xqc->sc->tp, dq); if (xchk_iscan_aborted(&xqc->iscan)) { @@ -115,23 +113,12 @@ xqcheck_commit_dquot( if (dq->q_id) xfs_qm_adjust_dqtimers(dq); xfs_trans_log_dquot(xqc->sc->tp, dq); - - /* - * Transaction commit unlocks the dquot, so we must re-lock it so that - * the caller can put the reference (which apparently requires a locked - * dquot). - */ - error = xrep_trans_commit(xqc->sc); - xfs_dqlock(dq); - return error; + return xrep_trans_commit(xqc->sc); out_unlock: mutex_unlock(&xqc->lock); out_cancel: xchk_trans_cancel(xqc->sc); - - /* Re-lock the dquot so the caller can put the reference. */ - xfs_dqlock(dq); return error; } @@ -156,7 +143,7 @@ xqcheck_commit_dqtype( xchk_dqiter_init(&cursor, sc, dqtype); while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { error = xqcheck_commit_dquot(xqc, dqtype, dq); - xfs_qm_dqput(dq); + xfs_qm_dqrele(dq); if (error) break; } @@ -187,7 +174,7 @@ xqcheck_commit_dqtype( return error; error = xqcheck_commit_dquot(xqc, dqtype, dq); - xfs_qm_dqput(dq); + xfs_qm_dqrele(dq); if (error) return error; diff --git a/fs/xfs/scrub/rcbag_btree.c b/fs/xfs/scrub/rcbag_btree.c index 709356dc6256..9a4ef823c5a7 100644 --- a/fs/xfs/scrub/rcbag_btree.c +++ b/fs/xfs/scrub/rcbag_btree.c @@ -47,29 +47,20 @@ rcbagbt_init_rec_from_cur( bag_rec->rbg_refcount = bag_irec->rbg_refcount; } -STATIC int64_t -rcbagbt_key_diff( +STATIC int +rcbagbt_cmp_key_with_cur( struct xfs_btree_cur *cur, const union xfs_btree_key *key) { struct rcbag_rec *rec = (struct rcbag_rec *)&cur->bc_rec; const struct rcbag_key *kp = (const struct rcbag_key *)key; - if (kp->rbg_startblock > rec->rbg_startblock) - return 1; - if (kp->rbg_startblock < rec->rbg_startblock) - return -1; - - if (kp->rbg_blockcount > rec->rbg_blockcount) - return 1; - if (kp->rbg_blockcount < rec->rbg_blockcount) - return -1; - - return 0; + return cmp_int(kp->rbg_startblock, rec->rbg_startblock) ?: + cmp_int(kp->rbg_blockcount, rec->rbg_blockcount); } -STATIC int64_t -rcbagbt_diff_two_keys( +STATIC int +rcbagbt_cmp_two_keys( struct xfs_btree_cur *cur, const union xfs_btree_key *k1, const union xfs_btree_key *k2, @@ -80,17 +71,8 @@ rcbagbt_diff_two_keys( ASSERT(mask == NULL); - if (kp1->rbg_startblock > kp2->rbg_startblock) - return 1; - if (kp1->rbg_startblock < kp2->rbg_startblock) - return -1; - - if (kp1->rbg_blockcount > kp2->rbg_blockcount) - return 1; - if (kp1->rbg_blockcount < kp2->rbg_blockcount) - return -1; - - return 0; + return cmp_int(kp1->rbg_startblock, kp2->rbg_startblock) ?: + cmp_int(kp1->rbg_blockcount, kp2->rbg_blockcount); } STATIC int @@ -201,9 +183,9 @@ static const struct xfs_btree_ops rcbagbt_mem_ops = { .init_key_from_rec = rcbagbt_init_key_from_rec, .init_rec_from_cur = rcbagbt_init_rec_from_cur, .init_ptr_from_cur = xfbtree_init_ptr_from_cur, - .key_diff = rcbagbt_key_diff, + .cmp_key_with_cur = rcbagbt_cmp_key_with_cur, .buf_ops = &rcbagbt_mem_buf_ops, - .diff_two_keys = rcbagbt_diff_two_keys, + .cmp_two_keys = rcbagbt_cmp_two_keys, .keys_inorder = rcbagbt_keys_inorder, .recs_inorder = rcbagbt_recs_inorder, }; diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 8703897c0a9c..07f5bb8a6421 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -36,6 +36,12 @@ #include "xfs_metafile.h" #include "xfs_rtgroup.h" #include "xfs_rtrmap_btree.h" +#include "xfs_extfree_item.h" +#include "xfs_rmap_item.h" +#include "xfs_refcount_item.h" +#include "xfs_buf_item.h" +#include "xfs_bmap_item.h" +#include "xfs_bmap_btree.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -91,21 +97,33 @@ struct xreap_state { struct xfs_scrub *sc; - /* Reverse mapping owner and metadata reservation type. */ - const struct xfs_owner_info *oinfo; - enum xfs_ag_resv_type resv; + union { + struct { + /* + * For AG blocks, this is reverse mapping owner and + * metadata reservation type. + */ + const struct xfs_owner_info *oinfo; + enum xfs_ag_resv_type resv; + }; + struct { + /* For file blocks, this is the inode and fork. */ + struct xfs_inode *ip; + int whichfork; + }; + }; - /* If true, roll the transaction before reaping the next extent. */ - bool force_roll; + /* Number of invalidated buffers logged to the current transaction. */ + unsigned int nr_binval; - /* Number of deferred reaps attached to the current transaction. */ - unsigned int deferred; + /* Maximum number of buffers we can invalidate in a single tx. */ + unsigned int max_binval; - /* Number of invalidated buffers logged to the current transaction. */ - unsigned int invalidated; + /* Number of deferred reaps attached to the current transaction. */ + unsigned int nr_deferred; - /* Number of deferred reaps queued during the whole reap sequence. */ - unsigned long long total_deferred; + /* Maximum number of intents we can reap in a single transaction. */ + unsigned int max_deferred; }; /* Put a block back on the AGFL. */ @@ -148,71 +166,79 @@ xreap_put_freelist( } /* Are there any uncommitted reap operations? */ -static inline bool xreap_dirty(const struct xreap_state *rs) +static inline bool xreap_is_dirty(const struct xreap_state *rs) { - if (rs->force_roll) - return true; - if (rs->deferred) - return true; - if (rs->invalidated) - return true; - if (rs->total_deferred) - return true; - return false; + return rs->nr_binval > 0 || rs->nr_deferred > 0; } -#define XREAP_MAX_BINVAL (2048) - /* - * Decide if we want to roll the transaction after reaping an extent. We don't - * want to overrun the transaction reservation, so we prohibit more than - * 128 EFIs per transaction. For the same reason, we limit the number - * of buffer invalidations to 2048. + * Decide if we need to roll the transaction to clear out the the log + * reservation that we allocated to buffer invalidations. */ -static inline bool xreap_want_roll(const struct xreap_state *rs) +static inline bool xreap_want_binval_roll(const struct xreap_state *rs) { - if (rs->force_roll) - return true; - if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS) - return true; - if (rs->invalidated > XREAP_MAX_BINVAL) - return true; - return false; + return rs->nr_binval >= rs->max_binval; } -static inline void xreap_reset(struct xreap_state *rs) +/* Reset the buffer invalidation count after rolling. */ +static inline void xreap_binval_reset(struct xreap_state *rs) { - rs->total_deferred += rs->deferred; - rs->deferred = 0; - rs->invalidated = 0; - rs->force_roll = false; + rs->nr_binval = 0; } -#define XREAP_MAX_DEFER_CHAIN (2048) +/* + * Bump the number of invalidated buffers, and return true if we can continue, + * or false if we need to roll the transaction. + */ +static inline bool xreap_inc_binval(struct xreap_state *rs) +{ + rs->nr_binval++; + return rs->nr_binval < rs->max_binval; +} /* * Decide if we want to finish the deferred ops that are attached to the scrub * transaction. We don't want to queue huge chains of deferred ops because * that can consume a lot of log space and kernel memory. Hence we trigger a - * xfs_defer_finish if there are more than 2048 deferred reap operations or the - * caller did some real work. + * xfs_defer_finish if there are too many deferred reap operations or we've run + * out of space for invalidations. */ -static inline bool -xreap_want_defer_finish(const struct xreap_state *rs) +static inline bool xreap_want_defer_finish(const struct xreap_state *rs) { - if (rs->force_roll) - return true; - if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN) - return true; - return false; + return rs->nr_deferred >= rs->max_deferred; } +/* + * Reset the defer chain length and buffer invalidation count after finishing + * items. + */ static inline void xreap_defer_finish_reset(struct xreap_state *rs) { - rs->total_deferred = 0; - rs->deferred = 0; - rs->invalidated = 0; - rs->force_roll = false; + rs->nr_deferred = 0; + rs->nr_binval = 0; +} + +/* + * Bump the number of deferred extent reaps. + */ +static inline void xreap_inc_defer(struct xreap_state *rs) +{ + rs->nr_deferred++; +} + +/* Force the caller to finish a deferred item chain. */ +static inline void xreap_force_defer_finish(struct xreap_state *rs) +{ + rs->nr_deferred = rs->max_deferred; +} + +/* Maximum number of fsblocks that we might find in a buffer to invalidate. */ +static inline unsigned int +xrep_binval_max_fsblocks( + struct xfs_mount *mp) +{ + /* Remote xattr values are the largest buffers that we support. */ + return xfs_attr3_max_rmt_blocks(mp); } /* @@ -224,12 +250,8 @@ xrep_bufscan_max_sectors( struct xfs_mount *mp, xfs_extlen_t fsblocks) { - int max_fsbs; - - /* Remote xattr values are the largest buffers that we support. */ - max_fsbs = xfs_attr3_max_rmt_blocks(mp); - - return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, max_fsbs)); + return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, + xrep_binval_max_fsblocks(mp))); } /* @@ -297,14 +319,13 @@ xreap_agextent_binval( while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) { xfs_trans_bjoin(sc->tp, bp); xfs_trans_binval(sc->tp, bp); - rs->invalidated++; /* * Stop invalidating if we've hit the limit; we should * still have enough reservation left to free however * far we've gotten. */ - if (rs->invalidated > XREAP_MAX_BINVAL) { + if (!xreap_inc_binval(rs)) { *aglenp -= agbno_next - bno; goto out; } @@ -416,21 +437,23 @@ xreap_agextent_iter( trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), agbno, *aglenp); - rs->force_roll = true; - if (rs->oinfo == &XFS_RMAP_OINFO_COW) { /* - * If we're unmapping CoW staging extents, remove the + * t0: Unmapping CoW staging extents, remove the * records from the refcountbt, which will remove the * rmap record as well. */ xfs_refcount_free_cow_extent(sc->tp, false, fsbno, *aglenp); + xreap_inc_defer(rs); return 0; } - return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, - *aglenp, rs->oinfo); + /* t1: unmap crosslinked metadata blocks */ + xfs_rmap_free_extent(sc->tp, false, fsbno, *aglenp, + rs->oinfo->oi_owner); + xreap_inc_defer(rs); + return 0; } trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), agbno, *aglenp); @@ -443,12 +466,12 @@ xreap_agextent_iter( */ xreap_agextent_binval(rs, agbno, aglenp); if (*aglenp == 0) { - ASSERT(xreap_want_roll(rs)); + ASSERT(xreap_want_binval_roll(rs)); return 0; } /* - * If we're getting rid of CoW staging extents, use deferred work items + * t2: To get rid of CoW staging extents, use deferred work items * to remove the refcountbt records (which removes the rmap records) * and free the extent. We're not worried about the system going down * here because log recovery walks the refcount btree to clean out the @@ -463,23 +486,23 @@ xreap_agextent_iter( if (error) return error; - rs->force_roll = true; + xreap_inc_defer(rs); return 0; } - /* Put blocks back on the AGFL one at a time. */ + /* t3: Put blocks back on the AGFL one at a time. */ if (rs->resv == XFS_AG_RESV_AGFL) { ASSERT(*aglenp == 1); error = xreap_put_freelist(sc, agbno); if (error) return error; - rs->force_roll = true; + xreap_force_defer_finish(rs); return 0; } /* - * Use deferred frees to get rid of the old btree blocks to try to + * t4: Use deferred frees to get rid of the old btree blocks to try to * minimize the window in which we could crash and lose the old blocks. * Add a defer ops barrier every other extent to avoid stressing the * system with large EFIs. @@ -489,12 +512,194 @@ xreap_agextent_iter( if (error) return error; - rs->deferred++; - if (rs->deferred % 2 == 0) + xreap_inc_defer(rs); + if (rs->nr_deferred % 2 == 0) xfs_defer_add_barrier(sc->tp); return 0; } +/* Configure the deferral and invalidation limits */ +static inline void +xreap_configure_limits( + struct xreap_state *rs, + unsigned int fixed_overhead, + unsigned int variable_overhead, + unsigned int per_intent, + unsigned int per_binval) +{ + struct xfs_scrub *sc = rs->sc; + unsigned int res = sc->tp->t_log_res - fixed_overhead; + + /* Don't underflow the reservation */ + if (sc->tp->t_log_res < (fixed_overhead + variable_overhead)) { + ASSERT(sc->tp->t_log_res >= + (fixed_overhead + variable_overhead)); + xfs_force_shutdown(sc->mp, SHUTDOWN_CORRUPT_INCORE); + return; + } + + rs->max_deferred = per_intent ? res / variable_overhead : 0; + res -= rs->max_deferred * per_intent; + rs->max_binval = per_binval ? res / per_binval : 0; +} + +/* + * Compute the maximum number of intent items that reaping can attach to the + * scrub transaction given the worst case log overhead of the intent items + * needed to reap a single per-AG space extent. This is not for freeing CoW + * staging extents. + */ +STATIC void +xreap_configure_agextent_limits( + struct xreap_state *rs) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_mount *mp = sc->mp; + + /* + * In the worst case, relogging an intent item causes both an intent + * item and a done item to be attached to a transaction for each extent + * that we'd like to process. + */ + const unsigned int efi = xfs_efi_log_space(1) + + xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1) + + xfs_rud_log_space(); + + /* + * Various things can happen when reaping non-CoW metadata blocks: + * + * t1: Unmapping crosslinked metadata blocks: deferred removal of rmap + * record. + * + * t3: Freeing to AGFL: roll and finish deferred items for every block. + * Limits here do not matter. + * + * t4: Freeing metadata blocks: deferred freeing of the space, which + * also removes the rmap record. + * + * For simplicity, we'll use the worst-case intents size to determine + * the maximum number of deferred extents before we have to finish the + * whole chain. If we're trying to reap a btree larger than this size, + * a crash midway through reaping can result in leaked blocks. + */ + const unsigned int t1 = rui; + const unsigned int t4 = rui + efi; + const unsigned int per_intent = max(t1, t4); + + /* + * For each transaction in a reap chain, we must be able to take one + * step in the defer item chain, which should only consist of EFI or + * RUI items. + */ + const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1); + const unsigned int step_size = max(f1, f2); + + /* Largest buffer size (in fsblocks) that can be invalidated. */ + const unsigned int max_binval = xrep_binval_max_fsblocks(mp); + + /* Maximum overhead of invalidating one buffer. */ + const unsigned int per_binval = + xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval)); + + /* + * For each transaction in a reap chain, we can delete some number of + * extents and invalidate some number of blocks. We assume that btree + * blocks aren't usually contiguous; and that scrub likely pulled all + * the buffers into memory. From these assumptions, set the maximum + * number of deferrals we can queue before flushing the defer chain, + * and the number of invalidations we can queue before rolling to a + * clean transaction (and possibly relogging some of the deferrals) to + * the same quantity. + */ + const unsigned int variable_overhead = per_intent + per_binval; + + xreap_configure_limits(rs, step_size, variable_overhead, per_intent, + per_binval); + + trace_xreap_agextent_limits(sc->tp, per_binval, rs->max_binval, + step_size, per_intent, rs->max_deferred); +} + +/* + * Compute the maximum number of intent items that reaping can attach to the + * scrub transaction given the worst case log overhead of the intent items + * needed to reap a single CoW staging extent. This is not for freeing + * metadata blocks. + */ +STATIC void +xreap_configure_agcow_limits( + struct xreap_state *rs) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_mount *mp = sc->mp; + + /* + * In the worst case, relogging an intent item causes both an intent + * item and a done item to be attached to a transaction for each extent + * that we'd like to process. + */ + const unsigned int efi = xfs_efi_log_space(1) + + xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1) + + xfs_rud_log_space(); + const unsigned int cui = xfs_cui_log_space(1) + + xfs_cud_log_space(); + + /* + * Various things can happen when reaping non-CoW metadata blocks: + * + * t0: Unmapping crosslinked CoW blocks: deferred removal of refcount + * record, which defers removal of rmap record + * + * t2: Freeing CoW blocks: deferred removal of refcount record, which + * defers removal of rmap record; and deferred removal of the space + * + * For simplicity, we'll use the worst-case intents size to determine + * the maximum number of deferred extents before we have to finish the + * whole chain. If we're trying to reap a btree larger than this size, + * a crash midway through reaping can result in leaked blocks. + */ + const unsigned int t0 = cui + rui; + const unsigned int t2 = cui + rui + efi; + const unsigned int per_intent = max(t0, t2); + + /* + * For each transaction in a reap chain, we must be able to take one + * step in the defer item chain, which should only consist of CUI, EFI, + * or RUI items. + */ + const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1); + const unsigned int f3 = xfs_calc_finish_cui_reservation(mp, 1); + const unsigned int step_size = max3(f1, f2, f3); + + /* Largest buffer size (in fsblocks) that can be invalidated. */ + const unsigned int max_binval = xrep_binval_max_fsblocks(mp); + + /* Overhead of invalidating one buffer */ + const unsigned int per_binval = + xfs_buf_inval_log_space(1, XFS_B_TO_FSBT(mp, max_binval)); + + /* + * For each transaction in a reap chain, we can delete some number of + * extents and invalidate some number of blocks. We assume that CoW + * staging extents are usually more than 1 fsblock, and that there + * shouldn't be any buffers for those blocks. From the assumptions, + * set the number of deferrals to use as much of the reservation as + * it can, but leave space to invalidate 1/8th that number of buffers. + */ + const unsigned int variable_overhead = per_intent + + (per_binval / 8); + + xreap_configure_limits(rs, step_size, variable_overhead, per_intent, + per_binval); + + trace_xreap_agcow_limits(sc->tp, per_binval, rs->max_binval, step_size, + per_intent, rs->max_deferred); +} + /* * Break an AG metadata extent into sub-extents by fate (crosslinked, not * crosslinked), and dispose of each sub-extent separately. @@ -531,11 +736,11 @@ xreap_agmeta_extent( if (error) return error; xreap_defer_finish_reset(rs); - } else if (xreap_want_roll(rs)) { + } else if (xreap_want_binval_roll(rs)) { error = xrep_roll_ag_trans(sc); if (error) return error; - xreap_reset(rs); + xreap_binval_reset(rs); } agbno += aglen; @@ -562,11 +767,12 @@ xrep_reap_agblocks( ASSERT(xfs_has_rmapbt(sc->mp)); ASSERT(sc->ip == NULL); + xreap_configure_agextent_limits(&rs); error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs); if (error) return error; - if (xreap_dirty(&rs)) + if (xreap_is_dirty(&rs)) return xrep_defer_finish(sc); return 0; @@ -628,7 +834,7 @@ xreap_fsmeta_extent( if (error) goto out_agf; xreap_defer_finish_reset(rs); - } else if (xreap_want_roll(rs)) { + } else if (xreap_want_binval_roll(rs)) { /* * Hold the AGF buffer across the transaction roll so * that we don't have to reattach it to the scrub @@ -639,7 +845,7 @@ xreap_fsmeta_extent( xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); if (error) goto out_agf; - xreap_reset(rs); + xreap_binval_reset(rs); } agbno += aglen; @@ -674,11 +880,15 @@ xrep_reap_fsblocks( ASSERT(xfs_has_rmapbt(sc->mp)); ASSERT(sc->ip != NULL); + if (oinfo == &XFS_RMAP_OINFO_COW) + xreap_configure_agcow_limits(&rs); + else + xreap_configure_agextent_limits(&rs); error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); if (error) return error; - if (xreap_dirty(&rs)) + if (xreap_is_dirty(&rs)) return xrep_defer_finish(sc); return 0; @@ -770,7 +980,7 @@ xreap_rgextent_iter( rtbno = xfs_rgbno_to_rtb(sc->sr.rtg, rgbno); /* - * If there are other rmappings, this block is cross linked and must + * t1: There are other rmappings; this block is cross linked and must * not be freed. Remove the forward and reverse mapping and move on. */ if (crosslinked) { @@ -778,14 +988,14 @@ xreap_rgextent_iter( *rglenp); xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp); - rs->deferred++; + xreap_inc_defer(rs); return 0; } trace_xreap_dispose_free_extent(rtg_group(sc->sr.rtg), rgbno, *rglenp); /* - * The CoW staging extent is not crosslinked. Use deferred work items + * t2: The CoW staging extent is not crosslinked. Use deferred work * to remove the refcountbt records (which removes the rmap records) * and free the extent. We're not worried about the system going down * here because log recovery walks the refcount btree to clean out the @@ -799,10 +1009,73 @@ xreap_rgextent_iter( if (error) return error; - rs->deferred++; + xreap_inc_defer(rs); return 0; } +/* + * Compute the maximum number of intent items that reaping can attach to the + * scrub transaction given the worst case log overhead of the intent items + * needed to reap a single CoW staging extent. This is not for freeing + * metadata blocks. + */ +STATIC void +xreap_configure_rgcow_limits( + struct xreap_state *rs) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_mount *mp = sc->mp; + + /* + * In the worst case, relogging an intent item causes both an intent + * item and a done item to be attached to a transaction for each extent + * that we'd like to process. + */ + const unsigned int efi = xfs_efi_log_space(1) + + xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1) + + xfs_rud_log_space(); + const unsigned int cui = xfs_cui_log_space(1) + + xfs_cud_log_space(); + + /* + * Various things can happen when reaping non-CoW metadata blocks: + * + * t1: Unmapping crosslinked CoW blocks: deferred removal of refcount + * record, which defers removal of rmap record + * + * t2: Freeing CoW blocks: deferred removal of refcount record, which + * defers removal of rmap record; and deferred removal of the space + * + * For simplicity, we'll use the worst-case intents size to determine + * the maximum number of deferred extents before we have to finish the + * whole chain. If we're trying to reap a btree larger than this size, + * a crash midway through reaping can result in leaked blocks. + */ + const unsigned int t1 = cui + rui; + const unsigned int t2 = cui + rui + efi; + const unsigned int per_intent = max(t1, t2); + + /* + * For each transaction in a reap chain, we must be able to take one + * step in the defer item chain, which should only consist of CUI, EFI, + * or RUI items. + */ + const unsigned int f1 = xfs_calc_finish_rt_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rt_rui_reservation(mp, 1); + const unsigned int f3 = xfs_calc_finish_rt_cui_reservation(mp, 1); + const unsigned int step_size = max3(f1, f2, f3); + + /* + * The only buffer for the rt device is the rtgroup super, so we don't + * need to save space for buffer invalidations. + */ + xreap_configure_limits(rs, step_size, per_intent, per_intent, 0); + + trace_xreap_rgcow_limits(sc->tp, 0, 0, step_size, per_intent, + rs->max_deferred); +} + #define XREAP_RTGLOCK_ALL (XFS_RTGLOCK_BITMAP | \ XFS_RTGLOCK_RMAP | \ XFS_RTGLOCK_REFCOUNT) @@ -855,11 +1128,11 @@ xreap_rtmeta_extent( if (error) goto out_unlock; xreap_defer_finish_reset(rs); - } else if (xreap_want_roll(rs)) { + } else if (xreap_want_binval_roll(rs)) { error = xfs_trans_roll_inode(&sc->tp, sc->ip); if (error) goto out_unlock; - xreap_reset(rs); + xreap_binval_reset(rs); } rgbno += rglen; @@ -891,12 +1164,14 @@ xrep_reap_rtblocks( ASSERT(xfs_has_rmapbt(sc->mp)); ASSERT(sc->ip != NULL); + ASSERT(oinfo == &XFS_RMAP_OINFO_COW); + xreap_configure_rgcow_limits(&rs); error = xrtb_bitmap_walk(bitmap, xreap_rtmeta_extent, &rs); if (error) return error; - if (xreap_dirty(&rs)) + if (xreap_is_dirty(&rs)) return xrep_defer_finish(sc); return 0; @@ -929,13 +1204,13 @@ xrep_reap_metadir_fsblocks( ASSERT(sc->ip != NULL); ASSERT(xfs_is_metadir_inode(sc->ip)); + xreap_configure_agextent_limits(&rs); xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK); - error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); if (error) return error; - if (xreap_dirty(&rs)) { + if (xreap_is_dirty(&rs)) { error = xrep_defer_finish(sc); if (error) return error; @@ -955,13 +1230,12 @@ xrep_reap_metadir_fsblocks( */ STATIC int xreap_bmapi_select( - struct xfs_scrub *sc, - struct xfs_inode *ip, - int whichfork, + struct xreap_state *rs, struct xfs_bmbt_irec *imap, bool *crosslinked) { struct xfs_owner_info oinfo; + struct xfs_scrub *sc = rs->sc; struct xfs_btree_cur *cur; xfs_filblks_t len = 1; xfs_agblock_t bno; @@ -975,7 +1249,8 @@ xreap_bmapi_select( cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, sc->sa.pag); - xfs_rmap_ino_owner(&oinfo, ip->i_ino, whichfork, imap->br_startoff); + xfs_rmap_ino_owner(&oinfo, rs->ip->i_ino, rs->whichfork, + imap->br_startoff); error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked); if (error) goto out_cur; @@ -1038,21 +1313,19 @@ xreap_buf_loggable( */ STATIC int xreap_bmapi_binval( - struct xfs_scrub *sc, - struct xfs_inode *ip, - int whichfork, + struct xreap_state *rs, struct xfs_bmbt_irec *imap) { + struct xfs_scrub *sc = rs->sc; struct xfs_mount *mp = sc->mp; struct xfs_perag *pag = sc->sa.pag; - int bmap_flags = xfs_bmapi_aflag(whichfork); + int bmap_flags = xfs_bmapi_aflag(rs->whichfork); xfs_fileoff_t off; xfs_fileoff_t max_off; xfs_extlen_t scan_blocks; xfs_agblock_t bno; xfs_agblock_t agbno; xfs_agblock_t agbno_next; - unsigned int invalidated = 0; int error; /* @@ -1079,7 +1352,7 @@ xreap_bmapi_binval( struct xfs_bmbt_irec hmap; int nhmaps = 1; - error = xfs_bmapi_read(ip, off, max_off - off, &hmap, + error = xfs_bmapi_read(rs->ip, off, max_off - off, &hmap, &nhmaps, bmap_flags); if (error) return error; @@ -1120,14 +1393,13 @@ xreap_bmapi_binval( xfs_buf_stale(bp); xfs_buf_relse(bp); } - invalidated++; /* * Stop invalidating if we've hit the limit; we should * still have enough reservation left to free however - * much of the mapping we've seen so far. + * far we've gotten. */ - if (invalidated > XREAP_MAX_BINVAL) { + if (!xreap_inc_binval(rs)) { imap->br_blockcount = agbno_next - bno; goto out; } @@ -1149,12 +1421,11 @@ out: */ STATIC int xrep_reap_bmapi_iter( - struct xfs_scrub *sc, - struct xfs_inode *ip, - int whichfork, + struct xreap_state *rs, struct xfs_bmbt_irec *imap, bool crosslinked) { + struct xfs_scrub *sc = rs->sc; int error; if (crosslinked) { @@ -1171,14 +1442,14 @@ xrep_reap_bmapi_iter( imap->br_blockcount); /* - * Schedule removal of the mapping from the fork. We use + * t0: Schedule removal of the mapping from the fork. We use * deferred log intents in this function to control the exact * sequence of metadata updates. */ - xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap); - xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT, + xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap); + xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT, -(int64_t)imap->br_blockcount); - xfs_rmap_unmap_extent(sc->tp, ip, whichfork, imap); + xfs_rmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap); return 0; } @@ -1199,41 +1470,139 @@ xrep_reap_bmapi_iter( * transaction is full of logged buffer invalidations, so we need to * return early so that we can roll and retry. */ - error = xreap_bmapi_binval(sc, ip, whichfork, imap); + error = xreap_bmapi_binval(rs, imap); if (error || imap->br_blockcount == 0) return error; /* - * Schedule removal of the mapping from the fork. We use deferred log - * intents in this function to control the exact sequence of metadata + * t1: Schedule removal of the mapping from the fork. We use deferred + * work in this function to control the exact sequence of metadata * updates. */ - xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap); - xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT, + xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap); + xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT, -(int64_t)imap->br_blockcount); return xfs_free_extent_later(sc->tp, imap->br_startblock, imap->br_blockcount, NULL, XFS_AG_RESV_NONE, XFS_FREE_EXTENT_SKIP_DISCARD); } +/* Compute the maximum mapcount of a file buffer. */ +static unsigned int +xreap_bmapi_binval_mapcount( + struct xfs_scrub *sc) +{ + /* directory blocks can span multiple fsblocks and be discontiguous */ + if (sc->sm->sm_type == XFS_SCRUB_TYPE_DIR) + return sc->mp->m_dir_geo->fsbcount; + + /* all other file xattr/symlink blocks must be contiguous */ + return 1; +} + +/* Compute the maximum block size of a file buffer. */ +static unsigned int +xreap_bmapi_binval_blocksize( + struct xfs_scrub *sc) +{ + switch (sc->sm->sm_type) { + case XFS_SCRUB_TYPE_DIR: + return sc->mp->m_dir_geo->blksize; + case XFS_SCRUB_TYPE_XATTR: + case XFS_SCRUB_TYPE_PARENT: + /* + * The xattr structure itself consists of single fsblocks, but + * there could be remote xattr blocks to invalidate. + */ + return XFS_XATTR_SIZE_MAX; + } + + /* everything else is a single block */ + return sc->mp->m_sb.sb_blocksize; +} + +/* + * Compute the maximum number of buffer invalidations that we can do while + * reaping a single extent from a file fork. + */ +STATIC void +xreap_configure_bmapi_limits( + struct xreap_state *rs) +{ + struct xfs_scrub *sc = rs->sc; + struct xfs_mount *mp = sc->mp; + + /* overhead of invalidating a buffer */ + const unsigned int per_binval = + xfs_buf_inval_log_space(xreap_bmapi_binval_mapcount(sc), + xreap_bmapi_binval_blocksize(sc)); + + /* + * In the worst case, relogging an intent item causes both an intent + * item and a done item to be attached to a transaction for each extent + * that we'd like to process. + */ + const unsigned int efi = xfs_efi_log_space(1) + + xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1) + + xfs_rud_log_space(); + const unsigned int bui = xfs_bui_log_space(1) + + xfs_bud_log_space(); + + /* + * t1: Unmapping crosslinked file data blocks: one bmap deletion, + * possibly an EFI for underfilled bmbt blocks, and an rmap deletion. + * + * t2: Freeing freeing file data blocks: one bmap deletion, possibly an + * EFI for underfilled bmbt blocks, and another EFI for the space + * itself. + */ + const unsigned int t1 = (bui + efi) + rui; + const unsigned int t2 = (bui + efi) + efi; + const unsigned int per_intent = max(t1, t2); + + /* + * For each transaction in a reap chain, we must be able to take one + * step in the defer item chain, which should only consist of CUI, EFI, + * or RUI items. + */ + const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1); + const unsigned int f3 = xfs_calc_finish_bui_reservation(mp, 1); + const unsigned int step_size = max3(f1, f2, f3); + + /* + * Each call to xreap_ifork_extent starts with a clean transaction and + * operates on a single mapping by creating a chain of log intent items + * for that mapping. We need to leave enough reservation in the + * transaction to log btree buffer and inode updates for each step in + * the chain, and to relog the log intents. + */ + const unsigned int per_extent_res = per_intent + step_size; + + xreap_configure_limits(rs, per_extent_res, per_binval, 0, per_binval); + + trace_xreap_bmapi_limits(sc->tp, per_binval, rs->max_binval, + step_size, per_intent, 1); +} + /* * Dispose of as much of this file extent as we can. Upon successful return, * the imap will reflect the mapping that was removed from the fork. */ STATIC int xreap_ifork_extent( - struct xfs_scrub *sc, - struct xfs_inode *ip, - int whichfork, + struct xreap_state *rs, struct xfs_bmbt_irec *imap) { + struct xfs_scrub *sc = rs->sc; xfs_agnumber_t agno; bool crosslinked; int error; ASSERT(sc->sa.pag == NULL); - trace_xreap_ifork_extent(sc, ip, whichfork, imap); + trace_xreap_ifork_extent(sc, rs->ip, rs->whichfork, imap); agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock); sc->sa.pag = xfs_perag_get(sc->mp, agno); @@ -1248,11 +1617,11 @@ xreap_ifork_extent( * Decide the fate of the blocks at the beginning of the mapping, then * update the mapping to use it with the unmap calls. */ - error = xreap_bmapi_select(sc, ip, whichfork, imap, &crosslinked); + error = xreap_bmapi_select(rs, imap, &crosslinked); if (error) goto out_agf; - error = xrep_reap_bmapi_iter(sc, ip, whichfork, imap, crosslinked); + error = xrep_reap_bmapi_iter(rs, imap, crosslinked); if (error) goto out_agf; @@ -1276,6 +1645,11 @@ xrep_reap_ifork( struct xfs_inode *ip, int whichfork) { + struct xreap_state rs = { + .sc = sc, + .ip = ip, + .whichfork = whichfork, + }; xfs_fileoff_t off = 0; int bmap_flags = xfs_bmapi_aflag(whichfork); int error; @@ -1284,6 +1658,7 @@ xrep_reap_ifork( ASSERT(ip == sc->ip || ip == sc->tempip); ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip)); + xreap_configure_bmapi_limits(&rs); while (off < XFS_MAX_FILEOFF) { struct xfs_bmbt_irec imap; int nimaps = 1; @@ -1303,13 +1678,14 @@ xrep_reap_ifork( * can in a single transaction. */ if (xfs_bmap_is_real_extent(&imap)) { - error = xreap_ifork_extent(sc, ip, whichfork, &imap); + error = xreap_ifork_extent(&rs, &imap); if (error) return error; error = xfs_defer_finish(&sc->tp); if (error) return error; + xreap_defer_finish_reset(&rs); } off = imap.br_startoff + imap.br_blockcount; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index f8f9ed30f56b..efd5a7ccdf62 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -1110,7 +1110,7 @@ xrep_will_attempt( return true; /* Let debug users force us into the repair routines. */ - if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) + if (XFS_TEST_ERROR(sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) return true; /* Metadata is corrupt or failed cross-referencing. */ @@ -1269,42 +1269,6 @@ xrep_setup_xfbtree( } /* - * Create a dummy transaction for use in a live update hook function. This - * function MUST NOT be called from regular repair code because the current - * process' transaction is saved via the cookie. - */ -int -xrep_trans_alloc_hook_dummy( - struct xfs_mount *mp, - void **cookiep, - struct xfs_trans **tpp) -{ - int error; - - *cookiep = current->journal_info; - current->journal_info = NULL; - - error = xfs_trans_alloc_empty(mp, tpp); - if (!error) - return 0; - - current->journal_info = *cookiep; - *cookiep = NULL; - return error; -} - -/* Cancel a dummy transaction used by a live update hook function. */ -void -xrep_trans_cancel_hook_dummy( - void **cookiep, - struct xfs_trans *tp) -{ - xfs_trans_cancel(tp); - current->journal_info = *cookiep; - *cookiep = NULL; -} - -/* * See if this buffer can pass the given ->verify_struct() function. * * If the buffer already has ops attached and they're not the ones that were diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index af0a3a9e5ed9..2bb125c4f9bf 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -18,14 +18,6 @@ static inline int xrep_notsupported(struct xfs_scrub *sc) #ifdef CONFIG_XFS_ONLINE_REPAIR -/* - * This is the maximum number of deferred extent freeing item extents (EFIs) - * that we'll attach to a transaction without rolling the transaction to avoid - * overrunning a tr_itruncate reservation. - */ -#define XREP_MAX_ITRUNCATE_EFIS (128) - - /* Repair helpers */ int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run); @@ -180,10 +172,6 @@ int xrep_quotacheck(struct xfs_scrub *sc); int xrep_reinit_pagf(struct xfs_scrub *sc); int xrep_reinit_pagi(struct xfs_scrub *sc); -int xrep_trans_alloc_hook_dummy(struct xfs_mount *mp, void **cookiep, - struct xfs_trans **tpp); -void xrep_trans_cancel_hook_dummy(void **cookiep, struct xfs_trans *tp); - bool xrep_buf_verify_struct(struct xfs_buf *bp, const struct xfs_buf_ops *ops); void xrep_inode_set_nblocks(struct xfs_scrub *sc, int64_t new_blocks); int xrep_reset_metafile_resv(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c index f5f73078ffe2..17d4a38d735c 100644 --- a/fs/xfs/scrub/rmap_repair.c +++ b/fs/xfs/scrub/rmap_repair.c @@ -951,9 +951,7 @@ end_agscan: sa->agf_bp = NULL; sa->agi_bp = NULL; xchk_trans_cancel(sc); - error = xchk_trans_alloc_empty(sc); - if (error) - return error; + xchk_trans_alloc_empty(sc); /* Iterate all AGs for inodes rmaps. */ while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) { @@ -1612,7 +1610,6 @@ xrep_rmapbt_live_update( struct xfs_mount *mp; struct xfs_btree_cur *mcur; struct xfs_trans *tp; - void *txcookie; int error; rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb); @@ -1623,9 +1620,7 @@ xrep_rmapbt_live_update( trace_xrep_rmap_live_update(pag_group(rr->sc->sa.pag), action, p); - error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp); - if (error) - goto out_abort; + tp = xfs_trans_alloc_empty(mp); mutex_lock(&rr->lock); mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree); @@ -1639,14 +1634,13 @@ xrep_rmapbt_live_update( if (error) goto out_cancel; - xrep_trans_cancel_hook_dummy(&txcookie, tp); + xfs_trans_cancel(tp); mutex_unlock(&rr->lock); return NOTIFY_DONE; out_cancel: xfbtree_trans_cancel(&rr->rmap_btree, tp); - xrep_trans_cancel_hook_dummy(&txcookie, tp); -out_abort: + xfs_trans_cancel(tp); mutex_unlock(&rr->lock); xchk_iscan_abort(&rr->iscan); out_unlock: diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c index fc2592c53af5..7561941a337a 100644 --- a/fs/xfs/scrub/rtrmap_repair.c +++ b/fs/xfs/scrub/rtrmap_repair.c @@ -580,9 +580,7 @@ xrep_rtrmap_find_rmaps( */ xchk_trans_cancel(sc); xchk_rtgroup_unlock(&sc->sr); - error = xchk_trans_alloc_empty(sc); - if (error) - return error; + xchk_trans_alloc_empty(sc); while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) { error = xrep_rtrmap_scan_inode(rr, ip); @@ -846,7 +844,6 @@ xrep_rtrmapbt_live_update( struct xfs_mount *mp; struct xfs_btree_cur *mcur; struct xfs_trans *tp; - void *txcookie; int error; rr = container_of(nb, struct xrep_rtrmap, rhook.rmap_hook.nb); @@ -857,9 +854,7 @@ xrep_rtrmapbt_live_update( trace_xrep_rmap_live_update(rtg_group(rr->sc->sr.rtg), action, p); - error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp); - if (error) - goto out_abort; + tp = xfs_trans_alloc_empty(mp); mutex_lock(&rr->lock); mcur = xfs_rtrmapbt_mem_cursor(rr->sc->sr.rtg, tp, &rr->rtrmap_btree); @@ -873,14 +868,13 @@ xrep_rtrmapbt_live_update( if (error) goto out_cancel; - xrep_trans_cancel_hook_dummy(&txcookie, tp); + xfs_trans_cancel(tp); mutex_unlock(&rr->lock); return NOTIFY_DONE; out_cancel: xfbtree_trans_cancel(&rr->rtrmap_btree, tp); - xrep_trans_cancel_hook_dummy(&txcookie, tp); -out_abort: + xfs_trans_cancel(tp); xchk_iscan_abort(&rr->iscan); mutex_unlock(&rr->lock); out_unlock: diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 9908850bf76f..3c3b0d25006f 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -680,8 +680,6 @@ xfs_scrub_metadata( if (error) goto out; - xfs_warn_experimental(mp, XFS_EXPERIMENTAL_SCRUB); - sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS); if (!sc) { error = -ENOMEM; @@ -878,10 +876,7 @@ xchk_scrubv_open_by_handle( struct xfs_inode *ip; int error; - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - return NULL; - + tp = xfs_trans_alloc_empty(mp); error = xfs_iget(mp, tp, head->svh_ino, XCHK_IGET_FLAGS, 0, &ip); xfs_trans_cancel(tp); if (error) diff --git a/fs/xfs/scrub/symlink_repair.c b/fs/xfs/scrub/symlink_repair.c index 953ce7be78dc..df629892462f 100644 --- a/fs/xfs/scrub/symlink_repair.c +++ b/fs/xfs/scrub/symlink_repair.c @@ -184,8 +184,8 @@ xrep_symlink_salvage_inline( sc->ip->i_disk_size == 1 && old_target[0] == '?') return 0; - nr = min(XFS_SYMLINK_MAXLEN, xfs_inode_data_fork_size(ip)); - strncpy(target_buf, ifp->if_data, nr); + nr = min(XFS_SYMLINK_MAXLEN, ifp->if_bytes); + memcpy(target_buf, ifp->if_data, nr); return nr; } diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 2450e214103f..987313a52e64 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -22,6 +22,7 @@ #include "xfs_parent.h" #include "xfs_metafile.h" #include "xfs_rtgroup.h" +#include "xfs_trans.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index d7c4ced47c15..39ea651cbb75 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -479,7 +479,7 @@ DECLARE_EVENT_CLASS(xchk_dqiter_class, __field(xfs_exntst_t, state) ), TP_fast_assign( - __entry->dev = cursor->sc->ip->i_mount->m_super->s_dev; + __entry->dev = cursor->sc->mp->m_super->s_dev; __entry->dqtype = cursor->dqtype; __entry->ino = cursor->quota_ip->i_ino; __entry->cur_id = cursor->id; @@ -2000,6 +2000,51 @@ DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval); DEFINE_REPAIR_EXTENT_EVENT(xreap_bmapi_binval); DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert); +DECLARE_EVENT_CLASS(xrep_reap_limits_class, + TP_PROTO(const struct xfs_trans *tp, unsigned int per_binval, + unsigned int max_binval, unsigned int step_size, + unsigned int per_intent, + unsigned int max_deferred), + TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, log_res) + __field(unsigned int, per_binval) + __field(unsigned int, max_binval) + __field(unsigned int, step_size) + __field(unsigned int, per_intent) + __field(unsigned int, max_deferred) + ), + TP_fast_assign( + __entry->dev = tp->t_mountp->m_super->s_dev; + __entry->log_res = tp->t_log_res; + __entry->per_binval = per_binval; + __entry->max_binval = max_binval; + __entry->step_size = step_size; + __entry->per_intent = per_intent; + __entry->max_deferred = max_deferred; + ), + TP_printk("dev %d:%d logres %u per_binval %u max_binval %u step_size %u per_intent %u max_deferred %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->log_res, + __entry->per_binval, + __entry->max_binval, + __entry->step_size, + __entry->per_intent, + __entry->max_deferred) +); +#define DEFINE_REPAIR_REAP_LIMITS_EVENT(name) \ +DEFINE_EVENT(xrep_reap_limits_class, name, \ + TP_PROTO(const struct xfs_trans *tp, unsigned int per_binval, \ + unsigned int max_binval, unsigned int step_size, \ + unsigned int per_intent, \ + unsigned int max_deferred), \ + TP_ARGS(tp, per_binval, max_binval, step_size, per_intent, max_deferred)) +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agextent_limits); +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_agcow_limits); +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_rgcow_limits); +DEFINE_REPAIR_REAP_LIMITS_EVENT(xreap_bmapi_limits); + DECLARE_EVENT_CLASS(xrep_reap_find_class, TP_PROTO(const struct xfs_group *xg, xfs_agblock_t agbno, xfs_extlen_t len, bool crosslinked), @@ -2996,7 +3041,7 @@ DEFINE_EVENT(xrep_pptr_salvage_class, name, \ DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_salvage_pptr); DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_insert_pptr); -TRACE_EVENT(xrep_xattr_class, +DECLARE_EVENT_CLASS(xrep_xattr_class, TP_PROTO(struct xfs_inode *ip, struct xfs_inode *arg_ip), TP_ARGS(ip, arg_ip), TP_STRUCT__entry( diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c index cdd13ed9c569..ed2e8c64b1a8 100644 --- a/fs/xfs/scrub/xfarray.c +++ b/fs/xfs/scrub/xfarray.c @@ -834,7 +834,7 @@ xfarray_sort_scan( si->first_folio_idx = xfarray_idx(si->array, folio_pos(si->folio) + si->array->obj_size - 1); - next_pos = folio_pos(si->folio) + folio_size(si->folio); + next_pos = folio_next_pos(si->folio); si->last_folio_idx = xfarray_idx(si->array, next_pos - 1); if (xfarray_pos(si->array, si->last_folio_idx + 1) > next_pos) si->last_folio_idx--; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 26a04a783489..56a544638491 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -234,6 +234,47 @@ xfs_end_bio( } /* + * We cannot cancel the ioend directly on error. We may have already set other + * pages under writeback and hence we have to run I/O completion to mark the + * error state of the pages under writeback appropriately. + * + * If the folio has delalloc blocks on it, the caller is asking us to punch them + * out. If we don't, we can leave a stale delalloc mapping covered by a clean + * page that needs to be dirtied again before the delalloc mapping can be + * converted. This stale delalloc mapping can trip up a later direct I/O read + * operation on the same region. + * + * We prevent this by truncating away the delalloc regions on the folio. Because + * they are delalloc, we can do this without needing a transaction. Indeed - if + * we get ENOSPC errors, we have to be able to do this truncation without a + * transaction as there is no space left for block reservation (typically why + * we see a ENOSPC in writeback). + */ +static void +xfs_discard_folio( + struct folio *folio, + loff_t pos) +{ + struct xfs_inode *ip = XFS_I(folio->mapping->host); + struct xfs_mount *mp = ip->i_mount; + + if (xfs_is_shutdown(mp)) + return; + + xfs_alert_ratelimited(mp, + "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.", + folio, ip->i_ino, pos); + + /* + * The end of the punch range is always the offset of the first + * byte of the next folio. Hence the end offset is only dependent on the + * folio itself and not the start offset that is passed in. + */ + xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos, + folio_next_pos(folio), NULL); +} + +/* * Fast revalidation of the cached writeback mapping. Return true if the current * mapping is valid, false otherwise. */ @@ -278,13 +319,12 @@ xfs_imap_valid( static int xfs_map_blocks( struct iomap_writepage_ctx *wpc, - struct inode *inode, loff_t offset, unsigned int len) { - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(wpc->inode); struct xfs_mount *mp = ip->i_mount; - ssize_t count = i_blocksize(inode); + ssize_t count = i_blocksize(wpc->inode); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); xfs_fileoff_t cow_fsb; @@ -436,81 +476,78 @@ allocate_blocks: return 0; } -static int -xfs_submit_ioend( +static ssize_t +xfs_writeback_range( struct iomap_writepage_ctx *wpc, - int status) + struct folio *folio, + u64 offset, + unsigned int len, + u64 end_pos) { - struct iomap_ioend *ioend = wpc->ioend; - unsigned int nofs_flag; - - /* - * We can allocate memory here while doing writeback on behalf of - * memory reclaim. To avoid memory allocation deadlocks set the - * task-wide nofs context for the following operations. - */ - nofs_flag = memalloc_nofs_save(); + ssize_t ret; + + ret = xfs_map_blocks(wpc, offset, len); + if (!ret) + ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len); + if (ret < 0) + xfs_discard_folio(folio, offset); + return ret; +} - /* Convert CoW extents to regular */ - if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) { - status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), - ioend->io_offset, ioend->io_size); - } +static bool +xfs_ioend_needs_wq_completion( + struct iomap_ioend *ioend) +{ + /* Changing inode size requires a transaction. */ + if (xfs_ioend_is_append(ioend)) + return true; - memalloc_nofs_restore(nofs_flag); + /* Extent manipulation requires a transaction. */ + if (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED)) + return true; - /* send ioends that might require a transaction to the completion wq */ - if (xfs_ioend_is_append(ioend) || - (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))) - ioend->io_bio.bi_end_io = xfs_end_bio; + /* Page cache invalidation cannot be done in irq context. */ + if (ioend->io_flags & IOMAP_IOEND_DONTCACHE) + return true; - if (status) - return status; - submit_bio(&ioend->io_bio); - return 0; + return false; } -/* - * If the folio has delalloc blocks on it, the caller is asking us to punch them - * out. If we don't, we can leave a stale delalloc mapping covered by a clean - * page that needs to be dirtied again before the delalloc mapping can be - * converted. This stale delalloc mapping can trip up a later direct I/O read - * operation on the same region. - * - * We prevent this by truncating away the delalloc regions on the folio. Because - * they are delalloc, we can do this without needing a transaction. Indeed - if - * we get ENOSPC errors, we have to be able to do this truncation without a - * transaction as there is no space left for block reservation (typically why - * we see a ENOSPC in writeback). - */ -static void -xfs_discard_folio( - struct folio *folio, - loff_t pos) +static int +xfs_writeback_submit( + struct iomap_writepage_ctx *wpc, + int error) { - struct xfs_inode *ip = XFS_I(folio->mapping->host); - struct xfs_mount *mp = ip->i_mount; + struct iomap_ioend *ioend = wpc->wb_ctx; - if (xfs_is_shutdown(mp)) - return; + /* + * Convert CoW extents to regular. + * + * We can allocate memory here while doing writeback on behalf of memory + * reclaim. To avoid memory allocation deadlocks, set the task-wide + * nofs context. + */ + if (!error && (ioend->io_flags & IOMAP_IOEND_SHARED)) { + unsigned int nofs_flag; - xfs_alert_ratelimited(mp, - "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.", - folio, ip->i_ino, pos); + nofs_flag = memalloc_nofs_save(); + error = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), + ioend->io_offset, ioend->io_size); + memalloc_nofs_restore(nofs_flag); + } /* - * The end of the punch range is always the offset of the first - * byte of the next folio. Hence the end offset is only dependent on the - * folio itself and not the start offset that is passed in. + * Send ioends that might require a transaction to the completion wq. */ - xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos, - folio_pos(folio) + folio_size(folio), NULL); + if (xfs_ioend_needs_wq_completion(ioend)) + ioend->io_bio.bi_end_io = xfs_end_bio; + + return iomap_ioend_writeback_submit(wpc, error); } static const struct iomap_writeback_ops xfs_writeback_ops = { - .map_blocks = xfs_map_blocks, - .submit_ioend = xfs_submit_ioend, - .discard_folio = xfs_discard_folio, + .writeback_range = xfs_writeback_range, + .writeback_submit = xfs_writeback_submit, }; struct xfs_zoned_writepage_ctx { @@ -527,11 +564,10 @@ XFS_ZWPC(struct iomap_writepage_ctx *ctx) static int xfs_zoned_map_blocks( struct iomap_writepage_ctx *wpc, - struct inode *inode, loff_t offset, unsigned int len) { - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(wpc->inode); struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + len); @@ -590,22 +626,44 @@ xfs_zoned_map_blocks( return 0; } -static int -xfs_zoned_submit_ioend( +static ssize_t +xfs_zoned_writeback_range( struct iomap_writepage_ctx *wpc, - int status) + struct folio *folio, + u64 offset, + unsigned int len, + u64 end_pos) { - wpc->ioend->io_bio.bi_end_io = xfs_end_bio; - if (status) - return status; - xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->open_zone); + ssize_t ret; + + ret = xfs_zoned_map_blocks(wpc, offset, len); + if (!ret) + ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len); + if (ret < 0) + xfs_discard_folio(folio, offset); + return ret; +} + +static int +xfs_zoned_writeback_submit( + struct iomap_writepage_ctx *wpc, + int error) +{ + struct iomap_ioend *ioend = wpc->wb_ctx; + + ioend->io_bio.bi_end_io = xfs_end_bio; + if (error) { + ioend->io_bio.bi_status = errno_to_blk_status(error); + bio_endio(&ioend->io_bio); + return error; + } + xfs_zone_alloc_and_submit(ioend, &XFS_ZWPC(wpc)->open_zone); return 0; } static const struct iomap_writeback_ops xfs_zoned_writeback_ops = { - .map_blocks = xfs_zoned_map_blocks, - .submit_ioend = xfs_zoned_submit_ioend, - .discard_folio = xfs_discard_folio, + .writeback_range = xfs_zoned_writeback_range, + .writeback_submit = xfs_zoned_writeback_submit, }; STATIC int @@ -618,19 +676,29 @@ xfs_vm_writepages( xfs_iflags_clear(ip, XFS_ITRUNCATED); if (xfs_is_zoned_inode(ip)) { - struct xfs_zoned_writepage_ctx xc = { }; + struct xfs_zoned_writepage_ctx xc = { + .ctx = { + .inode = mapping->host, + .wbc = wbc, + .ops = &xfs_zoned_writeback_ops + }, + }; int error; - error = iomap_writepages(mapping, wbc, &xc.ctx, - &xfs_zoned_writeback_ops); + error = iomap_writepages(&xc.ctx); if (xc.open_zone) xfs_open_zone_put(xc.open_zone); return error; } else { - struct xfs_writepage_ctx wpc = { }; - - return iomap_writepages(mapping, wbc, &wpc.ctx, - &xfs_writeback_ops); + struct xfs_writepage_ctx wpc = { + .ctx = { + .inode = mapping->host, + .wbc = wbc, + .ops = &xfs_writeback_ops + }, + }; + + return iomap_writepages(&wpc.ctx); } } @@ -674,14 +742,15 @@ xfs_vm_read_folio( struct file *unused, struct folio *folio) { - return iomap_read_folio(folio, &xfs_read_iomap_ops); + iomap_bio_read_folio(folio, &xfs_read_iomap_ops); + return 0; } STATIC void xfs_vm_readahead( struct readahead_control *rac) { - iomap_readahead(rac, &xfs_read_iomap_ops); + iomap_bio_readahead(rac, &xfs_read_iomap_ops); } static int @@ -692,6 +761,9 @@ xfs_vm_swap_activate( { struct xfs_inode *ip = XFS_I(file_inode(swap_file)); + if (xfs_is_zoned_inode(ip)) + return -EINVAL; + /* * Swap file activation can race against concurrent shared extent * removal in files that have been cloned. If this happens, diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index f683b7a9323f..e8fa326ac995 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -91,41 +91,37 @@ xfs_attri_log_nameval_alloc( name_len + new_name_len + value_len + new_value_len); - nv->name.i_addr = nv + 1; - nv->name.i_len = name_len; - nv->name.i_type = XLOG_REG_TYPE_ATTR_NAME; - memcpy(nv->name.i_addr, name, name_len); + nv->name.iov_base = nv + 1; + nv->name.iov_len = name_len; + memcpy(nv->name.iov_base, name, name_len); if (new_name_len) { - nv->new_name.i_addr = nv->name.i_addr + name_len; - nv->new_name.i_len = new_name_len; - memcpy(nv->new_name.i_addr, new_name, new_name_len); + nv->new_name.iov_base = nv->name.iov_base + name_len; + nv->new_name.iov_len = new_name_len; + memcpy(nv->new_name.iov_base, new_name, new_name_len); } else { - nv->new_name.i_addr = NULL; - nv->new_name.i_len = 0; + nv->new_name.iov_base = NULL; + nv->new_name.iov_len = 0; } - nv->new_name.i_type = XLOG_REG_TYPE_ATTR_NEWNAME; if (value_len) { - nv->value.i_addr = nv->name.i_addr + name_len + new_name_len; - nv->value.i_len = value_len; - memcpy(nv->value.i_addr, value, value_len); + nv->value.iov_base = nv->name.iov_base + name_len + new_name_len; + nv->value.iov_len = value_len; + memcpy(nv->value.iov_base, value, value_len); } else { - nv->value.i_addr = NULL; - nv->value.i_len = 0; + nv->value.iov_base = NULL; + nv->value.iov_len = 0; } - nv->value.i_type = XLOG_REG_TYPE_ATTR_VALUE; if (new_value_len) { - nv->new_value.i_addr = nv->name.i_addr + name_len + + nv->new_value.iov_base = nv->name.iov_base + name_len + new_name_len + value_len; - nv->new_value.i_len = new_value_len; - memcpy(nv->new_value.i_addr, new_value, new_value_len); + nv->new_value.iov_len = new_value_len; + memcpy(nv->new_value.iov_base, new_value, new_value_len); } else { - nv->new_value.i_addr = NULL; - nv->new_value.i_len = 0; + nv->new_value.iov_base = NULL; + nv->new_value.iov_len = 0; } - nv->new_value.i_type = XLOG_REG_TYPE_ATTR_NEWVALUE; refcount_set(&nv->refcount, 1); return nv; @@ -170,21 +166,21 @@ xfs_attri_item_size( *nvecs += 2; *nbytes += sizeof(struct xfs_attri_log_format) + - xlog_calc_iovec_len(nv->name.i_len); + xlog_calc_iovec_len(nv->name.iov_len); - if (nv->new_name.i_len) { + if (nv->new_name.iov_len) { *nvecs += 1; - *nbytes += xlog_calc_iovec_len(nv->new_name.i_len); + *nbytes += xlog_calc_iovec_len(nv->new_name.iov_len); } - if (nv->value.i_len) { + if (nv->value.iov_len) { *nvecs += 1; - *nbytes += xlog_calc_iovec_len(nv->value.i_len); + *nbytes += xlog_calc_iovec_len(nv->value.iov_len); } - if (nv->new_value.i_len) { + if (nv->new_value.iov_len) { *nvecs += 1; - *nbytes += xlog_calc_iovec_len(nv->new_value.i_len); + *nbytes += xlog_calc_iovec_len(nv->new_value.iov_len); } } @@ -212,31 +208,36 @@ xfs_attri_item_format( * the log recovery. */ - ASSERT(nv->name.i_len > 0); + ASSERT(nv->name.iov_len > 0); attrip->attri_format.alfi_size++; - if (nv->new_name.i_len > 0) + if (nv->new_name.iov_len > 0) attrip->attri_format.alfi_size++; - if (nv->value.i_len > 0) + if (nv->value.iov_len > 0) attrip->attri_format.alfi_size++; - if (nv->new_value.i_len > 0) + if (nv->new_value.iov_len > 0) attrip->attri_format.alfi_size++; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT, &attrip->attri_format, sizeof(struct xfs_attri_log_format)); - xlog_copy_from_iovec(lv, &vecp, &nv->name); - if (nv->new_name.i_len > 0) - xlog_copy_from_iovec(lv, &vecp, &nv->new_name); + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NAME, nv->name.iov_base, + nv->name.iov_len); - if (nv->value.i_len > 0) - xlog_copy_from_iovec(lv, &vecp, &nv->value); + if (nv->new_name.iov_len > 0) + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NEWNAME, + nv->new_name.iov_base, nv->new_name.iov_len); - if (nv->new_value.i_len > 0) - xlog_copy_from_iovec(lv, &vecp, &nv->new_value); + if (nv->value.iov_len > 0) + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_VALUE, + nv->value.iov_base, nv->value.iov_len); + + if (nv->new_value.iov_len > 0) + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NEWVALUE, + nv->new_value.iov_base, nv->new_value.iov_len); } /* @@ -383,22 +384,22 @@ xfs_attr_log_item( attrp->alfi_ino = args->dp->i_ino; ASSERT(!(attr->xattri_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK)); attrp->alfi_op_flags = attr->xattri_op_flags; - attrp->alfi_value_len = nv->value.i_len; + attrp->alfi_value_len = nv->value.iov_len; switch (xfs_attr_log_item_op(attrp)) { case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE: - ASSERT(nv->value.i_len == nv->new_value.i_len); + ASSERT(nv->value.iov_len == nv->new_value.iov_len); attrp->alfi_igen = VFS_I(args->dp)->i_generation; - attrp->alfi_old_name_len = nv->name.i_len; - attrp->alfi_new_name_len = nv->new_name.i_len; + attrp->alfi_old_name_len = nv->name.iov_len; + attrp->alfi_new_name_len = nv->new_name.iov_len; break; case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE: case XFS_ATTRI_OP_FLAGS_PPTR_SET: attrp->alfi_igen = VFS_I(args->dp)->i_generation; fallthrough; default: - attrp->alfi_name_len = nv->name.i_len; + attrp->alfi_name_len = nv->name.iov_len; break; } @@ -490,7 +491,7 @@ xfs_attr_finish_item( /* Reset trans after EAGAIN cycle since the transaction is new */ args->trans = tp; - if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) { + if (XFS_TEST_ERROR(args->dp->i_mount, XFS_ERRTAG_LARP)) { error = -EIO; goto out; } @@ -616,10 +617,7 @@ xfs_attri_iread_extents( struct xfs_trans *tp; int error; - error = xfs_trans_alloc_empty(ip->i_mount, &tp); - if (error) - return error; - + tp = xfs_trans_alloc_empty(ip->i_mount); xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_iread_extents(tp, ip, XFS_ATTR_FORK); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -690,14 +688,14 @@ xfs_attri_recover_work( args->dp = ip; args->geo = mp->m_attr_geo; args->whichfork = XFS_ATTR_FORK; - args->name = nv->name.i_addr; - args->namelen = nv->name.i_len; - args->new_name = nv->new_name.i_addr; - args->new_namelen = nv->new_name.i_len; - args->value = nv->value.i_addr; - args->valuelen = nv->value.i_len; - args->new_value = nv->new_value.i_addr; - args->new_valuelen = nv->new_value.i_len; + args->name = nv->name.iov_base; + args->namelen = nv->name.iov_len; + args->new_name = nv->new_name.iov_base; + args->new_namelen = nv->new_name.iov_len; + args->value = nv->value.iov_base; + args->valuelen = nv->value.iov_len; + args->new_value = nv->new_value.iov_base; + args->new_valuelen = nv->new_value.iov_len; args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK; args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT | XFS_DA_OP_LOGGED; @@ -739,7 +737,7 @@ xfs_attr_recover_work( struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); struct xfs_attr_intent *attr; struct xfs_mount *mp = lip->li_log->l_mp; - struct xfs_inode *ip; + struct xfs_inode *ip = NULL; struct xfs_da_args *args; struct xfs_trans *tp; struct xfs_trans_res resv; @@ -754,8 +752,8 @@ xfs_attr_recover_work( */ attrp = &attrip->attri_format; if (!xfs_attri_validate(mp, attrp) || - !xfs_attr_namecheck(attrp->alfi_attr_filter, nv->name.i_addr, - nv->name.i_len)) + !xfs_attr_namecheck(attrp->alfi_attr_filter, nv->name.iov_base, + nv->name.iov_len)) return -EFSCORRUPTED; attr = xfs_attri_recover_work(mp, dfp, attrp, &ip, nv); @@ -953,50 +951,50 @@ static inline void * xfs_attri_validate_name_iovec( struct xfs_mount *mp, struct xfs_attri_log_format *attri_formatp, - const struct xfs_log_iovec *iovec, + const struct kvec *iovec, unsigned int name_len) { - if (iovec->i_len != xlog_calc_iovec_len(name_len)) { + if (iovec->iov_len != xlog_calc_iovec_len(name_len)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, sizeof(*attri_formatp)); return NULL; } - if (!xfs_attr_namecheck(attri_formatp->alfi_attr_filter, iovec->i_addr, + if (!xfs_attr_namecheck(attri_formatp->alfi_attr_filter, iovec->iov_base, name_len)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, sizeof(*attri_formatp)); XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - iovec->i_addr, iovec->i_len); + iovec->iov_base, iovec->iov_len); return NULL; } - return iovec->i_addr; + return iovec->iov_base; } static inline void * xfs_attri_validate_value_iovec( struct xfs_mount *mp, struct xfs_attri_log_format *attri_formatp, - const struct xfs_log_iovec *iovec, + const struct kvec *iovec, unsigned int value_len) { - if (iovec->i_len != xlog_calc_iovec_len(value_len)) { + if (iovec->iov_len != xlog_calc_iovec_len(value_len)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, sizeof(*attri_formatp)); return NULL; } if ((attri_formatp->alfi_attr_filter & XFS_ATTR_PARENT) && - !xfs_parent_valuecheck(mp, iovec->i_addr, value_len)) { + !xfs_parent_valuecheck(mp, iovec->iov_base, value_len)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, sizeof(*attri_formatp)); XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - iovec->i_addr, iovec->i_len); + iovec->iov_base, iovec->iov_len); return NULL; } - return iovec->i_addr; + return iovec->iov_base; } STATIC int @@ -1023,13 +1021,13 @@ xlog_recover_attri_commit_pass2( /* Validate xfs_attri_log_format before the large memory allocation */ len = sizeof(struct xfs_attri_log_format); - if (item->ri_buf[i].i_len != len) { + if (item->ri_buf[i].iov_len != len) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } - attri_formatp = item->ri_buf[i].i_addr; + attri_formatp = item->ri_buf[i].iov_base; if (!xfs_attri_validate(mp, attri_formatp)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, attri_formatp, len); @@ -1218,10 +1216,10 @@ xlog_recover_attrd_commit_pass2( { struct xfs_attrd_log_format *attrd_formatp; - attrd_formatp = item->ri_buf[0].i_addr; - if (item->ri_buf[0].i_len != sizeof(struct xfs_attrd_log_format)) { + attrd_formatp = item->ri_buf[0].iov_base; + if (item->ri_buf[0].iov_len != sizeof(struct xfs_attrd_log_format)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h index e74128cbb722..d108a11b55ae 100644 --- a/fs/xfs/xfs_attr_item.h +++ b/fs/xfs/xfs_attr_item.h @@ -12,10 +12,10 @@ struct xfs_mount; struct kmem_zone; struct xfs_attri_log_nameval { - struct xfs_log_iovec name; - struct xfs_log_iovec new_name; /* PPTR_REPLACE only */ - struct xfs_log_iovec value; - struct xfs_log_iovec new_value; /* PPTR_REPLACE only */ + struct kvec name; + struct kvec new_name; /* PPTR_REPLACE only */ + struct kvec value; + struct kvec new_value; /* PPTR_REPLACE only */ refcount_t refcount; /* name and value follow the end of this struct */ diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c index fe21c76f75b8..2a736d10eafb 100644 --- a/fs/xfs/xfs_bio_io.c +++ b/fs/xfs/xfs_bio_io.c @@ -18,42 +18,36 @@ xfs_rw_bdev( enum req_op op) { - unsigned int is_vmalloc = is_vmalloc_addr(data); - unsigned int left = count; + unsigned int done = 0, added; int error; struct bio *bio; - if (is_vmalloc && op == REQ_OP_WRITE) - flush_kernel_vmap_range(data, count); + op |= REQ_META | REQ_SYNC; + if (!is_vmalloc_addr(data)) + return bdev_rw_virt(bdev, sector, data, count, op); - bio = bio_alloc(bdev, bio_max_vecs(left), op | REQ_META | REQ_SYNC, - GFP_KERNEL); + bio = bio_alloc(bdev, bio_max_vecs(count), op, GFP_KERNEL); bio->bi_iter.bi_sector = sector; do { - struct page *page = kmem_to_page(data); - unsigned int off = offset_in_page(data); - unsigned int len = min_t(unsigned, left, PAGE_SIZE - off); - - while (bio_add_page(bio, page, len, off) != len) { + added = bio_add_vmalloc_chunk(bio, data + done, count - done); + if (!added) { struct bio *prev = bio; - bio = bio_alloc(prev->bi_bdev, bio_max_vecs(left), + bio = bio_alloc(prev->bi_bdev, + bio_max_vecs(count - done), prev->bi_opf, GFP_KERNEL); bio->bi_iter.bi_sector = bio_end_sector(prev); bio_chain(prev, bio); - submit_bio(prev); } - - data += len; - left -= len; - } while (left > 0); + done += added; + } while (done < count); error = submit_bio_wait(bio); bio_put(bio); - if (is_vmalloc && op == REQ_OP_READ) + if (op == REQ_OP_READ) invalidate_kernel_vmap_range(data, count); return error; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 3d52e9d7ad57..80f0c4bcc483 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -77,6 +77,11 @@ xfs_bui_item_size( *nbytes += xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents); } +unsigned int xfs_bui_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_bui_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given bui log item. We use only 1 iovec, and we point that @@ -168,6 +173,11 @@ xfs_bud_item_size( *nbytes += sizeof(struct xfs_bud_log_format); } +unsigned int xfs_bud_log_space(void) +{ + return xlog_item_space(1, sizeof(struct xfs_bud_log_format)); +} + /* * This is called to fill in the vector of log iovecs for the * given bud log item. We use only 1 iovec, and we point that @@ -644,24 +654,24 @@ xlog_recover_bui_commit_pass2( struct xfs_bui_log_format *bui_formatp; size_t len; - bui_formatp = item->ri_buf[0].i_addr; + bui_formatp = item->ri_buf[0].iov_base; - if (item->ri_buf[0].i_len < xfs_bui_log_format_sizeof(0)) { + if (item->ri_buf[0].iov_len < xfs_bui_log_format_sizeof(0)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } len = xfs_bui_log_format_sizeof(bui_formatp->bui_nextents); - if (item->ri_buf[0].i_len != len) { + if (item->ri_buf[0].iov_len != len) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } @@ -695,10 +705,10 @@ xlog_recover_bud_commit_pass2( { struct xfs_bud_log_format *bud_formatp; - bud_formatp = item->ri_buf[0].i_addr; - if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) { + bud_formatp = item->ri_buf[0].iov_base; + if (item->ri_buf[0].iov_len != sizeof(struct xfs_bud_log_format)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h index 6fee6a508343..b42fee06899d 100644 --- a/fs/xfs/xfs_bmap_item.h +++ b/fs/xfs/xfs_bmap_item.h @@ -72,4 +72,7 @@ struct xfs_bmap_intent; void xfs_bmap_defer_add(struct xfs_trans *tp, struct xfs_bmap_intent *bi); +unsigned int xfs_bui_log_space(unsigned int nr); +unsigned int xfs_bud_log_space(void); + #endif /* __XFS_BMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 06ca11731e43..2208a720ec3f 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -514,7 +514,7 @@ xfs_can_free_eofblocks( * Caller must either hold the exclusive io lock; or be inactivating * the inode, which guarantees there are no other users of the inode. */ - if (!(VFS_I(ip)->i_state & I_FREEING)) + if (!(inode_state_read_once(VFS_I(ip)) & I_FREEING)) xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); /* prealloc/delalloc exists only on regular files */ diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8e7f1b324b3b..47edf3041631 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -105,6 +105,7 @@ xfs_buf_free( { unsigned int size = BBTOB(bp->b_length); + might_sleep(); trace_xfs_buf_free(bp, _RET_IP_); ASSERT(list_empty(&bp->b_lru)); @@ -386,8 +387,6 @@ xfs_buf_map_verify( struct xfs_buftarg *btp, struct xfs_buf_map *map) { - xfs_daddr_t eofs; - /* Check for IOs smaller than the sector size / not sector aligned */ ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize)); ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); @@ -396,11 +395,10 @@ xfs_buf_map_verify( * Corrupted block numbers can get through to here, unfortunately, so we * have to check that the buffer falls within the filesystem bounds. */ - eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); - if (map->bm_bn < 0 || map->bm_bn >= eofs) { + if (map->bm_bn < 0 || map->bm_bn >= btp->bt_nr_sectors) { xfs_alert(btp->bt_mount, "%s: daddr 0x%llx out of range, EOFS 0x%llx", - __func__, map->bm_bn, eofs); + __func__, map->bm_bn, btp->bt_nr_sectors); WARN_ON(1); return -EFSCORRUPTED; } @@ -1298,7 +1296,7 @@ xfs_buf_bio_end_io( if (bio->bi_status) xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status)); else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) && - XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) + XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) xfs_buf_ioerror(bp, -EIO); if (bp->b_flags & XBF_ASYNC) { @@ -1332,45 +1330,18 @@ static void xfs_buf_submit_bio( struct xfs_buf *bp) { + unsigned int len = BBTOB(bp->b_length); + unsigned int nr_vecs = bio_add_max_vecs(bp->b_addr, len); unsigned int map = 0; struct blk_plug plug; struct bio *bio; - if (is_vmalloc_addr(bp->b_addr)) { - unsigned int size = BBTOB(bp->b_length); - unsigned int alloc_size = roundup(size, PAGE_SIZE); - void *data = bp->b_addr; - - bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT, - xfs_buf_bio_op(bp), GFP_NOIO); - - do { - unsigned int len = min(size, PAGE_SIZE); - - ASSERT(offset_in_page(data) == 0); - __bio_add_page(bio, vmalloc_to_page(data), len, 0); - data += len; - size -= len; - } while (size); - - flush_kernel_vmap_range(bp->b_addr, alloc_size); - } else { - /* - * Single folio or slab allocation. Must be contiguous and thus - * only a single bvec is needed. - * - * This uses the page based bio add helper for now as that is - * the lowest common denominator between folios and slab - * allocations. To be replaced with a better block layer - * helper soon (hopefully). - */ - bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp), - GFP_NOIO); - __bio_add_page(bio, virt_to_page(bp->b_addr), - BBTOB(bp->b_length), - offset_in_page(bp->b_addr)); - } - + bio = bio_alloc(bp->b_target->bt_bdev, nr_vecs, xfs_buf_bio_op(bp), + GFP_NOIO); + if (is_vmalloc_addr(bp->b_addr)) + bio_add_vmalloc(bio, bp->b_addr, len); + else + bio_add_virt_nofail(bio, bp->b_addr, len); bio->bi_private = bp; bio->bi_end_io = xfs_buf_bio_end_io; @@ -1709,26 +1680,67 @@ xfs_free_buftarg( fs_put_dax(btp->bt_daxdev, btp->bt_mount); /* the main block device is closed by kill_block_super */ if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) - bdev_fput(btp->bt_bdev_file); + bdev_fput(btp->bt_file); kfree(btp); } +/* + * Configure this buffer target for hardware-assisted atomic writes if the + * underlying block device supports is congruent with the filesystem geometry. + */ +static inline void +xfs_configure_buftarg_atomic_writes( + struct xfs_buftarg *btp) +{ + struct xfs_mount *mp = btp->bt_mount; + unsigned int min_bytes, max_bytes; + + min_bytes = bdev_atomic_write_unit_min_bytes(btp->bt_bdev); + max_bytes = bdev_atomic_write_unit_max_bytes(btp->bt_bdev); + + /* + * Ignore atomic write geometry that is nonsense or doesn't even cover + * a single fsblock. + */ + if (min_bytes > max_bytes || + min_bytes > mp->m_sb.sb_blocksize || + max_bytes < mp->m_sb.sb_blocksize) { + min_bytes = 0; + max_bytes = 0; + } + + btp->bt_awu_min = min_bytes; + btp->bt_awu_max = max_bytes; +} + +/* Configure a buffer target that abstracts a block device. */ int -xfs_setsize_buftarg( +xfs_configure_buftarg( struct xfs_buftarg *btp, - unsigned int sectorsize) + unsigned int sectorsize, + xfs_rfsblock_t nr_blocks) { - /* Set up metadata sector size info */ - btp->bt_meta_sectorsize = sectorsize; - btp->bt_meta_sectormask = sectorsize - 1; + struct xfs_mount *mp = btp->bt_mount; + + if (btp->bt_bdev) { + int error; + + error = bdev_validate_blocksize(btp->bt_bdev, sectorsize); + if (error) { + xfs_warn(mp, + "Cannot use blocksize %u on device %pg, err %d", + sectorsize, btp->bt_bdev, error); + return -EINVAL; + } - if (set_blocksize(btp->bt_bdev_file, sectorsize)) { - xfs_warn(btp->bt_mount, - "Cannot set_blocksize to %u on device %pg", - sectorsize, btp->bt_bdev); - return -EINVAL; + if (bdev_can_atomic_write(btp->bt_bdev)) + xfs_configure_buftarg_atomic_writes(btp); } + btp->bt_meta_sectorsize = sectorsize; + btp->bt_meta_sectormask = sectorsize - 1; + /* m_blkbb_log is not set up yet */ + btp->bt_nr_sectors = nr_blocks << (mp->m_sb.sb_blocklog - BBSHIFT); return 0; } @@ -1738,6 +1750,9 @@ xfs_init_buftarg( size_t logical_sectorsize, const char *descr) { + /* The maximum size of the buftarg is only known once the sb is read. */ + btp->bt_nr_sectors = XFS_BUF_DADDR_MAX; + /* Set up device logical sector size mask */ btp->bt_logical_sectorsize = logical_sectorsize; btp->bt_logical_sectormask = logical_sectorsize - 1; @@ -1778,6 +1793,8 @@ xfs_alloc_buftarg( { struct xfs_buftarg *btp; const struct dax_holder_operations *ops = NULL; + int error; + #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE) ops = &xfs_dax_holder_operations; @@ -1785,34 +1802,37 @@ xfs_alloc_buftarg( btp = kzalloc(sizeof(*btp), GFP_KERNEL | __GFP_NOFAIL); btp->bt_mount = mp; - btp->bt_bdev_file = bdev_file; + btp->bt_file = bdev_file; btp->bt_bdev = file_bdev(bdev_file); btp->bt_dev = btp->bt_bdev->bd_dev; btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, mp, ops); - if (bdev_can_atomic_write(btp->bt_bdev)) { - btp->bt_bdev_awu_min = bdev_atomic_write_unit_min_bytes( - btp->bt_bdev); - btp->bt_bdev_awu_max = bdev_atomic_write_unit_max_bytes( - btp->bt_bdev); - } + /* + * Flush and invalidate all devices' pagecaches before reading any + * metadata because XFS doesn't use the bdev pagecache. + */ + error = sync_blockdev(btp->bt_bdev); + if (error) + goto error_free; /* * When allocating the buftargs we have not yet read the super block and * thus don't know the file system sector size yet. */ - if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev))) - goto error_free; - if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev), - mp->m_super->s_id)) + btp->bt_meta_sectorsize = bdev_logical_block_size(btp->bt_bdev); + btp->bt_meta_sectormask = btp->bt_meta_sectorsize - 1; + + error = xfs_init_buftarg(btp, btp->bt_meta_sectorsize, + mp->m_super->s_id); + if (error) goto error_free; return btp; error_free: kfree(btp); - return NULL; + return ERR_PTR(error); } static inline void @@ -2061,44 +2081,6 @@ xfs_buf_delwri_submit( return error; } -/* - * Push a single buffer on a delwri queue. - * - * The purpose of this function is to submit a single buffer of a delwri queue - * and return with the buffer still on the original queue. - * - * The buffer locking and queue management logic between _delwri_pushbuf() and - * _delwri_queue() guarantee that the buffer cannot be queued to another list - * before returning. - */ -int -xfs_buf_delwri_pushbuf( - struct xfs_buf *bp, - struct list_head *buffer_list) -{ - int error; - - ASSERT(bp->b_flags & _XBF_DELWRI_Q); - - trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_); - - xfs_buf_lock(bp); - bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC); - bp->b_flags |= XBF_WRITE; - xfs_buf_submit(bp); - - /* - * The buffer is now locked, under I/O but still on the original delwri - * queue. Wait for I/O completion, restore the DELWRI_Q flag and - * return with the buffer unlocked and still on the original queue. - */ - error = xfs_buf_iowait(bp); - bp->b_flags |= _XBF_DELWRI_Q; - xfs_buf_unlock(bp); - - return error; -} - void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) { /* @@ -2106,7 +2088,7 @@ void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) * This allows userspace to disrupt buffer caching for debug/testing * purposes. */ - if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF)) + if (XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_LRU_REF)) lru_ref = 0; atomic_set(&bp->b_lru_ref, lru_ref); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index d0b065a9a9f0..e25cd2a160f3 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -22,6 +22,7 @@ extern struct kmem_cache *xfs_buf_cache; */ struct xfs_buf; +#define XFS_BUF_DADDR_MAX ((xfs_daddr_t) S64_MAX) #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) #define XBF_READ (1u << 0) /* buffer intended for reading from device */ @@ -94,7 +95,6 @@ void xfs_buf_cache_destroy(struct xfs_buf_cache *bch); */ struct xfs_buftarg { dev_t bt_dev; - struct file *bt_bdev_file; struct block_device *bt_bdev; struct dax_device *bt_daxdev; struct file *bt_file; @@ -104,6 +104,7 @@ struct xfs_buftarg { size_t bt_meta_sectormask; size_t bt_logical_sectorsize; size_t bt_logical_sectormask; + xfs_daddr_t bt_nr_sectors; /* LRU control structures */ struct shrinker *bt_shrinker; @@ -112,9 +113,9 @@ struct xfs_buftarg { struct percpu_counter bt_readahead_count; struct ratelimit_state bt_ioerror_rl; - /* Atomic write unit values */ - unsigned int bt_bdev_awu_min; - unsigned int bt_bdev_awu_max; + /* Hardware atomic write unit values, bytes */ + unsigned int bt_awu_min; + unsigned int bt_awu_max; /* built-in cache, if we're not using the perag one */ struct xfs_buf_cache bt_cache[]; @@ -326,7 +327,6 @@ extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); void xfs_buf_delwri_queue_here(struct xfs_buf *bp, struct list_head *bl); extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); -extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); static inline xfs_daddr_t xfs_buf_daddr(struct xfs_buf *bp) { @@ -374,9 +374,9 @@ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); -extern int xfs_setsize_buftarg(struct xfs_buftarg *, unsigned int); +int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize, + xfs_fsblock_t nr_blocks); -#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 19eb0b7a3e58..f4c5be67826e 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -32,19 +32,74 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) return container_of(lip, struct xfs_buf_log_item, bli_item); } +static void +xfs_buf_item_get_format( + struct xfs_buf_log_item *bip, + int count) +{ + ASSERT(bip->bli_formats == NULL); + bip->bli_format_count = count; + + if (count == 1) { + bip->bli_formats = &bip->__bli_format; + return; + } + + bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format), + GFP_KERNEL | __GFP_NOFAIL); +} + +static void +xfs_buf_item_free_format( + struct xfs_buf_log_item *bip) +{ + if (bip->bli_formats != &bip->__bli_format) { + kfree(bip->bli_formats); + bip->bli_formats = NULL; + } +} + +static void +xfs_buf_item_free( + struct xfs_buf_log_item *bip) +{ + xfs_buf_item_free_format(bip); + kvfree(bip->bli_item.li_lv_shadow); + kmem_cache_free(xfs_buf_item_cache, bip); +} + +/* + * xfs_buf_item_relse() is called when the buf log item is no longer needed. + */ +static void +xfs_buf_item_relse( + struct xfs_buf_log_item *bip) +{ + struct xfs_buf *bp = bip->bli_buf; + + trace_xfs_buf_item_relse(bp, _RET_IP_); + + ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); + ASSERT(atomic_read(&bip->bli_refcount) == 0); + + bp->b_log_item = NULL; + xfs_buf_rele(bp); + xfs_buf_item_free(bip); +} + /* Is this log iovec plausibly large enough to contain the buffer log format? */ bool xfs_buf_log_check_iovec( - struct xfs_log_iovec *iovec) + struct kvec *iovec) { - struct xfs_buf_log_format *blfp = iovec->i_addr; + struct xfs_buf_log_format *blfp = iovec->iov_base; char *bmp_end; char *item_end; - if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len) + if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->iov_len) return false; - item_end = (char *)iovec->i_addr + iovec->i_len; + item_end = (char *)iovec->iov_base + iovec->iov_len; bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size]; return bmp_end <= item_end; } @@ -104,6 +159,25 @@ xfs_buf_item_size_segment( } /* + * Compute the worst case log item overhead for an invalidated buffer with the + * given map count and block size. + */ +unsigned int +xfs_buf_inval_log_space( + unsigned int map_count, + unsigned int blocksize) +{ + unsigned int chunks = DIV_ROUND_UP(blocksize, XFS_BLF_CHUNK); + unsigned int bitmap_size = DIV_ROUND_UP(chunks, NBWORD); + unsigned int ret = + offsetof(struct xfs_buf_log_format, blf_data_map) + + (bitmap_size * sizeof_field(struct xfs_buf_log_format, + blf_data_map[0])); + + return ret * map_count; +} + +/* * Return the number of log iovecs and space needed to log the given buf log * item. * @@ -371,6 +445,42 @@ xfs_buf_item_pin( } /* + * For a stale BLI, process all the necessary completions that must be + * performed when the final BLI reference goes away. The buffer will be + * referenced and locked here - we return to the caller with the buffer still + * referenced and locked for them to finalise processing of the buffer. + */ +static void +xfs_buf_item_finish_stale( + struct xfs_buf_log_item *bip) +{ + struct xfs_buf *bp = bip->bli_buf; + struct xfs_log_item *lip = &bip->bli_item; + + ASSERT(bip->bli_flags & XFS_BLI_STALE); + ASSERT(xfs_buf_islocked(bp)); + ASSERT(bp->b_flags & XBF_STALE); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(list_empty(&lip->li_trans)); + ASSERT(!bp->b_transp); + + if (bip->bli_flags & XFS_BLI_STALE_INODE) { + xfs_buf_item_done(bp); + xfs_buf_inode_iodone(bp); + ASSERT(list_empty(&bp->b_li_list)); + return; + } + + /* + * We may or may not be on the AIL here, xfs_trans_ail_delete() will do + * the right thing regardless of the situation in which we are called. + */ + xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR); + xfs_buf_item_relse(bip); + ASSERT(bp->b_log_item == NULL); +} + +/* * This is called to unpin the buffer associated with the buf log item which was * previously pinned with a call to xfs_buf_item_pin(). We enter this function * with a buffer pin count, a buffer reference and a BLI reference. @@ -419,13 +529,6 @@ xfs_buf_item_unpin( } if (stale) { - ASSERT(bip->bli_flags & XFS_BLI_STALE); - ASSERT(xfs_buf_islocked(bp)); - ASSERT(bp->b_flags & XBF_STALE); - ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); - ASSERT(list_empty(&lip->li_trans)); - ASSERT(!bp->b_transp); - trace_xfs_buf_item_unpin_stale(bip); /* @@ -436,22 +539,7 @@ xfs_buf_item_unpin( * processing is complete. */ xfs_buf_rele(bp); - - /* - * If we get called here because of an IO error, we may or may - * not have the item on the AIL. xfs_trans_ail_delete() will - * take care of that situation. xfs_trans_ail_delete() drops - * the AIL lock. - */ - if (bip->bli_flags & XFS_BLI_STALE_INODE) { - xfs_buf_item_done(bp); - xfs_buf_inode_iodone(bp); - ASSERT(list_empty(&bp->b_li_list)); - } else { - xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR); - xfs_buf_item_relse(bp); - ASSERT(bp->b_log_item == NULL); - } + xfs_buf_item_finish_stale(bip); xfs_buf_relse(bp); return; } @@ -524,43 +612,42 @@ xfs_buf_item_push( * Drop the buffer log item refcount and take appropriate action. This helper * determines whether the bli must be freed or not, since a decrement to zero * does not necessarily mean the bli is unused. - * - * Return true if the bli is freed, false otherwise. */ -bool +void xfs_buf_item_put( struct xfs_buf_log_item *bip) { - struct xfs_log_item *lip = &bip->bli_item; - bool aborted; - bool dirty; + + ASSERT(xfs_buf_islocked(bip->bli_buf)); /* drop the bli ref and return if it wasn't the last one */ if (!atomic_dec_and_test(&bip->bli_refcount)) - return false; + return; - /* - * We dropped the last ref and must free the item if clean or aborted. - * If the bli is dirty and non-aborted, the buffer was clean in the - * transaction but still awaiting writeback from previous changes. In - * that case, the bli is freed on buffer writeback completion. - */ - aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) || - xlog_is_shutdown(lip->li_log); - dirty = bip->bli_flags & XFS_BLI_DIRTY; - if (dirty && !aborted) - return false; + /* If the BLI is in the AIL, then it is still dirty and in use */ + if (test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)) { + ASSERT(bip->bli_flags & XFS_BLI_DIRTY); + return; + } /* - * The bli is aborted or clean. An aborted item may be in the AIL - * regardless of dirty state. For example, consider an aborted - * transaction that invalidated a dirty bli and cleared the dirty - * state. + * In shutdown conditions, we can be asked to free a dirty BLI that + * isn't in the AIL. This can occur due to a checkpoint aborting a BLI + * instead of inserting it into the AIL at checkpoint IO completion. If + * there's another bli reference (e.g. a btree cursor holds a clean + * reference) and it is released via xfs_trans_brelse(), we can get here + * with that aborted, dirty BLI. In this case, it is safe to free the + * dirty BLI immediately, as it is not in the AIL and there are no + * other references to it. + * + * We should never get here with a stale BLI via that path as + * xfs_trans_brelse() specifically holds onto stale buffers rather than + * releasing them. */ - if (aborted) - xfs_trans_ail_delete(lip, 0); - xfs_buf_item_relse(bip->bli_buf); - return true; + ASSERT(!(bip->bli_flags & XFS_BLI_DIRTY) || + test_bit(XFS_LI_ABORTED, &bip->bli_item.li_flags)); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + xfs_buf_item_relse(bip); } /* @@ -581,6 +668,15 @@ xfs_buf_item_put( * if necessary but do not unlock the buffer. This is for support of * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't * free the item. + * + * If the XFS_BLI_STALE flag is set, the last reference to the BLI *must* + * perform a completion abort of any objects attached to the buffer for IO + * tracking purposes. This generally only happens in shutdown situations, + * normally xfs_buf_item_unpin() will drop the last BLI reference and perform + * completion processing. However, because transaction completion can race with + * checkpoint completion during a shutdown, this release context may end up + * being the last active reference to the BLI and so needs to perform this + * cleanup. */ STATIC void xfs_buf_item_release( @@ -588,18 +684,19 @@ xfs_buf_item_release( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - bool released; bool hold = bip->bli_flags & XFS_BLI_HOLD; bool stale = bip->bli_flags & XFS_BLI_STALE; -#if defined(DEBUG) || defined(XFS_WARN) - bool ordered = bip->bli_flags & XFS_BLI_ORDERED; - bool dirty = bip->bli_flags & XFS_BLI_DIRTY; bool aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags); + bool dirty = bip->bli_flags & XFS_BLI_DIRTY; +#if defined(DEBUG) || defined(XFS_WARN) + bool ordered = bip->bli_flags & XFS_BLI_ORDERED; #endif trace_xfs_buf_item_release(bip); + ASSERT(xfs_buf_islocked(bp)); + /* * The bli dirty state should match whether the blf has logged segments * except for ordered buffers, where only the bli should be dirty. @@ -615,16 +712,56 @@ xfs_buf_item_release( bp->b_transp = NULL; bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); + /* If there are other references, then we have nothing to do. */ + if (!atomic_dec_and_test(&bip->bli_refcount)) + goto out_release; + + /* + * Stale buffer completion frees the BLI, unlocks and releases the + * buffer. Neither the BLI or buffer are safe to reference after this + * call, so there's nothing more we need to do here. + * + * If we get here with a stale buffer and references to the BLI remain, + * we must not unlock the buffer as the last BLI reference owns lock + * context, not us. + */ + if (stale) { + xfs_buf_item_finish_stale(bip); + xfs_buf_relse(bp); + ASSERT(!hold); + return; + } + + /* + * Dirty or clean, aborted items are done and need to be removed from + * the AIL and released. This frees the BLI, but leaves the buffer + * locked and referenced. + */ + if (aborted || xlog_is_shutdown(lip->li_log)) { + ASSERT(list_empty(&bip->bli_buf->b_li_list)); + xfs_buf_item_done(bp); + goto out_release; + } + + /* + * Clean, unreferenced BLIs can be immediately freed, leaving the buffer + * locked and referenced. + * + * Dirty, unreferenced BLIs *must* be in the AIL awaiting writeback. + */ + if (!dirty) + xfs_buf_item_relse(bip); + else + ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags)); + + /* Not safe to reference the BLI from here */ +out_release: /* - * Unref the item and unlock the buffer unless held or stale. Stale - * buffers remain locked until final unpin unless the bli is freed by - * the unref call. The latter implies shutdown because buffer - * invalidation dirties the bli and transaction. + * If we get here with a stale buffer, we must not unlock the + * buffer as the last BLI reference owns lock context, not us. */ - released = xfs_buf_item_put(bip); - if (hold || (stale && !released)) + if (stale || hold) return; - ASSERT(!stale || aborted); xfs_buf_relse(bp); } @@ -710,33 +847,6 @@ static const struct xfs_item_ops xfs_buf_item_ops = { .iop_push = xfs_buf_item_push, }; -STATIC void -xfs_buf_item_get_format( - struct xfs_buf_log_item *bip, - int count) -{ - ASSERT(bip->bli_formats == NULL); - bip->bli_format_count = count; - - if (count == 1) { - bip->bli_formats = &bip->__bli_format; - return; - } - - bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format), - GFP_KERNEL | __GFP_NOFAIL); -} - -STATIC void -xfs_buf_item_free_format( - struct xfs_buf_log_item *bip) -{ - if (bip->bli_formats != &bip->__bli_format) { - kfree(bip->bli_formats); - bip->bli_formats = NULL; - } -} - /* * Allocate a new buf log item to go with the given buffer. * Set the buffer's b_log_item field to point to the new @@ -786,6 +896,7 @@ xfs_buf_item_init( map_size = DIV_ROUND_UP(chunks, NBWORD); if (map_size > XFS_BLF_DATAMAP_SIZE) { + xfs_buf_item_free_format(bip); kmem_cache_free(xfs_buf_item_cache, bip); xfs_err(mp, "buffer item dirty bitmap (%u uints) too small to reflect %u bytes!", @@ -957,34 +1068,6 @@ xfs_buf_item_dirty_format( return false; } -STATIC void -xfs_buf_item_free( - struct xfs_buf_log_item *bip) -{ - xfs_buf_item_free_format(bip); - kvfree(bip->bli_item.li_lv_shadow); - kmem_cache_free(xfs_buf_item_cache, bip); -} - -/* - * xfs_buf_item_relse() is called when the buf log item is no longer needed. - */ -void -xfs_buf_item_relse( - struct xfs_buf *bp) -{ - struct xfs_buf_log_item *bip = bp->b_log_item; - - trace_xfs_buf_item_relse(bp, _RET_IP_); - ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); - - if (atomic_read(&bip->bli_refcount)) - return; - bp->b_log_item = NULL; - xfs_buf_rele(bp); - xfs_buf_item_free(bip); -} - void xfs_buf_item_done( struct xfs_buf *bp) @@ -1004,5 +1087,5 @@ xfs_buf_item_done( xfs_trans_ail_delete(&bp->b_log_item->bli_item, (bp->b_flags & _XBF_LOGRECOVERY) ? 0 : SHUTDOWN_CORRUPT_INCORE); - xfs_buf_item_relse(bp); + xfs_buf_item_relse(bp->b_log_item); } diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 8cde85259a58..3159325dd17b 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -49,8 +49,7 @@ struct xfs_buf_log_item { int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_done(struct xfs_buf *bp); -void xfs_buf_item_relse(struct xfs_buf *); -bool xfs_buf_item_put(struct xfs_buf_log_item *); +void xfs_buf_item_put(struct xfs_buf_log_item *bip); void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint); bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_inode_iodone(struct xfs_buf *); @@ -62,7 +61,10 @@ static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp) } #endif /* CONFIG_XFS_QUOTA */ void xfs_buf_iodone(struct xfs_buf *); -bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); +bool xfs_buf_log_check_iovec(struct kvec *iovec); + +unsigned int xfs_buf_inval_log_space(unsigned int map_count, + unsigned int blocksize); extern struct kmem_cache *xfs_buf_item_cache; diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index d4c5cef5bc43..e4c8af873632 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -159,7 +159,7 @@ STATIC enum xlog_recover_reorder xlog_recover_buf_reorder( struct xlog_recover_item *item) { - struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + struct xfs_buf_log_format *buf_f = item->ri_buf[0].iov_base; if (buf_f->blf_flags & XFS_BLF_CANCEL) return XLOG_REORDER_CANCEL_LIST; @@ -173,7 +173,7 @@ xlog_recover_buf_ra_pass2( struct xlog *log, struct xlog_recover_item *item) { - struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + struct xfs_buf_log_format *buf_f = item->ri_buf[0].iov_base; xlog_buf_readahead(log, buf_f->blf_blkno, buf_f->blf_len, NULL); } @@ -187,11 +187,11 @@ xlog_recover_buf_commit_pass1( struct xlog *log, struct xlog_recover_item *item) { - struct xfs_buf_log_format *bf = item->ri_buf[0].i_addr; + struct xfs_buf_log_format *bf = item->ri_buf[0].iov_base; if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) { - xfs_err(log->l_mp, "bad buffer log item size (%d)", - item->ri_buf[0].i_len); + xfs_err(log->l_mp, "bad buffer log item size (%zd)", + item->ri_buf[0].iov_len); return -EFSCORRUPTED; } @@ -487,8 +487,8 @@ xlog_recover_do_reg_buffer( nbits = xfs_contig_bits(buf_f->blf_data_map, buf_f->blf_map_size, bit); ASSERT(nbits > 0); - ASSERT(item->ri_buf[i].i_addr != NULL); - ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); + ASSERT(item->ri_buf[i].iov_base != NULL); + ASSERT(item->ri_buf[i].iov_len % XFS_BLF_CHUNK == 0); ASSERT(BBTOB(bp->b_length) >= ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); @@ -500,8 +500,8 @@ xlog_recover_do_reg_buffer( * the log. Hence we need to trim nbits back to the length of * the current region being copied out of the log. */ - if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) - nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; + if (item->ri_buf[i].iov_len < (nbits << XFS_BLF_SHIFT)) + nbits = item->ri_buf[i].iov_len >> XFS_BLF_SHIFT; /* * Do a sanity check if this is a dquot buffer. Just checking @@ -511,18 +511,18 @@ xlog_recover_do_reg_buffer( fa = NULL; if (buf_f->blf_flags & (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { - if (item->ri_buf[i].i_addr == NULL) { + if (item->ri_buf[i].iov_base == NULL) { xfs_alert(mp, "XFS: NULL dquot in %s.", __func__); goto next; } - if (item->ri_buf[i].i_len < size_disk_dquot) { + if (item->ri_buf[i].iov_len < size_disk_dquot) { xfs_alert(mp, - "XFS: dquot too small (%d) in %s.", - item->ri_buf[i].i_len, __func__); + "XFS: dquot too small (%zd) in %s.", + item->ri_buf[i].iov_len, __func__); goto next; } - fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, -1); + fa = xfs_dquot_verify(mp, item->ri_buf[i].iov_base, -1); if (fa) { xfs_alert(mp, "dquot corrupt at %pS trying to replay into block 0x%llx", @@ -533,7 +533,7 @@ xlog_recover_do_reg_buffer( memcpy(xfs_buf_offset(bp, (uint)bit << XFS_BLF_SHIFT), /* dest */ - item->ri_buf[i].i_addr, /* source */ + item->ri_buf[i].iov_base, /* source */ nbits<<XFS_BLF_SHIFT); /* length */ next: i++; @@ -669,8 +669,8 @@ xlog_recover_do_inode_buffer( if (next_unlinked_offset < reg_buf_offset) continue; - ASSERT(item->ri_buf[item_index].i_addr != NULL); - ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); + ASSERT(item->ri_buf[item_index].iov_base != NULL); + ASSERT((item->ri_buf[item_index].iov_len % XFS_BLF_CHUNK) == 0); ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length)); /* @@ -678,7 +678,7 @@ xlog_recover_do_inode_buffer( * current di_next_unlinked field. Extract its value * and copy it to the buffer copy. */ - logged_nextp = item->ri_buf[item_index].i_addr + + logged_nextp = item->ri_buf[item_index].iov_base + next_unlinked_offset - reg_buf_offset; if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) { xfs_alert(mp, @@ -736,6 +736,16 @@ xlog_recover_do_primary_sb_buffer( */ xfs_sb_from_disk(&mp->m_sb, dsb); + /* + * Grow can change the device size. Mirror that into the buftarg. + */ + mp->m_ddev_targp->bt_nr_sectors = + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); + if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp) { + mp->m_rtdev_targp->bt_nr_sectors = + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); + } + if (mp->m_sb.sb_agcount < orig_agcount) { xfs_alert(mp, "Shrinking AG count in log recovery not supported"); return -EFSCORRUPTED; @@ -1002,7 +1012,7 @@ xlog_recover_buf_commit_pass2( struct xlog_recover_item *item, xfs_lsn_t current_lsn) { - struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + struct xfs_buf_log_format *buf_f = item->ri_buf[0].iov_base; struct xfs_mount *mp = log->l_mp; struct xfs_buf *bp; int error; diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c index b4ffd80b7cb6..dcbfa274e06d 100644 --- a/fs/xfs/xfs_buf_mem.c +++ b/fs/xfs/xfs_buf_mem.c @@ -165,7 +165,7 @@ xmbuf_map_backing_mem( folio_set_dirty(folio); folio_unlock(folio); - bp->b_addr = folio_address(folio); + bp->b_addr = folio_address(folio) + offset_in_folio(folio, pos); return 0; } diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index c1a306268ae4..b6ffe4807a11 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -103,30 +103,12 @@ xfs_discard_endio( bio_put(bio); } -static inline struct block_device * -xfs_group_bdev( - const struct xfs_group *xg) -{ - struct xfs_mount *mp = xg->xg_mount; - - switch (xg->xg_type) { - case XG_TYPE_AG: - return mp->m_ddev_targp->bt_bdev; - case XG_TYPE_RTG: - return mp->m_rtdev_targp->bt_bdev; - default: - ASSERT(0); - break; - } - return NULL; -} - /* * Walk the discard list and issue discards on all the busy extents in the * list. We plug and chain the bios so that we only need a single completion * call to clear all the busy extents once the discards are complete. */ -int +void xfs_discard_extents( struct xfs_mount *mp, struct xfs_busy_extents *extents) @@ -134,25 +116,19 @@ xfs_discard_extents( struct xfs_extent_busy *busyp; struct bio *bio = NULL; struct blk_plug plug; - int error = 0; blk_start_plug(&plug); list_for_each_entry(busyp, &extents->extent_list, list) { - trace_xfs_discard_extent(busyp->group, busyp->bno, - busyp->length); + struct xfs_group *xg = busyp->group; + struct xfs_buftarg *btp = + xfs_group_type_buftarg(xg->xg_mount, xg->xg_type); - error = __blkdev_issue_discard(xfs_group_bdev(busyp->group), - xfs_gbno_to_daddr(busyp->group, busyp->bno), + trace_xfs_discard_extent(xg, busyp->bno, busyp->length); + + __blkdev_issue_discard(btp->bt_bdev, + xfs_gbno_to_daddr(xg, busyp->bno), XFS_FSB_TO_BB(mp, busyp->length), GFP_KERNEL, &bio); - if (error && error != -EOPNOTSUPP) { - xfs_info(mp, - "discard failed for extent [0x%llx,%u], error %d", - (unsigned long long)busyp->bno, - busyp->length, - error); - break; - } } if (bio) { @@ -163,10 +139,16 @@ xfs_discard_extents( xfs_discard_endio_work(&extents->endio_work); } blk_finish_plug(&plug); - - return error; } +/* + * Care must be taken setting up the trim cursor as the perags may not have been + * initialised when the cursor is initialised. e.g. a clean mount which hasn't + * read in AGFs and the first operation run on the mounted fs is a trim. This + * can result in perag fields that aren't initialised until + * xfs_trim_gather_extents() calls xfs_alloc_read_agf() to lock down the AG for + * the free space search. + */ struct xfs_trim_cur { xfs_agblock_t start; xfs_extlen_t count; @@ -196,14 +178,20 @@ xfs_trim_gather_extents( */ xfs_log_force(mp, XFS_LOG_SYNC); - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - return error; + tp = xfs_trans_alloc_empty(mp); error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) goto out_trans_cancel; + /* + * First time through tcur->count will not have been initialised as + * pag->pagf_longest is not guaranteed to be valid before we read + * the AGF buffer above. + */ + if (!tcur->count) + tcur->count = pag->pagf_longest; + if (tcur->by_bno) { /* sub-AG discard request always starts at tcur->start */ cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag); @@ -350,7 +338,6 @@ xfs_trim_perag_extents( { struct xfs_trim_cur tcur = { .start = start, - .count = pag->pagf_longest, .end = end, .minlen = minlen, }; @@ -387,9 +374,7 @@ xfs_trim_perag_extents( * list after this function call, as it may have been freed by * the time control returns to us. */ - error = xfs_discard_extents(pag_mount(pag), extents); - if (error) - break; + xfs_discard_extents(pag_mount(pag), extents); if (xfs_trim_should_stop()) break; @@ -498,12 +483,10 @@ xfs_discard_rtdev_extents( trace_xfs_discard_rtextent(mp, busyp->bno, busyp->length); - error = __blkdev_issue_discard(bdev, + __blkdev_issue_discard(bdev, xfs_rtb_to_daddr(mp, busyp->bno), XFS_FSB_TO_BB(mp, busyp->length), GFP_NOFS, &bio); - if (error) - break; } xfs_discard_free_rtdev_extents(tr); @@ -583,9 +566,7 @@ xfs_trim_rtextents( struct xfs_trans *tp; int error; - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - return error; + tp = xfs_trans_alloc_empty(mp); /* * Walk the free ranges between low and high. The query_range function @@ -701,9 +682,7 @@ xfs_trim_rtgroup_extents( struct xfs_trans *tp; int error; - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - return error; + tp = xfs_trans_alloc_empty(mp); /* * Walk the free ranges between low and high. The query_range function @@ -732,8 +711,10 @@ xfs_trim_rtgroup_extents( break; } - if (!tr.queued) + if (!tr.queued) { + kfree(tr.extents); break; + } /* * We hand the extent list to the discard function here so the @@ -745,9 +726,7 @@ xfs_trim_rtgroup_extents( * list after this function call, as it may have been freed by * the time control returns to us. */ - error = xfs_discard_extents(rtg_mount(rtg), tr.extents); - if (error) - break; + xfs_discard_extents(rtg_mount(rtg), tr.extents); low = tr.restart_rtx; } while (!xfs_trim_should_stop() && low <= high); diff --git a/fs/xfs/xfs_discard.h b/fs/xfs/xfs_discard.h index 2b1a85223a56..8c5cc4af6a07 100644 --- a/fs/xfs/xfs_discard.h +++ b/fs/xfs/xfs_discard.h @@ -6,7 +6,7 @@ struct fstrim_range; struct xfs_mount; struct xfs_busy_extents; -int xfs_discard_extents(struct xfs_mount *mp, struct xfs_busy_extents *busy); +void xfs_discard_extents(struct xfs_mount *mp, struct xfs_busy_extents *busy); int xfs_ioc_trim(struct xfs_mount *mp, struct fstrim_range __user *fstrim); #endif /* XFS_DISCARD_H */ diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index edbc521870a1..612ca682a513 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -31,7 +31,7 @@ * * ip->i_lock * qi->qi_tree_lock - * dquot->q_qlock (xfs_dqlock() and friends) + * dquot->q_qlock * dquot->q_flush (xfs_dqflock() and friends) * qi->qi_lru_lock * @@ -801,10 +801,11 @@ xfs_dq_get_next_id( static struct xfs_dquot * xfs_qm_dqget_cache_lookup( struct xfs_mount *mp, - struct xfs_quotainfo *qi, - struct radix_tree_root *tree, - xfs_dqid_t id) + xfs_dqid_t id, + xfs_dqtype_t type) { + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; restart: @@ -816,16 +817,12 @@ restart: return NULL; } - xfs_dqlock(dqp); - if (dqp->q_flags & XFS_DQFLAG_FREEING) { - xfs_dqunlock(dqp); + if (!lockref_get_not_dead(&dqp->q_lockref)) { mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_freeing(dqp); delay(1); goto restart; } - - dqp->q_nrefs++; mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_hit(dqp); @@ -836,8 +833,7 @@ restart: /* * Try to insert a new dquot into the in-core cache. If an error occurs the * caller should throw away the dquot and start over. Otherwise, the dquot - * is returned locked (and held by the cache) as if there had been a cache - * hit. + * is returned (and held by the cache) as if there had been a cache hit. * * The insert needs to be done under memalloc_nofs context because the radix * tree can do memory allocation during insert. The qi->qi_tree_lock is taken in @@ -848,11 +844,12 @@ restart: static int xfs_qm_dqget_cache_insert( struct xfs_mount *mp, - struct xfs_quotainfo *qi, - struct radix_tree_root *tree, xfs_dqid_t id, + xfs_dqtype_t type, struct xfs_dquot *dqp) { + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); unsigned int nofs_flags; int error; @@ -860,14 +857,11 @@ xfs_qm_dqget_cache_insert( mutex_lock(&qi->qi_tree_lock); error = radix_tree_insert(tree, id, dqp); if (unlikely(error)) { - /* Duplicate found! Caller must try again. */ trace_xfs_dqget_dup(dqp); goto out_unlock; } - /* Return a locked dquot to the caller, with a reference taken. */ - xfs_dqlock(dqp); - dqp->q_nrefs = 1; + lockref_init(&dqp->q_lockref); qi->qi_dquots++; out_unlock: @@ -903,7 +897,7 @@ xfs_qm_dqget_checks( /* * Given the file system, id, and type (UDQUOT/GDQUOT/PDQUOT), return a - * locked dquot, doing an allocation (if requested) as needed. + * dquot, doing an allocation (if requested) as needed. */ int xfs_qm_dqget( @@ -913,8 +907,6 @@ xfs_qm_dqget( bool can_alloc, struct xfs_dquot **O_dqpp) { - struct xfs_quotainfo *qi = mp->m_quotainfo; - struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; int error; @@ -923,28 +915,30 @@ xfs_qm_dqget( return error; restart: - dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id); - if (dqp) { - *O_dqpp = dqp; - return 0; - } + dqp = xfs_qm_dqget_cache_lookup(mp, id, type); + if (dqp) + goto found; error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp); if (error) return error; - error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp); + error = xfs_qm_dqget_cache_insert(mp, id, type, dqp); if (error) { - /* - * Duplicate found. Just throw away the new dquot and start - * over. - */ xfs_qm_dqdestroy(dqp); - XFS_STATS_INC(mp, xs_qm_dquot_dups); - goto restart; + if (error == -EEXIST) { + /* + * Duplicate found. Just throw away the new dquot and + * start over. + */ + XFS_STATS_INC(mp, xs_qm_dquot_dups); + goto restart; + } + return error; } trace_xfs_dqget_miss(dqp); +found: *O_dqpp = dqp; return 0; } @@ -999,15 +993,16 @@ xfs_qm_dqget_inode( struct xfs_inode *ip, xfs_dqtype_t type, bool can_alloc, - struct xfs_dquot **O_dqpp) + struct xfs_dquot **dqpp) { struct xfs_mount *mp = ip->i_mount; - struct xfs_quotainfo *qi = mp->m_quotainfo; - struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; xfs_dqid_t id; int error; + ASSERT(!*dqpp); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); + error = xfs_qm_dqget_checks(mp, type); if (error) return error; @@ -1019,11 +1014,9 @@ xfs_qm_dqget_inode( id = xfs_qm_id_for_quotatype(ip, type); restart: - dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id); - if (dqp) { - *O_dqpp = dqp; - return 0; - } + dqp = xfs_qm_dqget_cache_lookup(mp, id, type); + if (dqp) + goto found; /* * Dquot cache miss. We don't want to keep the inode lock across @@ -1049,7 +1042,6 @@ restart: if (dqp1) { xfs_qm_dqdestroy(dqp); dqp = dqp1; - xfs_dqlock(dqp); goto dqret; } } else { @@ -1058,21 +1050,26 @@ restart: return -ESRCH; } - error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp); + error = xfs_qm_dqget_cache_insert(mp, id, type, dqp); if (error) { - /* - * Duplicate found. Just throw away the new dquot and start - * over. - */ xfs_qm_dqdestroy(dqp); - XFS_STATS_INC(mp, xs_qm_dquot_dups); - goto restart; + if (error == -EEXIST) { + /* + * Duplicate found. Just throw away the new dquot and + * start over. + */ + XFS_STATS_INC(mp, xs_qm_dquot_dups); + goto restart; + } + return error; } dqret: xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); trace_xfs_dqget_miss(dqp); - *O_dqpp = dqp; +found: + trace_xfs_dqattach_get(dqp); + *dqpp = dqp; return 0; } @@ -1098,63 +1095,41 @@ xfs_qm_dqget_next( else if (error != 0) break; + mutex_lock(&dqp->q_qlock); if (!XFS_IS_DQUOT_UNINITIALIZED(dqp)) { *dqpp = dqp; return 0; } - xfs_qm_dqput(dqp); + mutex_unlock(&dqp->q_qlock); + xfs_qm_dqrele(dqp); } return error; } /* - * Release a reference to the dquot (decrement ref-count) and unlock it. - * - * If there is a group quota attached to this dquot, carefully release that - * too without tripping over deadlocks'n'stuff. + * Release a reference to the dquot. */ void -xfs_qm_dqput( +xfs_qm_dqrele( struct xfs_dquot *dqp) { - ASSERT(dqp->q_nrefs > 0); - ASSERT(XFS_DQ_IS_LOCKED(dqp)); + if (!dqp) + return; - trace_xfs_dqput(dqp); + trace_xfs_dqrele(dqp); - if (--dqp->q_nrefs == 0) { + if (lockref_put_or_lock(&dqp->q_lockref)) + return; + if (!--dqp->q_lockref.count) { struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; - trace_xfs_dqput_free(dqp); + trace_xfs_dqrele_free(dqp); if (list_lru_add_obj(&qi->qi_lru, &dqp->q_lru)) XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused); } - xfs_dqunlock(dqp); -} - -/* - * Release a dquot. Flush it if dirty, then dqput() it. - * dquot must not be locked. - */ -void -xfs_qm_dqrele( - struct xfs_dquot *dqp) -{ - if (!dqp) - return; - - trace_xfs_dqrele(dqp); - - xfs_dqlock(dqp); - /* - * We don't care to flush it if the dquot is dirty here. - * That will create stutters that we want to avoid. - * Instead we do a delayed write when we try to reclaim - * a dirty dquot. Also xfs_sync will take part of the burden... - */ - xfs_qm_dqput(dqp); + spin_unlock(&dqp->q_lockref.lock); } /* @@ -1186,9 +1161,8 @@ xfs_qm_dqflush_done( if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) && (lip->li_lsn == qlip->qli_flush_lsn || test_bit(XFS_LI_FAILED, &lip->li_flags))) { - spin_lock(&ailp->ail_lock); - xfs_clear_li_failed(lip); + clear_bit(XFS_LI_FAILED, &lip->li_flags); if (lip->li_lsn == qlip->qli_flush_lsn) { /* xfs_ail_update_finish() drops the AIL lock */ tail_lsn = xfs_ail_delete_one(ailp, lip); @@ -1399,11 +1373,9 @@ xfs_qm_dqflush( ASSERT(XFS_DQ_IS_LOCKED(dqp)); ASSERT(!completion_done(&dqp->q_flush)); + ASSERT(atomic_read(&dqp->q_pincount) == 0); trace_xfs_dqflush(dqp); - - xfs_qm_dqunpin_wait(dqp); - fa = xfs_qm_dqflush_check(dqp); if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 61217adf5ba5..bbb824adca82 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -71,7 +71,7 @@ struct xfs_dquot { xfs_dqtype_t q_type; uint16_t q_flags; xfs_dqid_t q_id; - uint q_nrefs; + struct lockref q_lockref; int q_bufoffset; xfs_daddr_t q_blkno; xfs_fileoff_t q_fileoffset; @@ -121,21 +121,6 @@ static inline void xfs_dqfunlock(struct xfs_dquot *dqp) complete(&dqp->q_flush); } -static inline int xfs_dqlock_nowait(struct xfs_dquot *dqp) -{ - return mutex_trylock(&dqp->q_qlock); -} - -static inline void xfs_dqlock(struct xfs_dquot *dqp) -{ - mutex_lock(&dqp->q_qlock); -} - -static inline void xfs_dqunlock(struct xfs_dquot *dqp) -{ - mutex_unlock(&dqp->q_qlock); -} - static inline int xfs_dquot_type(const struct xfs_dquot *dqp) { @@ -233,7 +218,6 @@ int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id, int xfs_qm_dqget_uncached(struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type, struct xfs_dquot **dqpp); -void xfs_qm_dqput(struct xfs_dquot *dqp); void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); void xfs_dqlockn(struct xfs_dqtrx *q); @@ -246,9 +230,7 @@ void xfs_dquot_detach_buf(struct xfs_dquot *dqp); static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) { - xfs_dqlock(dqp); - dqp->q_nrefs++; - xfs_dqunlock(dqp); + lockref_get(&dqp->q_lockref); return dqp; } diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 271b195ebb93..b374cd9f1900 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -132,7 +132,7 @@ xfs_qm_dquot_logitem_push( if (atomic_read(&dqp->q_pincount) > 0) return XFS_ITEM_PINNED; - if (!xfs_dqlock_nowait(dqp)) + if (!mutex_trylock(&dqp->q_qlock)) return XFS_ITEM_LOCKED; /* @@ -177,7 +177,7 @@ xfs_qm_dquot_logitem_push( out_relock_ail: spin_lock(&lip->li_ailp->ail_lock); out_unlock: - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); return rval; } @@ -195,7 +195,7 @@ xfs_qm_dquot_logitem_release( * transaction layer, within trans_commit. Hence, no LI_HOLD flag * for the logitem. */ - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); } STATIC void diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c index 2c2720ce6923..89bc9bcaf51e 100644 --- a/fs/xfs/xfs_dquot_item_recover.c +++ b/fs/xfs/xfs_dquot_item_recover.c @@ -34,10 +34,10 @@ xlog_recover_dquot_ra_pass2( if (mp->m_qflags == 0) return; - recddq = item->ri_buf[1].i_addr; + recddq = item->ri_buf[1].iov_base; if (recddq == NULL) return; - if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) + if (item->ri_buf[1].iov_len < sizeof(struct xfs_disk_dquot)) return; type = recddq->d_type & XFS_DQTYPE_REC_MASK; @@ -45,7 +45,7 @@ xlog_recover_dquot_ra_pass2( if (log->l_quotaoffs_flag & type) return; - dq_f = item->ri_buf[0].i_addr; + dq_f = item->ri_buf[0].iov_base; ASSERT(dq_f); ASSERT(dq_f->qlf_len == 1); @@ -79,14 +79,14 @@ xlog_recover_dquot_commit_pass2( if (mp->m_qflags == 0) return 0; - recddq = item->ri_buf[1].i_addr; + recddq = item->ri_buf[1].iov_base; if (recddq == NULL) { xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); return -EFSCORRUPTED; } - if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) { - xfs_alert(log->l_mp, "dquot too small (%d) in %s.", - item->ri_buf[1].i_len, __func__); + if (item->ri_buf[1].iov_len < sizeof(struct xfs_disk_dquot)) { + xfs_alert(log->l_mp, "dquot too small (%zd) in %s.", + item->ri_buf[1].iov_len, __func__); return -EFSCORRUPTED; } @@ -108,7 +108,7 @@ xlog_recover_dquot_commit_pass2( * The other possibility, of course, is that the quota subsystem was * removed since the last mount - ENOSYS. */ - dq_f = item->ri_buf[0].i_addr; + dq_f = item->ri_buf[0].iov_base; ASSERT(dq_f); fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id); if (fa) { @@ -147,7 +147,7 @@ xlog_recover_dquot_commit_pass2( } } - memcpy(ddq, recddq, item->ri_buf[1].i_len); + memcpy(ddq, recddq, item->ri_buf[1].iov_len); if (xfs_has_crc(mp)) { xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); @@ -192,7 +192,7 @@ xlog_recover_quotaoff_commit_pass1( struct xlog *log, struct xlog_recover_item *item) { - struct xfs_qoff_logformat *qoff_f = item->ri_buf[0].i_addr; + struct xfs_qoff_logformat *qoff_f = item->ri_buf[0].iov_base; ASSERT(qoff_f); /* diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index dbd87e137694..39830b252ac8 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -10,61 +10,17 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_sysfs.h" #include "xfs_inode.h" #ifdef DEBUG -static unsigned int xfs_errortag_random_default[] = { - XFS_RANDOM_DEFAULT, - XFS_RANDOM_IFLUSH_1, - XFS_RANDOM_IFLUSH_2, - XFS_RANDOM_IFLUSH_3, - XFS_RANDOM_IFLUSH_4, - XFS_RANDOM_IFLUSH_5, - XFS_RANDOM_IFLUSH_6, - XFS_RANDOM_DA_READ_BUF, - XFS_RANDOM_BTREE_CHECK_LBLOCK, - XFS_RANDOM_BTREE_CHECK_SBLOCK, - XFS_RANDOM_ALLOC_READ_AGF, - XFS_RANDOM_IALLOC_READ_AGI, - XFS_RANDOM_ITOBP_INOTOBP, - XFS_RANDOM_IUNLINK, - XFS_RANDOM_IUNLINK_REMOVE, - XFS_RANDOM_DIR_INO_VALIDATE, - XFS_RANDOM_BULKSTAT_READ_CHUNK, - XFS_RANDOM_IODONE_IOERR, - XFS_RANDOM_STRATREAD_IOERR, - XFS_RANDOM_STRATCMPL_IOERR, - XFS_RANDOM_DIOWRITE_IOERR, - XFS_RANDOM_BMAPIFORMAT, - XFS_RANDOM_FREE_EXTENT, - XFS_RANDOM_RMAP_FINISH_ONE, - XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE, - XFS_RANDOM_REFCOUNT_FINISH_ONE, - XFS_RANDOM_BMAP_FINISH_ONE, - XFS_RANDOM_AG_RESV_CRITICAL, - 0, /* XFS_RANDOM_DROP_WRITES has been removed */ - XFS_RANDOM_LOG_BAD_CRC, - XFS_RANDOM_LOG_ITEM_PIN, - XFS_RANDOM_BUF_LRU_REF, - XFS_RANDOM_FORCE_SCRUB_REPAIR, - XFS_RANDOM_FORCE_SUMMARY_RECALC, - XFS_RANDOM_IUNLINK_FALLBACK, - XFS_RANDOM_BUF_IOERROR, - XFS_RANDOM_REDUCE_MAX_IEXTENTS, - XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT, - XFS_RANDOM_AG_RESV_FAIL, - XFS_RANDOM_LARP, - XFS_RANDOM_DA_LEAF_SPLIT, - XFS_RANDOM_ATTR_LEAF_TO_NODE, - XFS_RANDOM_WB_DELAY_MS, - XFS_RANDOM_WRITE_DELAY_MS, - XFS_RANDOM_EXCHMAPS_FINISH_ONE, - XFS_RANDOM_METAFILE_RESV_CRITICAL, -}; +#define XFS_ERRTAG(_tag, _name, _default) \ + [XFS_ERRTAG_##_tag] = (_default), +#include "xfs_errortag.h" +static const unsigned int xfs_errortag_random_default[] = { XFS_ERRTAGS }; +#undef XFS_ERRTAG struct xfs_errortag_attr { struct attribute attr; @@ -93,21 +49,18 @@ xfs_errortag_attr_store( size_t count) { struct xfs_mount *mp = to_mp(kobject); - struct xfs_errortag_attr *xfs_attr = to_attr(attr); + unsigned int error_tag = to_attr(attr)->tag; int ret; - unsigned int val; if (strcmp(buf, "default") == 0) { - val = xfs_errortag_random_default[xfs_attr->tag]; + mp->m_errortag[error_tag] = + xfs_errortag_random_default[error_tag]; } else { - ret = kstrtouint(buf, 0, &val); + ret = kstrtouint(buf, 0, &mp->m_errortag[error_tag]); if (ret) return ret; } - ret = xfs_errortag_set(mp, xfs_attr->tag, val); - if (ret) - return ret; return count; } @@ -118,10 +71,9 @@ xfs_errortag_attr_show( char *buf) { struct xfs_mount *mp = to_mp(kobject); - struct xfs_errortag_attr *xfs_attr = to_attr(attr); + unsigned int error_tag = to_attr(attr)->tag; - return snprintf(buf, PAGE_SIZE, "%u\n", - xfs_errortag_get(mp, xfs_attr->tag)); + return snprintf(buf, PAGE_SIZE, "%u\n", mp->m_errortag[error_tag]); } static const struct sysfs_ops xfs_errortag_sysfs_ops = { @@ -129,110 +81,28 @@ static const struct sysfs_ops xfs_errortag_sysfs_ops = { .store = xfs_errortag_attr_store, }; -#define XFS_ERRORTAG_ATTR_RW(_name, _tag) \ +#define XFS_ERRTAG(_tag, _name, _default) \ static struct xfs_errortag_attr xfs_errortag_attr_##_name = { \ .attr = {.name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(S_IWUSR | S_IRUGO) }, \ - .tag = (_tag), \ -} - -#define XFS_ERRORTAG_ATTR_LIST(_name) &xfs_errortag_attr_##_name.attr - -XFS_ERRORTAG_ATTR_RW(noerror, XFS_ERRTAG_NOERROR); -XFS_ERRORTAG_ATTR_RW(iflush1, XFS_ERRTAG_IFLUSH_1); -XFS_ERRORTAG_ATTR_RW(iflush2, XFS_ERRTAG_IFLUSH_2); -XFS_ERRORTAG_ATTR_RW(iflush3, XFS_ERRTAG_IFLUSH_3); -XFS_ERRORTAG_ATTR_RW(iflush4, XFS_ERRTAG_IFLUSH_4); -XFS_ERRORTAG_ATTR_RW(iflush5, XFS_ERRTAG_IFLUSH_5); -XFS_ERRORTAG_ATTR_RW(iflush6, XFS_ERRTAG_IFLUSH_6); -XFS_ERRORTAG_ATTR_RW(dareadbuf, XFS_ERRTAG_DA_READ_BUF); -XFS_ERRORTAG_ATTR_RW(btree_chk_lblk, XFS_ERRTAG_BTREE_CHECK_LBLOCK); -XFS_ERRORTAG_ATTR_RW(btree_chk_sblk, XFS_ERRTAG_BTREE_CHECK_SBLOCK); -XFS_ERRORTAG_ATTR_RW(readagf, XFS_ERRTAG_ALLOC_READ_AGF); -XFS_ERRORTAG_ATTR_RW(readagi, XFS_ERRTAG_IALLOC_READ_AGI); -XFS_ERRORTAG_ATTR_RW(itobp, XFS_ERRTAG_ITOBP_INOTOBP); -XFS_ERRORTAG_ATTR_RW(iunlink, XFS_ERRTAG_IUNLINK); -XFS_ERRORTAG_ATTR_RW(iunlinkrm, XFS_ERRTAG_IUNLINK_REMOVE); -XFS_ERRORTAG_ATTR_RW(dirinovalid, XFS_ERRTAG_DIR_INO_VALIDATE); -XFS_ERRORTAG_ATTR_RW(bulkstat, XFS_ERRTAG_BULKSTAT_READ_CHUNK); -XFS_ERRORTAG_ATTR_RW(logiodone, XFS_ERRTAG_IODONE_IOERR); -XFS_ERRORTAG_ATTR_RW(stratread, XFS_ERRTAG_STRATREAD_IOERR); -XFS_ERRORTAG_ATTR_RW(stratcmpl, XFS_ERRTAG_STRATCMPL_IOERR); -XFS_ERRORTAG_ATTR_RW(diowrite, XFS_ERRTAG_DIOWRITE_IOERR); -XFS_ERRORTAG_ATTR_RW(bmapifmt, XFS_ERRTAG_BMAPIFORMAT); -XFS_ERRORTAG_ATTR_RW(free_extent, XFS_ERRTAG_FREE_EXTENT); -XFS_ERRORTAG_ATTR_RW(rmap_finish_one, XFS_ERRTAG_RMAP_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(refcount_continue_update, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE); -XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL); -XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); -XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); -XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); -XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR); -XFS_ERRORTAG_ATTR_RW(bad_summary, XFS_ERRTAG_FORCE_SUMMARY_RECALC); -XFS_ERRORTAG_ATTR_RW(iunlink_fallback, XFS_ERRTAG_IUNLINK_FALLBACK); -XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR); -XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS); -XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT); -XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL); -XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP); -XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT); -XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE); -XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS); -XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS); -XFS_ERRORTAG_ATTR_RW(exchmaps_finish_one, XFS_ERRTAG_EXCHMAPS_FINISH_ONE); -XFS_ERRORTAG_ATTR_RW(metafile_resv_crit, XFS_ERRTAG_METAFILE_RESV_CRITICAL); + .tag = XFS_ERRTAG_##_tag, \ +}; +#include "xfs_errortag.h" +XFS_ERRTAGS +#undef XFS_ERRTAG +#define XFS_ERRTAG(_tag, _name, _default) \ + &xfs_errortag_attr_##_name.attr, +#include "xfs_errortag.h" static struct attribute *xfs_errortag_attrs[] = { - XFS_ERRORTAG_ATTR_LIST(noerror), - XFS_ERRORTAG_ATTR_LIST(iflush1), - XFS_ERRORTAG_ATTR_LIST(iflush2), - XFS_ERRORTAG_ATTR_LIST(iflush3), - XFS_ERRORTAG_ATTR_LIST(iflush4), - XFS_ERRORTAG_ATTR_LIST(iflush5), - XFS_ERRORTAG_ATTR_LIST(iflush6), - XFS_ERRORTAG_ATTR_LIST(dareadbuf), - XFS_ERRORTAG_ATTR_LIST(btree_chk_lblk), - XFS_ERRORTAG_ATTR_LIST(btree_chk_sblk), - XFS_ERRORTAG_ATTR_LIST(readagf), - XFS_ERRORTAG_ATTR_LIST(readagi), - XFS_ERRORTAG_ATTR_LIST(itobp), - XFS_ERRORTAG_ATTR_LIST(iunlink), - XFS_ERRORTAG_ATTR_LIST(iunlinkrm), - XFS_ERRORTAG_ATTR_LIST(dirinovalid), - XFS_ERRORTAG_ATTR_LIST(bulkstat), - XFS_ERRORTAG_ATTR_LIST(logiodone), - XFS_ERRORTAG_ATTR_LIST(stratread), - XFS_ERRORTAG_ATTR_LIST(stratcmpl), - XFS_ERRORTAG_ATTR_LIST(diowrite), - XFS_ERRORTAG_ATTR_LIST(bmapifmt), - XFS_ERRORTAG_ATTR_LIST(free_extent), - XFS_ERRORTAG_ATTR_LIST(rmap_finish_one), - XFS_ERRORTAG_ATTR_LIST(refcount_continue_update), - XFS_ERRORTAG_ATTR_LIST(refcount_finish_one), - XFS_ERRORTAG_ATTR_LIST(bmap_finish_one), - XFS_ERRORTAG_ATTR_LIST(ag_resv_critical), - XFS_ERRORTAG_ATTR_LIST(log_bad_crc), - XFS_ERRORTAG_ATTR_LIST(log_item_pin), - XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), - XFS_ERRORTAG_ATTR_LIST(force_repair), - XFS_ERRORTAG_ATTR_LIST(bad_summary), - XFS_ERRORTAG_ATTR_LIST(iunlink_fallback), - XFS_ERRORTAG_ATTR_LIST(buf_ioerror), - XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents), - XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent), - XFS_ERRORTAG_ATTR_LIST(ag_resv_fail), - XFS_ERRORTAG_ATTR_LIST(larp), - XFS_ERRORTAG_ATTR_LIST(da_leaf_split), - XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node), - XFS_ERRORTAG_ATTR_LIST(wb_delay_ms), - XFS_ERRORTAG_ATTR_LIST(write_delay_ms), - XFS_ERRORTAG_ATTR_LIST(exchmaps_finish_one), - XFS_ERRORTAG_ATTR_LIST(metafile_resv_crit), - NULL, + XFS_ERRTAGS + NULL }; ATTRIBUTE_GROUPS(xfs_errortag); +#undef XFS_ERRTAG + +/* -1 because XFS_ERRTAG_DROP_WRITES got removed, + 1 for NULL termination */ +static_assert(ARRAY_SIZE(xfs_errortag_attrs) == XFS_ERRTAG_MAX); static const struct kobj_type xfs_errortag_ktype = { .release = xfs_sysfs_release, @@ -295,7 +165,6 @@ xfs_errortag_enabled( bool xfs_errortag_test( struct xfs_mount *mp, - const char *expression, const char *file, int line, unsigned int error_tag) @@ -321,36 +190,12 @@ xfs_errortag_test( return false; xfs_warn_ratelimited(mp, -"Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", - expression, file, line, mp->m_super->s_id); +"Injecting error at file %s, line %d, on filesystem \"%s\"", + file, line, mp->m_super->s_id); return true; } int -xfs_errortag_get( - struct xfs_mount *mp, - unsigned int error_tag) -{ - if (!xfs_errortag_valid(error_tag)) - return -EINVAL; - - return mp->m_errortag[error_tag]; -} - -int -xfs_errortag_set( - struct xfs_mount *mp, - unsigned int error_tag, - unsigned int tag_value) -{ - if (!xfs_errortag_valid(error_tag)) - return -EINVAL; - - mp->m_errortag[error_tag] = tag_value; - return 0; -} - -int xfs_errortag_add( struct xfs_mount *mp, unsigned int error_tag) @@ -359,9 +204,8 @@ xfs_errortag_add( if (!xfs_errortag_valid(error_tag)) return -EINVAL; - - return xfs_errortag_set(mp, error_tag, - xfs_errortag_random_default[error_tag]); + mp->m_errortag[error_tag] = xfs_errortag_random_default[error_tag]; + return 0; } int diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 0b9c5ba8a598..fe6a71bbe9cd 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -8,22 +8,17 @@ struct xfs_mount; -extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, - const char *filename, int linenum, - xfs_failaddr_t failaddr); -extern void xfs_corruption_error(const char *tag, int level, - struct xfs_mount *mp, const void *buf, size_t bufsize, - const char *filename, int linenum, - xfs_failaddr_t failaddr); +void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, + const char *filename, int linenum, xfs_failaddr_t failaddr); +void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp, + const void *buf, size_t bufsize, const char *filename, + int linenum, xfs_failaddr_t failaddr); void xfs_buf_corruption_error(struct xfs_buf *bp, xfs_failaddr_t fa); -extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error, - const char *name, const void *buf, size_t bufsz, - xfs_failaddr_t failaddr); -extern void xfs_verifier_error(struct xfs_buf *bp, int error, - xfs_failaddr_t failaddr); -extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, - const char *name, const void *buf, size_t bufsz, - xfs_failaddr_t failaddr); +void xfs_buf_verifier_error(struct xfs_buf *bp, int error, const char *name, + const void *buf, size_t bufsz, xfs_failaddr_t failaddr); +void xfs_verifier_error(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); +void xfs_inode_verifier_error(struct xfs_inode *ip, int error, const char *name, + const void *buf, size_t bufsz, xfs_failaddr_t failaddr); #define XFS_ERROR_REPORT(e, lvl, mp) \ xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) @@ -39,12 +34,12 @@ extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, #define XFS_CORRUPTION_DUMP_LEN (128) #ifdef DEBUG -extern int xfs_errortag_init(struct xfs_mount *mp); -extern void xfs_errortag_del(struct xfs_mount *mp); -extern bool xfs_errortag_test(struct xfs_mount *mp, const char *expression, - const char *file, int line, unsigned int error_tag); -#define XFS_TEST_ERROR(expr, mp, tag) \ - ((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag))) +int xfs_errortag_init(struct xfs_mount *mp); +void xfs_errortag_del(struct xfs_mount *mp); +bool xfs_errortag_test(struct xfs_mount *mp, const char *file, int line, + unsigned int error_tag); +#define XFS_TEST_ERROR(mp, tag) \ + xfs_errortag_test((mp), __FILE__, __LINE__, (tag)) bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag); #define XFS_ERRORTAG_DELAY(mp, tag) \ do { \ @@ -58,17 +53,13 @@ bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag); mdelay((mp)->m_errortag[(tag)]); \ } while (0) -extern int xfs_errortag_get(struct xfs_mount *mp, unsigned int error_tag); -extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag, - unsigned int tag_value); -extern int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag); -extern int xfs_errortag_clearall(struct xfs_mount *mp); +int xfs_errortag_add(struct xfs_mount *mp, unsigned int error_tag); +int xfs_errortag_clearall(struct xfs_mount *mp); #else #define xfs_errortag_init(mp) (0) #define xfs_errortag_del(mp) -#define XFS_TEST_ERROR(expr, mp, tag) (expr) +#define XFS_TEST_ERROR(mp, tag) (false) #define XFS_ERRORTAG_DELAY(mp, tag) ((void)0) -#define xfs_errortag_set(mp, tag, val) (ENOSYS) #define xfs_errortag_add(mp, tag) (ENOSYS) #define xfs_errortag_clearall(mp) (ENOSYS) #endif /* DEBUG */ diff --git a/fs/xfs/xfs_exchmaps_item.c b/fs/xfs/xfs_exchmaps_item.c index 264a121c5e16..229cbe0adf17 100644 --- a/fs/xfs/xfs_exchmaps_item.c +++ b/fs/xfs/xfs_exchmaps_item.c @@ -558,12 +558,12 @@ xlog_recover_xmi_commit_pass2( size_t len; len = sizeof(struct xfs_xmi_log_format); - if (item->ri_buf[0].i_len != len) { + if (item->ri_buf[0].iov_len != len) { XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); return -EFSCORRUPTED; } - xmi_formatp = item->ri_buf[0].i_addr; + xmi_formatp = item->ri_buf[0].iov_base; if (xmi_formatp->__pad != 0) { XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); return -EFSCORRUPTED; @@ -598,8 +598,8 @@ xlog_recover_xmd_commit_pass2( { struct xfs_xmd_log_format *xmd_formatp; - xmd_formatp = item->ri_buf[0].i_addr; - if (item->ri_buf[0].i_len != sizeof(struct xfs_xmd_log_format)) { + xmd_formatp = item->ri_buf[0].iov_base; + if (item->ri_buf[0].iov_len != sizeof(struct xfs_xmd_log_format)) { XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index f069b04e8ea1..3e6e019b6146 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -68,4 +68,12 @@ static inline void xfs_extent_busy_sort(struct list_head *list) list_sort(NULL, list, xfs_extent_busy_ag_cmp); } +/* + * Zoned RTGs don't need to track busy extents, as the actual block freeing only + * happens by a zone reset, which forces out all transactions that touched the + * to be reset zone first. + */ +#define xfs_group_has_extent_busy(mp, type) \ + ((type) == XG_TYPE_AG || !xfs_has_zoned((mp))) + #endif /* __XFS_EXTENT_BUSY_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 777438b853da..418ddab590e0 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -83,6 +83,11 @@ xfs_efi_item_size( *nbytes += xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents); } +unsigned int xfs_efi_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_efi_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given efi log item. We use only 1 iovec, and we point that @@ -177,15 +182,18 @@ xfs_efi_init( * It will handle the conversion of formats if necessary. */ STATIC int -xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) +xfs_efi_copy_format( + struct kvec *buf, + struct xfs_efi_log_format *dst_efi_fmt) { - xfs_efi_log_format_t *src_efi_fmt = buf->i_addr; - uint i; - uint len = xfs_efi_log_format_sizeof(src_efi_fmt->efi_nextents); - uint len32 = xfs_efi_log_format32_sizeof(src_efi_fmt->efi_nextents); - uint len64 = xfs_efi_log_format64_sizeof(src_efi_fmt->efi_nextents); + struct xfs_efi_log_format *src_efi_fmt = buf->iov_base; + uint len, len32, len64, i; + + len = xfs_efi_log_format_sizeof(src_efi_fmt->efi_nextents); + len32 = xfs_efi_log_format32_sizeof(src_efi_fmt->efi_nextents); + len64 = xfs_efi_log_format64_sizeof(src_efi_fmt->efi_nextents); - if (buf->i_len == len) { + if (buf->iov_len == len) { memcpy(dst_efi_fmt, src_efi_fmt, offsetof(struct xfs_efi_log_format, efi_extents)); for (i = 0; i < src_efi_fmt->efi_nextents; i++) @@ -193,8 +201,8 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) &src_efi_fmt->efi_extents[i], sizeof(struct xfs_extent)); return 0; - } else if (buf->i_len == len32) { - xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr; + } else if (buf->iov_len == len32) { + struct xfs_efi_log_format_32 *src_efi_fmt_32 = buf->iov_base; dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type; dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size; @@ -207,8 +215,8 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) src_efi_fmt_32->efi_extents[i].ext_len; } return 0; - } else if (buf->i_len == len64) { - xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->i_addr; + } else if (buf->iov_len == len64) { + struct xfs_efi_log_format_64 *src_efi_fmt_64 = buf->iov_base; dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type; dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size; @@ -222,8 +230,8 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) } return 0; } - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, NULL, buf->i_addr, - buf->i_len); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, NULL, buf->iov_base, + buf->iov_len); return -EFSCORRUPTED; } @@ -254,6 +262,11 @@ xfs_efd_item_size( *nbytes += xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents); } +unsigned int xfs_efd_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_efd_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given efd log item. We use only 1 iovec, and we point that @@ -855,11 +868,11 @@ xlog_recover_efi_commit_pass2( struct xfs_efi_log_format *efi_formatp; int error; - efi_formatp = item->ri_buf[0].i_addr; + efi_formatp = item->ri_buf[0].iov_base; - if (item->ri_buf[0].i_len < xfs_efi_log_format_sizeof(0)) { + if (item->ri_buf[0].iov_len < xfs_efi_log_format_sizeof(0)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } @@ -894,11 +907,11 @@ xlog_recover_rtefi_commit_pass2( struct xfs_efi_log_format *efi_formatp; int error; - efi_formatp = item->ri_buf[0].i_addr; + efi_formatp = item->ri_buf[0].iov_base; - if (item->ri_buf[0].i_len < xfs_efi_log_format_sizeof(0)) { + if (item->ri_buf[0].iov_len < xfs_efi_log_format_sizeof(0)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } @@ -923,7 +936,7 @@ xlog_recover_rtefi_commit_pass2( xfs_lsn_t lsn) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } #endif @@ -948,9 +961,9 @@ xlog_recover_efd_commit_pass2( xfs_lsn_t lsn) { struct xfs_efd_log_format *efd_formatp; - int buflen = item->ri_buf[0].i_len; + int buflen = item->ri_buf[0].iov_len; - efd_formatp = item->ri_buf[0].i_addr; + efd_formatp = item->ri_buf[0].iov_base; if (buflen < sizeof(struct xfs_efd_log_format)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, @@ -958,9 +971,9 @@ xlog_recover_efd_commit_pass2( return -EFSCORRUPTED; } - if (item->ri_buf[0].i_len != xfs_efd_log_format32_sizeof( + if (item->ri_buf[0].iov_len != xfs_efd_log_format32_sizeof( efd_formatp->efd_nextents) && - item->ri_buf[0].i_len != xfs_efd_log_format64_sizeof( + item->ri_buf[0].iov_len != xfs_efd_log_format64_sizeof( efd_formatp->efd_nextents)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, efd_formatp, buflen); @@ -985,9 +998,9 @@ xlog_recover_rtefd_commit_pass2( xfs_lsn_t lsn) { struct xfs_efd_log_format *efd_formatp; - int buflen = item->ri_buf[0].i_len; + int buflen = item->ri_buf[0].iov_len; - efd_formatp = item->ri_buf[0].i_addr; + efd_formatp = item->ri_buf[0].iov_base; if (buflen < sizeof(struct xfs_efd_log_format)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, @@ -995,9 +1008,9 @@ xlog_recover_rtefd_commit_pass2( return -EFSCORRUPTED; } - if (item->ri_buf[0].i_len != xfs_efd_log_format32_sizeof( + if (item->ri_buf[0].iov_len != xfs_efd_log_format32_sizeof( efd_formatp->efd_nextents) && - item->ri_buf[0].i_len != xfs_efd_log_format64_sizeof( + item->ri_buf[0].iov_len != xfs_efd_log_format64_sizeof( efd_formatp->efd_nextents)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, efd_formatp, buflen); diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 41b7c4306079..af1b0331f7af 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -49,7 +49,7 @@ struct xfs_efi_log_item { struct xfs_log_item efi_item; atomic_t efi_refcount; atomic_t efi_next_extent; - xfs_efi_log_format_t efi_format; + struct xfs_efi_log_format efi_format; }; static inline size_t @@ -69,7 +69,7 @@ struct xfs_efd_log_item { struct xfs_log_item efd_item; struct xfs_efi_log_item *efd_efip; uint efd_next_extent; - xfs_efd_log_format_t efd_format; + struct xfs_efd_log_format efd_format; }; static inline size_t @@ -94,4 +94,7 @@ void xfs_extent_free_defer_add(struct xfs_trans *tp, struct xfs_extent_free_item *xefi, struct xfs_defer_pending **dfpp); +unsigned int xfs_efi_log_space(unsigned int nr); +unsigned int xfs_efd_log_space(unsigned int nr); + #endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 84f08c976ac4..7874cf745af3 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -27,6 +27,8 @@ #include "xfs_file.h" #include "xfs_aops.h" #include "xfs_zone_alloc.h" +#include "xfs_error.h" +#include "xfs_errortag.h" #include <linux/dax.h> #include <linux/falloc.h> @@ -75,52 +77,47 @@ xfs_dir_fsync( return xfs_log_force_inode(ip); } -static xfs_csn_t -xfs_fsync_seq( - struct xfs_inode *ip, - bool datasync) -{ - if (!xfs_ipincount(ip)) - return 0; - if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) - return 0; - return ip->i_itemp->ili_commit_seq; -} - /* - * All metadata updates are logged, which means that we just have to flush the - * log up to the latest LSN that touched the inode. + * All metadata updates are logged, which means that we just have to push the + * journal to the required sequence number than holds the updates. We track + * datasync commits separately to full sync commits, and hence only need to + * select the correct sequence number for the log force here. + * + * We don't have to serialise against concurrent modifications, as we do not + * have to wait for modifications that have not yet completed. We define a + * transaction commit as completing when the commit sequence number is updated, + * hence if the sequence number has not updated, the sync operation has been + * run before the commit completed and we don't have to wait for it. * - * If we have concurrent fsync/fdatasync() calls, we need them to all block on - * the log force before we clear the ili_fsync_fields field. This ensures that - * we don't get a racing sync operation that does not wait for the metadata to - * hit the journal before returning. If we race with clearing ili_fsync_fields, - * then all that will happen is the log force will do nothing as the lsn will - * already be on disk. We can't race with setting ili_fsync_fields because that - * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock - * shared until after the ili_fsync_fields is cleared. + * If we have concurrent fsync/fdatasync() calls, the sequence numbers remain + * set on the log item until - at least - the journal flush completes. In + * reality, they are only cleared when the inode is fully unpinned (i.e. + * persistent in the journal and not dirty in the CIL), and so we rely on + * xfs_log_force_seq() either skipping sequences that have been persisted or + * waiting on sequences that are still in flight to correctly order concurrent + * sync operations. */ -static int +static int xfs_fsync_flush_log( struct xfs_inode *ip, bool datasync, int *log_flushed) { - int error = 0; - xfs_csn_t seq; + struct xfs_inode_log_item *iip = ip->i_itemp; + xfs_csn_t seq = 0; - xfs_ilock(ip, XFS_ILOCK_SHARED); - seq = xfs_fsync_seq(ip, datasync); - if (seq) { - error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, - log_flushed); + spin_lock(&iip->ili_lock); + if (datasync) + seq = iip->ili_datasync_seq; + else + seq = iip->ili_commit_seq; + spin_unlock(&iip->ili_lock); - spin_lock(&ip->i_itemp->ili_lock); - ip->i_itemp->ili_fsync_fields = 0; - spin_unlock(&ip->i_itemp->ili_lock); - } - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; + if (!seq) + return 0; + + return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, + log_flushed); } STATIC int @@ -158,12 +155,10 @@ xfs_file_fsync( error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); /* - * Any inode that has dirty modifications in the log is pinned. The - * racy check here for a pinned inode will not catch modifications - * that happen concurrently to the fsync call, but fsync semantics - * only require to sync previously completed I/O. + * If the inode has a inode log item attached, it may need the journal + * flushed to persist any changes the log item might be tracking. */ - if (xfs_ipincount(ip)) { + if (ip->i_itemp) { err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed); if (err2 && !error) error = err2; @@ -497,7 +492,7 @@ restart: static ssize_t xfs_zoned_write_space_reserve( - struct xfs_inode *ip, + struct xfs_mount *mp, struct kiocb *iocb, struct iov_iter *from, unsigned int flags, @@ -533,8 +528,8 @@ xfs_zoned_write_space_reserve( * * Any remaining block will be returned after the write. */ - return xfs_zoned_space_reserve(ip, - XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2, flags, ac); + return xfs_zoned_space_reserve(mp, XFS_B_TO_FSB(mp, count) + 1 + 2, + flags, ac); } static int @@ -576,7 +571,10 @@ xfs_dio_write_end_io( nofs_flag = memalloc_nofs_save(); if (flags & IOMAP_DIO_COW) { - error = xfs_reflink_end_cow(ip, offset, size); + if (iocb->ki_flags & IOCB_ATOMIC) + error = xfs_reflink_end_atomic_cow(ip, offset, size); + else + error = xfs_reflink_end_cow(ip, offset, size); if (error) goto out; } @@ -678,8 +676,17 @@ xfs_file_dio_write_aligned( struct xfs_zone_alloc_ctx *ac) { unsigned int iolock = XFS_IOLOCK_SHARED; + unsigned int dio_flags = 0; ssize_t ret; + /* + * For always COW inodes, each bio must be aligned to the file system + * block size and not just the device sector size because we need to + * allocate a block-aligned amount of space for each write. + */ + if (xfs_is_always_cow_inode(ip)) + dio_flags |= IOMAP_DIO_FSBLOCK_ALIGNED; + ret = xfs_ilock_iocb_for_write(iocb, &iolock); if (ret) return ret; @@ -697,7 +704,7 @@ xfs_file_dio_write_aligned( iolock = XFS_IOLOCK_SHARED; } trace_xfs_file_direct_write(iocb, from); - ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0); + ret = iomap_dio_rw(iocb, from, ops, dops, dio_flags, ac, 0); out_unlock: xfs_iunlock(ip, iolock); return ret; @@ -715,13 +722,79 @@ xfs_file_dio_write_zoned( struct xfs_zone_alloc_ctx ac = { }; ssize_t ret; - ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac); + ret = xfs_zoned_write_space_reserve(ip->i_mount, iocb, from, 0, &ac); if (ret < 0) return ret; ret = xfs_file_dio_write_aligned(ip, iocb, from, &xfs_zoned_direct_write_iomap_ops, &xfs_dio_zoned_write_ops, &ac); - xfs_zoned_space_unreserve(ip, &ac); + xfs_zoned_space_unreserve(ip->i_mount, &ac); + return ret; +} + +/* + * Handle block atomic writes + * + * Two methods of atomic writes are supported: + * - REQ_ATOMIC-based, which would typically use some form of HW offload in the + * disk + * - COW-based, which uses a COW fork as a staging extent for data updates + * before atomically updating extent mappings for the range being written + * + */ +static noinline ssize_t +xfs_file_dio_write_atomic( + struct xfs_inode *ip, + struct kiocb *iocb, + struct iov_iter *from) +{ + unsigned int iolock = XFS_IOLOCK_SHARED; + ssize_t ret, ocount = iov_iter_count(from); + const struct iomap_ops *dops; + + /* + * HW offload should be faster, so try that first if it is already + * known that the write length is not too large. + */ + if (ocount > xfs_inode_buftarg(ip)->bt_awu_max) + dops = &xfs_atomic_write_cow_iomap_ops; + else + dops = &xfs_direct_write_iomap_ops; + +retry: + ret = xfs_ilock_iocb_for_write(iocb, &iolock); + if (ret) + return ret; + + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); + if (ret) + goto out_unlock; + + /* Demote similar to xfs_file_dio_write_aligned() */ + if (iolock == XFS_IOLOCK_EXCL) { + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); + iolock = XFS_IOLOCK_SHARED; + } + + trace_xfs_file_direct_write(iocb, from); + ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops, + 0, NULL, 0); + + /* + * The retry mechanism is based on the ->iomap_begin method returning + * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not + * possible. The REQ_ATOMIC-based method typically not be possible if + * the write spans multiple extents or the disk blocks are misaligned. + */ + if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) { + xfs_iunlock(ip, iolock); + dops = &xfs_atomic_write_cow_iomap_ops; + goto retry; + } + +out_unlock: + if (iolock) + xfs_iunlock(ip, iolock); return ret; } @@ -828,18 +901,12 @@ xfs_file_dio_write( if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; - /* - * For always COW inodes we also must check the alignment of each - * individual iovec segment, as they could end up with different - * I/Os due to the way bio_iov_iter_get_pages works, and we'd - * then overwrite an already written block. - */ - if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) || - (xfs_is_always_cow_inode(ip) && - (iov_iter_alignment(from) & ip->i_mount->m_blockmask))) + if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask) return xfs_file_dio_write_unaligned(ip, iocb, from); if (xfs_is_zoned_inode(ip)) return xfs_file_dio_write_zoned(ip, iocb, from); + if (iocb->ki_flags & IOCB_ATOMIC) + return xfs_file_dio_write_atomic(ip, iocb, from); return xfs_file_dio_write_aligned(ip, iocb, from, &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL); } @@ -908,7 +975,8 @@ write_retry: trace_xfs_file_buffered_write(iocb, from); ret = iomap_file_buffered_write(iocb, from, - &xfs_buffered_write_iomap_ops, NULL); + &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops, + NULL); /* * If we hit a space limit, try to free up some lingering preallocated @@ -961,7 +1029,7 @@ xfs_file_buffered_write_zoned( struct xfs_zone_alloc_ctx ac = { }; ssize_t ret; - ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac); + ret = xfs_zoned_write_space_reserve(mp, iocb, from, XFS_ZR_GREEDY, &ac); if (ret < 0) return ret; @@ -988,7 +1056,8 @@ xfs_file_buffered_write_zoned( retry: trace_xfs_file_buffered_write(iocb, from); ret = iomap_file_buffered_write(iocb, from, - &xfs_buffered_write_iomap_ops, &ac); + &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops, + &ac); if (ret == -ENOSPC && !cleared_space) { /* * Kick off writeback to convert delalloc space and release the @@ -1002,7 +1071,7 @@ retry: out_unlock: xfs_iunlock(ip, iolock); out_unreserve: - xfs_zoned_space_unreserve(ip, &ac); + xfs_zoned_space_unreserve(ip->i_mount, &ac); if (ret > 0) { XFS_STATS_ADD(mp, xs_write_bytes, ret); ret = generic_write_sync(iocb, ret); @@ -1028,23 +1097,21 @@ xfs_file_write_iter( if (xfs_is_shutdown(ip->i_mount)) return -EIO; - if (IS_DAX(inode)) - return xfs_file_dax_write(iocb, from); - if (iocb->ki_flags & IOCB_ATOMIC) { - /* - * Currently only atomic writing of a single FS block is - * supported. It would be possible to atomic write smaller than - * a FS block, but there is no requirement to support this. - * Note that iomap also does not support this yet. - */ - if (ocount != ip->i_mount->m_sb.sb_blocksize) + if (ocount < xfs_get_atomic_write_min(ip)) return -EINVAL; + + if (ocount > xfs_get_atomic_write_max(ip)) + return -EINVAL; + ret = generic_atomic_write_valid(iocb, from); if (ret) return ret; } + if (IS_DAX(inode)) + return xfs_file_dax_write(iocb, from); + if (iocb->ki_flags & IOCB_DIRECT) { /* * Allow a directio write to fall back to a buffered @@ -1174,6 +1241,38 @@ xfs_falloc_insert_range( } /* + * For various operations we need to zero up to one block at each end of + * the affected range. For zoned file systems this will require a space + * allocation, for which we need a reservation ahead of time. + */ +#define XFS_ZONED_ZERO_EDGE_SPACE_RES 2 + +/* + * Zero range implements a full zeroing mechanism but is only used in limited + * situations. It is more efficient to allocate unwritten extents than to + * perform zeroing here, so use an errortag to randomly force zeroing on DEBUG + * kernels for added test coverage. + * + * On zoned file systems, the error is already injected by + * xfs_file_zoned_fallocate, which then reserves the additional space needed. + * We only check for this extra space reservation here. + */ +static inline bool +xfs_falloc_force_zero( + struct xfs_inode *ip, + struct xfs_zone_alloc_ctx *ac) +{ + if (xfs_is_zoned_inode(ip)) { + if (ac->reserved_blocks > XFS_ZONED_ZERO_EDGE_SPACE_RES) { + ASSERT(IS_ENABLED(CONFIG_XFS_DEBUG)); + return true; + } + return false; + } + return XFS_TEST_ERROR(ip->i_mount, XFS_ERRTAG_FORCE_ZERO_RANGE); +} + +/* * Punch a hole and prealloc the range. We use a hole punch rather than * unwritten extent conversion for two reasons: * @@ -1190,23 +1289,29 @@ xfs_falloc_zero_range( struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(file); + struct xfs_inode *ip = XFS_I(inode); unsigned int blksize = i_blocksize(inode); loff_t new_size = 0; int error; - trace_xfs_zero_file_space(XFS_I(inode)); + trace_xfs_zero_file_space(ip); error = xfs_falloc_newsize(file, mode, offset, len, &new_size); if (error) return error; - error = xfs_free_file_space(XFS_I(inode), offset, len, ac); - if (error) - return error; + if (xfs_falloc_force_zero(ip, ac)) { + error = xfs_zero_range(ip, offset, len, ac, NULL); + } else { + error = xfs_free_file_space(ip, offset, len, ac); + if (error) + return error; - len = round_up(offset + len, blksize) - round_down(offset, blksize); - offset = round_down(offset, blksize); - error = xfs_alloc_file_space(XFS_I(inode), offset, len); + len = round_up(offset + len, blksize) - + round_down(offset, blksize); + offset = round_down(offset, blksize); + error = xfs_alloc_file_space(ip, offset, len); + } if (error) return error; return xfs_falloc_setsize(file, new_size); @@ -1266,9 +1371,10 @@ xfs_falloc_allocate_range( } #define XFS_FALLOC_FL_SUPPORTED \ - (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ - FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ - FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE) + (FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE | \ + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | \ + FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE | \ + FALLOC_FL_UNSHARE_RANGE) STATIC long __xfs_file_fallocate( @@ -1342,13 +1448,26 @@ xfs_file_zoned_fallocate( { struct xfs_zone_alloc_ctx ac = { }; struct xfs_inode *ip = XFS_I(file_inode(file)); + struct xfs_mount *mp = ip->i_mount; + xfs_filblks_t count_fsb; int error; - error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac); + /* + * If full zeroing is forced by the error injection knob, we need a + * space reservation that covers the entire range. See the comment in + * xfs_zoned_write_space_reserve for the rationale for the calculation. + * Otherwise just reserve space for the two boundary blocks. + */ + count_fsb = XFS_ZONED_ZERO_EDGE_SPACE_RES; + if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ZERO_RANGE && + XFS_TEST_ERROR(mp, XFS_ERRTAG_FORCE_ZERO_RANGE)) + count_fsb += XFS_B_TO_FSB(mp, len) + 1; + + error = xfs_zoned_space_reserve(mp, count_fsb, XFS_ZR_RESERVED, &ac); if (error) return error; error = __xfs_file_fallocate(file, mode, offset, len, &ac); - xfs_zoned_space_unreserve(ip, &ac); + xfs_zoned_space_unreserve(mp, &ac); return error; } @@ -1488,7 +1607,7 @@ xfs_file_open( if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; - if (xfs_inode_can_atomicwrite(XFS_I(inode))) + if (xfs_get_atomic_write_min(XFS_I(inode)) > 0) file->f_mode |= FMODE_CAN_ATOMIC_WRITE; return generic_file_open(inode, file); } @@ -1660,7 +1779,7 @@ xfs_dax_fault_locked( bool write_fault) { vm_fault_t ret; - pfn_t pfn; + unsigned long pfn; if (!IS_ENABLED(CONFIG_FS_DAX)) { ASSERT(0); @@ -1758,12 +1877,12 @@ xfs_write_fault_zoned( * But as the overallocation is limited to less than a folio and will be * release instantly that's just fine. */ - error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0, - &ac); + error = xfs_zoned_space_reserve(ip->i_mount, + XFS_B_TO_FSB(ip->i_mount, len), 0, &ac); if (error < 0) return vmf_fs_error(error); ret = __xfs_write_fault(vmf, order, &ac); - xfs_zoned_space_unreserve(ip, &ac); + xfs_zoned_space_unreserve(ip->i_mount, &ac); return ret; } @@ -1844,10 +1963,10 @@ static const struct vm_operations_struct xfs_file_vm_ops = { }; STATIC int -xfs_file_mmap( - struct file *file, - struct vm_area_struct *vma) +xfs_file_mmap_prepare( + struct vm_area_desc *desc) { + struct file *file = desc->file; struct inode *inode = file_inode(file); struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode)); @@ -1855,13 +1974,14 @@ xfs_file_mmap( * We don't support synchronous mappings for non-DAX files and * for DAX files if underneath dax_device is not synchronous. */ - if (!daxdev_mapping_supported(vma, target->bt_daxdev)) + if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file), + target->bt_daxdev)) return -EOPNOTSUPP; file_accessed(file); - vma->vm_ops = &xfs_file_vm_ops; + desc->vm_ops = &xfs_file_vm_ops; if (IS_DAX(inode)) - vm_flags_set(vma, VM_HUGEPAGE); + desc->vm_flags |= VM_HUGEPAGE; return 0; } @@ -1876,7 +1996,7 @@ const struct file_operations xfs_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, #endif - .mmap = xfs_file_mmap, + .mmap_prepare = xfs_file_mmap_prepare, .open = xfs_file_open, .release = xfs_file_release, .fsync = xfs_file_fsync, diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index a961aa420c48..044918fbae06 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -304,11 +304,9 @@ xfs_filestream_create_association( * for us, so all we need to do here is take another active reference to * the perag for the cached association. * - * If we fail to store the association, we need to drop the fstrms - * counter as well as drop the perag reference we take here for the - * item. We do not need to return an error for this failure - as long as - * we return a referenced AG, the allocation can still go ahead just - * fine. + * If we fail to store the association, we do not need to return an + * error for this failure - as long as we return a referenced AG, the + * allocation can still go ahead just fine. */ item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!item) @@ -316,14 +314,9 @@ xfs_filestream_create_association( atomic_inc(&pag_group(args->pag)->xg_active_ref); item->pag = args->pag; - error = xfs_mru_cache_insert(mp->m_filestream, pino, &item->mru); - if (error) - goto out_free_item; + xfs_mru_cache_insert(mp->m_filestream, pino, &item->mru); return 0; -out_free_item: - xfs_perag_rele(item->pag); - kfree(item); out_put_fstrms: atomic_dec(&args->pag->pagf_fstrms); return 0; diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index a4bc1642fe56..af68c7de8ee8 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -876,6 +876,7 @@ xfs_getfsmap_rtdev_rmapbt( const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info) { + struct xfs_fsmap key0 = *keys; /* struct copy */ struct xfs_mount *mp = tp->t_mountp; struct xfs_rtgroup *rtg = NULL; struct xfs_btree_cur *bt_cur = NULL; @@ -887,32 +888,46 @@ xfs_getfsmap_rtdev_rmapbt( int error = 0; eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + mp->m_sb.sb_rblocks); - if (keys[0].fmr_physical >= eofs) + if (key0.fmr_physical >= eofs) return 0; + /* + * On zoned filesystems with an internal rt volume, the volume comes + * immediately after the end of the data volume. However, the + * xfs_rtblock_t address space is relative to the start of the data + * device, which means that the first @rtstart fsblocks do not actually + * point anywhere. If a fsmap query comes in with the low key starting + * below @rtstart, report it as "owned by filesystem". + */ rtstart_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart); - if (keys[0].fmr_physical < rtstart_daddr) { + if (xfs_has_zoned(mp) && key0.fmr_physical < rtstart_daddr) { struct xfs_fsmap_irec frec = { .owner = XFS_RMAP_OWN_FS, .len_daddr = rtstart_daddr, }; - /* Adjust the low key if we are continuing from where we left off. */ - if (keys[0].fmr_length > 0) { - info->low_daddr = keys[0].fmr_physical + keys[0].fmr_length; - return 0; + /* + * Adjust the start of the query range if we're picking up from + * a previous round, and only emit the record if we haven't + * already gone past. + */ + key0.fmr_physical += key0.fmr_length; + if (key0.fmr_physical < rtstart_daddr) { + error = xfs_getfsmap_helper(tp, info, &frec); + if (error) + return error; + + key0.fmr_physical = rtstart_daddr; } - /* Fabricate an rmap entry for space occupied by the data dev */ - error = xfs_getfsmap_helper(tp, info, &frec); - if (error) - return error; + /* Zero the other fields to avoid further adjustments. */ + key0.fmr_owner = 0; + key0.fmr_offset = 0; + key0.fmr_length = 0; } - start_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + keys[0].fmr_physical); - end_rtb = xfs_daddr_to_rtb(mp, rtstart_daddr + - min(eofs - 1, keys[1].fmr_physical)); - + start_rtb = xfs_daddr_to_rtb(mp, key0.fmr_physical); + end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical)); info->missing_owner = XFS_FMR_OWN_FREE; /* @@ -920,12 +935,12 @@ xfs_getfsmap_rtdev_rmapbt( * low to the fsmap low key and max out the high key to the end * of the rtgroup. */ - info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); - error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); + info->low.rm_offset = XFS_BB_TO_FSBT(mp, key0.fmr_offset); + error = xfs_fsmap_owner_to_rmap(&info->low, &key0); if (error) return error; - info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, keys[0].fmr_length); - xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); + info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, key0.fmr_length); + xfs_getfsmap_set_irec_flags(&info->low, &key0); /* Adjust the low key if we are continuing from where we left off. */ if (info->low.rm_blockcount == 0) { @@ -1255,9 +1270,7 @@ xfs_getfsmap( * buffer locking abilities to detect cycles in the rmapbt * without deadlocking. */ - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - break; + tp = xfs_trans_alloc_empty(mp); info.dev = handlers[i].dev; info.last = false; diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index f18fec0adf66..566fd663c95b 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -14,8 +14,6 @@ */ xfs_param_t xfs_params = { /* MIN DFLT MAX */ - .sgid_inherit = { 0, 0, 1 }, - .symlink_mode = { 0, 0, 1 }, .panic_mask = { 0, 0, XFS_PTAG_MASK}, .error_level = { 0, 3, 11 }, .syncd_timer = { 1*100, 30*100, 7200*100}, @@ -23,8 +21,6 @@ xfs_param_t xfs_params = { .inherit_sync = { 0, 1, 1 }, .inherit_nodump = { 0, 1, 1 }, .inherit_noatim = { 0, 1, 1 }, - .xfs_buf_timer = { 100/2, 1*100, 30*100 }, - .xfs_buf_age = { 1*100, 15*100, 7200*100}, .inherit_nosym = { 0, 0, 1 }, .rotorstep = { 1, 1, 255 }, .inherit_nodfrg = { 0, 1, 1 }, diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c index f19fce557354..5a3e3bf4e7cc 100644 --- a/fs/xfs/xfs_handle.c +++ b/fs/xfs/xfs_handle.c @@ -233,14 +233,11 @@ xfs_open_by_handle( xfs_fsop_handlereq_t *hreq) { const struct cred *cred = current_cred(); - int error; - int fd; int permflag; - struct file *filp; struct inode *inode; struct dentry *dentry; fmode_t fmode; - struct path path; + struct path path __free(path_put) = {}; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -249,12 +246,11 @@ xfs_open_by_handle( if (IS_ERR(dentry)) return PTR_ERR(dentry); inode = d_inode(dentry); + path.dentry = dentry; /* Restrict xfs_open_by_handle to directories & regular files. */ - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { - error = -EPERM; - goto out_dput; - } + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) + return -EPERM; #if BITS_PER_LONG != 32 hreq->oflags |= O_LARGEFILE; @@ -263,48 +259,30 @@ xfs_open_by_handle( permflag = hreq->oflags; fmode = OPEN_FMODE(permflag); if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && - (fmode & FMODE_WRITE) && IS_APPEND(inode)) { - error = -EPERM; - goto out_dput; - } + (fmode & FMODE_WRITE) && IS_APPEND(inode)) + return -EPERM; - if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) { - error = -EPERM; - goto out_dput; - } + if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) + return -EPERM; /* Can't write directories. */ - if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) { - error = -EISDIR; - goto out_dput; - } + if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) + return -EISDIR; - fd = get_unused_fd_flags(0); - if (fd < 0) { - error = fd; - goto out_dput; - } + path.mnt = mntget(parfilp->f_path.mnt); - path.mnt = parfilp->f_path.mnt; - path.dentry = dentry; - filp = dentry_open(&path, hreq->oflags, cred); - dput(dentry); - if (IS_ERR(filp)) { - put_unused_fd(fd); - return PTR_ERR(filp); - } + FD_PREPARE(fdf, 0, dentry_open(&path, hreq->oflags, cred)); + if (fdf.err) + return fdf.err; if (S_ISREG(inode->i_mode)) { + struct file *filp = fd_prepare_file(fdf); + filp->f_flags |= O_NOATIME; filp->f_mode |= FMODE_NOCMTIME; } - fd_install(fd, filp); - return fd; - - out_dput: - dput(dentry); - return error; + return fd_publish(fdf); } int diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 7c541fb373d5..3c1557fb1cf0 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -285,7 +285,7 @@ xfs_inode_mark_sick( * is not the case here. */ spin_lock(&VFS_I(ip)->i_lock); - VFS_I(ip)->i_state &= ~I_DONTCACHE; + inode_state_clear(VFS_I(ip), I_DONTCACHE); spin_unlock(&VFS_I(ip)->i_lock); } @@ -309,7 +309,7 @@ xfs_inode_mark_corrupt( * is not the case here. */ spin_lock(&VFS_I(ip)->i_lock); - VFS_I(ip)->i_state &= ~I_DONTCACHE; + inode_state_clear(VFS_I(ip), I_DONTCACHE); spin_unlock(&VFS_I(ip)->i_lock); } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 726e29b837e6..23a920437fe4 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -334,7 +334,7 @@ xfs_reinit_inode( dev_t dev = inode->i_rdev; kuid_t uid = inode->i_uid; kgid_t gid = inode->i_gid; - unsigned long state = inode->i_state; + unsigned long state = inode_state_read_once(inode); error = inode_init_always(mp->m_super, inode); @@ -345,7 +345,7 @@ xfs_reinit_inode( inode->i_rdev = dev; inode->i_uid = uid; inode->i_gid = gid; - inode->i_state = state; + inode_state_assign_raw(inode, state); mapping_set_folio_min_order(inode->i_mapping, M_IGEO(mp)->min_folio_order); return error; @@ -358,7 +358,7 @@ xfs_reinit_inode( static int xfs_iget_recycle( struct xfs_perag *pag, - struct xfs_inode *ip) __releases(&ip->i_flags_lock) + struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); @@ -366,20 +366,6 @@ xfs_iget_recycle( trace_xfs_iget_recycle(ip); - if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) - return -EAGAIN; - - /* - * We need to make it look like the inode is being reclaimed to prevent - * the actual reclaim workers from stomping over us while we recycle - * the inode. We can't clear the radix tree tag yet as it requires - * pag_ici_lock to be held exclusive. - */ - ip->i_flags |= XFS_IRECLAIM; - - spin_unlock(&ip->i_flags_lock); - rcu_read_unlock(); - ASSERT(!rwsem_is_locked(&inode->i_rwsem)); error = xfs_reinit_inode(mp, inode); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -411,7 +397,7 @@ xfs_iget_recycle( ip->i_flags |= XFS_INEW; xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); - inode->i_state = I_NEW; + inode_state_assign_raw(inode, I_NEW); spin_unlock(&ip->i_flags_lock); spin_unlock(&pag->pag_ici_lock); @@ -576,10 +562,19 @@ xfs_iget_cache_hit( /* The inode fits the selection criteria; process it. */ if (ip->i_flags & XFS_IRECLAIMABLE) { - /* Drops i_flags_lock and RCU read lock. */ - error = xfs_iget_recycle(pag, ip); - if (error == -EAGAIN) + /* + * We need to make it look like the inode is being reclaimed to + * prevent the actual reclaim workers from stomping over us + * while we recycle the inode. We can't clear the radix tree + * tag yet as it requires pag_ici_lock to be held exclusive. + */ + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) goto out_skip; + ip->i_flags |= XFS_IRECLAIM; + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + + error = xfs_iget_recycle(pag, ip); if (error) return error; } else { @@ -646,8 +641,7 @@ xfs_iget_cache_miss( goto out_destroy; /* - * For version 5 superblocks, if we are initialising a new inode and we - * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can + * For version 5 superblocks, if we are initialising a new inode, we * simply build the new inode core with a random generation number. * * For version 4 (and older) superblocks, log recovery is dependent on @@ -655,8 +649,7 @@ xfs_iget_cache_miss( * value and hence we must also read the inode off disk even when * initializing new inodes. */ - if (xfs_has_v3inodes(mp) && - (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) { + if (xfs_has_v3inodes(mp) && (flags & XFS_IGET_CREATE)) { VFS_I(ip)->i_generation = get_random_u32(); } else { struct xfs_buf *bp; @@ -893,10 +886,7 @@ xfs_metafile_iget( struct xfs_trans *tp; int error; - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - return error; - + tp = xfs_trans_alloc_empty(mp); error = xfs_trans_metafile_iget(tp, ino, metafile_type, ipp); xfs_trans_cancel(tp); return error; @@ -979,7 +969,15 @@ xfs_reclaim_inode( */ if (xlog_is_shutdown(ip->i_mount->m_log)) { xfs_iunpin_wait(ip); + /* + * Avoid a ABBA deadlock on the inode cluster buffer vs + * concurrent xfs_ifree_cluster() trying to mark the inode + * stale. We don't need the inode locked to run the flush abort + * code, but the flush abort needs to lock the cluster buffer. + */ + xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_iflush_shutdown_abort(ip); + xfs_ilock(ip, XFS_ILOCK_EXCL); goto reclaim; } if (xfs_ipincount(ip)) diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 4345db501714..f83ec2bd0583 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -158,7 +158,7 @@ xlog_recover_icreate_commit_pass2( int nbufs; int i; - icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; + icl = (struct xfs_icreate_log *)item->ri_buf[0].iov_base; if (icl->icl_type != XFS_LI_ICREATE) { xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type"); return -EINVAL; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ee3e0f284287..f1f88e48fe22 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -877,6 +877,35 @@ xfs_create_tmpfile( return error; } +static inline int +xfs_projid_differ( + struct xfs_inode *tdp, + struct xfs_inode *sip) +{ + /* + * If we are using project inheritance, we only allow hard link/renames + * creation in our tree when the project IDs are the same; else + * the tree quota mechanism could be circumvented. + */ + if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) && + tdp->i_projid != sip->i_projid)) { + /* + * Project quota setup skips special files which can + * leave inodes in a PROJINHERIT directory without a + * project ID set. We need to allow links to be made + * to these "project-less" inodes because userspace + * expects them to succeed after project ID setup, + * but everything else should be rejected. + */ + if (!special_file(VFS_I(sip)->i_mode) || + sip->i_projid != 0) { + return -EXDEV; + } + } + + return 0; +} + int xfs_link( struct xfs_inode *tdp, @@ -930,27 +959,9 @@ xfs_link( goto error_return; } - /* - * If we are using project inheritance, we only allow hard link - * creation in our tree when the project IDs are the same; else - * the tree quota mechanism could be circumvented. - */ - if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) && - tdp->i_projid != sip->i_projid)) { - /* - * Project quota setup skips special files which can - * leave inodes in a PROJINHERIT directory without a - * project ID set. We need to allow links to be made - * to these "project-less" inodes because userspace - * expects them to succeed after project ID setup, - * but everything else should be rejected. - */ - if (!special_file(VFS_I(sip)->i_mode) || - sip->i_projid != 0) { - error = -EXDEV; - goto error_return; - } - } + error = xfs_projid_differ(tdp, sip); + if (error) + goto error_return; error = xfs_dir_add_child(tp, resblks, &du); if (error) @@ -1035,7 +1046,7 @@ xfs_itruncate_extents_flags( int error = 0; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - if (atomic_read(&VFS_I(ip)->i_count)) + if (icount_read(VFS_I(ip))) xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); ASSERT(new_size <= XFS_ISIZE(ip)); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -1569,7 +1580,7 @@ xfs_iunlink_reload_next( next_ip->i_prev_unlinked = prev_agino; trace_xfs_iunlink_reload_next(next_ip); rele: - ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE)); + ASSERT(!(inode_state_read_once(VFS_I(next_ip)) & I_DONTCACHE)); if (xfs_is_quotacheck_running(mp) && next_ip) xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED); xfs_irele(next_ip); @@ -1635,7 +1646,7 @@ retry: iip = ip->i_itemp; if (__xfs_iflags_test(ip, XFS_IFLUSHING)) { ASSERT(!list_empty(&iip->ili_item.li_bio_list)); - ASSERT(iip->ili_last_fields); + ASSERT(iip->ili_last_fields || xlog_is_shutdown(mp->m_log)); goto out_iunlock; } @@ -1656,7 +1667,6 @@ retry: spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; spin_unlock(&iip->ili_lock); ASSERT(iip->ili_last_fields); @@ -1821,12 +1831,20 @@ static void xfs_iunpin( struct xfs_inode *ip) { - xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); + struct xfs_inode_log_item *iip = ip->i_itemp; + xfs_csn_t seq = 0; trace_xfs_inode_unpin_nowait(ip, _RET_IP_); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); + + spin_lock(&iip->ili_lock); + seq = iip->ili_commit_seq; + spin_unlock(&iip->ili_lock); + if (!seq) + return; /* Give the log a push to start the unpinning I/O */ - xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL); + xfs_log_force_seq(ip->i_mount, seq, 0, NULL); } @@ -2093,7 +2111,7 @@ xfs_rename_alloc_whiteout( */ xfs_setup_iops(tmpfile); xfs_finish_inode_setup(tmpfile); - VFS_I(tmpfile)->i_state |= I_LINKABLE; + inode_state_set_raw(VFS_I(tmpfile), I_LINKABLE); *wip = tmpfile; return 0; @@ -2227,16 +2245,9 @@ retry: if (du_wip.ip) xfs_trans_ijoin(tp, du_wip.ip, 0); - /* - * If we are using project inheritance, we only allow renames - * into our tree when the project IDs are the same; else the - * tree quota mechanism would be circumvented. - */ - if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) && - target_dp->i_projid != src_ip->i_projid)) { - error = -EXDEV; + error = xfs_projid_differ(target_dp, src_ip); + if (error) goto out_trans_cancel; - } /* RENAME_EXCHANGE is unique from here on. */ if (flags & RENAME_EXCHANGE) { @@ -2319,7 +2330,7 @@ retry: * flag from the inode so it doesn't accidentally get misused in * future. */ - VFS_I(du_wip.ip)->i_state &= ~I_LINKABLE; + inode_state_clear_raw(VFS_I(du_wip.ip), I_LINKABLE); } out_commit: @@ -2377,8 +2388,8 @@ xfs_iflush( * error handling as the caller will shutdown and fail the buffer. */ error = -EFSCORRUPTED; - if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), - mp, XFS_ERRTAG_IFLUSH_1)) { + if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_1)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); @@ -2394,29 +2405,27 @@ xfs_iflush( goto flush_out; } } else if (S_ISREG(VFS_I(ip)->i_mode)) { - if (XFS_TEST_ERROR( - ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && - ip->i_df.if_format != XFS_DINODE_FMT_BTREE, - mp, XFS_ERRTAG_IFLUSH_3)) { + if ((ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_3)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad regular inode %llu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto flush_out; } } else if (S_ISDIR(VFS_I(ip)->i_mode)) { - if (XFS_TEST_ERROR( - ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && - ip->i_df.if_format != XFS_DINODE_FMT_BTREE && - ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, - mp, XFS_ERRTAG_IFLUSH_4)) { + if ((ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && + ip->i_df.if_format != XFS_DINODE_FMT_BTREE && + ip->i_df.if_format != XFS_DINODE_FMT_LOCAL) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_4)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: Bad directory inode %llu, ptr "PTR_FMT, __func__, ip->i_ino, ip); goto flush_out; } } - if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) > - ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { + if (ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) > + ip->i_nblocks || XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_5)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: detected corrupt incore inode %llu, " "total extents = %llu nblocks = %lld, ptr "PTR_FMT, @@ -2425,8 +2434,8 @@ xfs_iflush( ip->i_nblocks, ip); goto flush_out; } - if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize, - mp, XFS_ERRTAG_IFLUSH_6)) { + if (ip->i_forkoff > mp->m_sb.sb_inodesize || + XFS_TEST_ERROR(mp, XFS_ERRTAG_IFLUSH_6)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT, __func__, ip->i_ino, ip->i_forkoff, ip); @@ -2502,7 +2511,6 @@ flush_out: spin_lock(&iip->ili_lock); iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; set_bit(XFS_LI_FLUSHING, &iip->ili_item.li_flags); spin_unlock(&iip->ili_lock); @@ -2661,12 +2669,15 @@ int xfs_log_force_inode( struct xfs_inode *ip) { + struct xfs_inode_log_item *iip = ip->i_itemp; xfs_csn_t seq = 0; - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (xfs_ipincount(ip)) - seq = ip->i_itemp->ili_commit_seq; - xfs_iunlock(ip, XFS_ILOCK_SHARED); + if (!iip) + return 0; + + spin_lock(&iip->ili_lock); + seq = iip->ili_commit_seq; + spin_unlock(&iip->ili_lock); if (!seq) return 0; @@ -2932,12 +2943,9 @@ xfs_inode_reload_unlinked( struct xfs_inode *ip) { struct xfs_trans *tp; - int error; - - error = xfs_trans_alloc_empty(ip->i_mount, &tp); - if (error) - return error; + int error = 0; + tp = xfs_trans_alloc_empty(ip->i_mount); xfs_ilock(ip, XFS_ILOCK_SHARED); if (xfs_inode_unlinked_incomplete(ip)) error = xfs_inode_reload_unlinked_bucket(tp, ip); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index eae0159983ca..bd6d33557194 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -356,19 +356,20 @@ static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip) (XFS_IS_REALTIME_INODE(ip) ? \ (ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp) -static inline bool -xfs_inode_can_atomicwrite( - struct xfs_inode *ip) +static inline bool xfs_inode_can_hw_atomic_write(const struct xfs_inode *ip) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_buftarg *target = xfs_inode_buftarg(ip); - - if (mp->m_sb.sb_blocksize < target->bt_bdev_awu_min) + if (IS_DAX(VFS_IC(ip))) return false; - if (mp->m_sb.sb_blocksize > target->bt_bdev_awu_max) + + return xfs_inode_buftarg(ip)->bt_awu_max > 0; +} + +static inline bool xfs_inode_can_sw_atomic_write(const struct xfs_inode *ip) +{ + if (IS_DAX(VFS_IC(ip))) return false; - return true; + return xfs_can_sw_atomic_write(ip->i_mount); } /* diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 40fc1bf900af..2eb0c6011a2e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -113,9 +113,9 @@ xfs_inode_item_precommit( * to log the timestamps, or will clear already cleared fields in the * worst case. */ - if (inode->i_state & I_DIRTY_TIME) { + if (inode_state_read_once(inode) & I_DIRTY_TIME) { spin_lock(&inode->i_lock); - inode->i_state &= ~I_DIRTY_TIME; + inode_state_clear(inode, I_DIRTY_TIME); spin_unlock(&inode->i_lock); } @@ -131,46 +131,28 @@ xfs_inode_item_precommit( } /* - * Inode verifiers do not check that the extent size hint is an integer - * multiple of the rt extent size on a directory with both rtinherit - * and extszinherit flags set. If we're logging a directory that is - * misconfigured in this way, clear the hint. + * Inode verifiers do not check that the extent size hints are an + * integer multiple of the rt extent size on a directory with + * rtinherit flags set. If we're logging a directory that is + * misconfigured in this way, clear the bad hints. */ - if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && - (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && - xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) { - ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | - XFS_DIFLAG_EXTSZINHERIT); - ip->i_extsize = 0; - flags |= XFS_ILOG_CORE; + if (ip->i_diflags & XFS_DIFLAG_RTINHERIT) { + if ((ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && + xfs_extlen_to_rtxmod(ip->i_mount, ip->i_extsize) > 0) { + ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + ip->i_extsize = 0; + flags |= XFS_ILOG_CORE; + } + if ((ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && + xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) { + ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; + ip->i_cowextsize = 0; + flags |= XFS_ILOG_CORE; + } } - /* - * Record the specific change for fdatasync optimisation. This allows - * fdatasync to skip log forces for inodes that are only timestamp - * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it - * to XFS_ILOG_CORE so that the actual on-disk dirty tracking - * (ili_fields) correctly tracks that the version has changed. - */ spin_lock(&iip->ili_lock); - iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION); - if (flags & XFS_ILOG_IVERSION) - flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); - - /* - * Inode verifiers do not check that the CoW extent size hint is an - * integer multiple of the rt extent size on a directory with both - * rtinherit and cowextsize flags set. If we're logging a directory - * that is misconfigured in this way, clear the hint. - */ - if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && - (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) && - xfs_extlen_to_rtxmod(ip->i_mount, ip->i_cowextsize) > 0) { - ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE; - ip->i_cowextsize = 0; - flags |= XFS_ILOG_CORE; - } - if (!iip->ili_item.li_buf) { struct xfs_buf *bp; int error; @@ -205,6 +187,20 @@ xfs_inode_item_precommit( } /* + * Store the dirty flags back into the inode item as this state is used + * later on in xfs_inode_item_committing() to determine whether the + * transaction is relevant to fsync state or not. + */ + iip->ili_dirty_flags = flags; + + /* + * Convert the flags on-disk fields that have been modified in the + * transaction so that ili_fields tracks the changes correctly. + */ + if (flags & XFS_ILOG_IVERSION) + flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); + + /* * Always OR in the bits from the ili_last_fields field. This is to * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines * in the eventual clearing of the ili_fields bits. See the big comment @@ -214,12 +210,6 @@ xfs_inode_item_precommit( spin_unlock(&iip->ili_lock); xfs_inode_item_precommit_check(ip); - - /* - * We are done with the log item transaction dirty state, so clear it so - * that it doesn't pollute future transactions. - */ - iip->ili_dirty_flags = 0; return 0; } @@ -729,13 +719,24 @@ xfs_inode_item_unpin( struct xfs_log_item *lip, int remove) { - struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; trace_xfs_inode_unpin(ip, _RET_IP_); ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE)); ASSERT(atomic_read(&ip->i_pincount) > 0); - if (atomic_dec_and_test(&ip->i_pincount)) + + /* + * If this is the last unpin, then the inode no longer needs a journal + * flush to persist it. Hence we can clear the commit sequence numbers + * as a fsync/fdatasync operation on the inode at this point is a no-op. + */ + if (atomic_dec_and_lock(&ip->i_pincount, &iip->ili_lock)) { + iip->ili_commit_seq = 0; + iip->ili_datasync_seq = 0; + spin_unlock(&iip->ili_lock); wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); + } } STATIC uint @@ -758,11 +759,14 @@ xfs_inode_item_push( * completed and items removed from the AIL before the next push * attempt. */ + trace_xfs_inode_push_stale(ip, _RET_IP_); return XFS_ITEM_PINNED; } - if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp)) + if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp)) { + trace_xfs_inode_push_pinned(ip, _RET_IP_); return XFS_ITEM_PINNED; + } if (xfs_iflags_test(ip, XFS_IFLUSHING)) return XFS_ITEM_FLUSHING; @@ -855,12 +859,45 @@ xfs_inode_item_committed( return lsn; } +/* + * The modification is now complete, so before we unlock the inode we need to + * update the commit sequence numbers for data integrity journal flushes. We + * always record the commit sequence number (ili_commit_seq) so that anything + * that needs a full journal sync will capture all of this modification. + * + * We then + * check if the changes will impact a datasync (O_DSYNC) journal flush. If the + * changes will require a datasync flush, then we also record the sequence in + * ili_datasync_seq. + * + * These commit sequence numbers will get cleared atomically with the inode being + * unpinned (i.e. pin count goes to zero), and so it will only be set when the + * inode is dirty in the journal. This removes the need for checking if the + * inode is pinned to determine if a journal flush is necessary, and hence + * removes the need for holding the ILOCK_SHARED in xfs_file_fsync() to + * serialise pin counts against commit sequence number updates. + * + */ STATIC void xfs_inode_item_committing( struct xfs_log_item *lip, xfs_csn_t seq) { - INODE_ITEM(lip)->ili_commit_seq = seq; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + + spin_lock(&iip->ili_lock); + iip->ili_commit_seq = seq; + if (iip->ili_dirty_flags & ~(XFS_ILOG_IVERSION | XFS_ILOG_TIMESTAMP)) + iip->ili_datasync_seq = seq; + spin_unlock(&iip->ili_lock); + + /* + * Clear the per-transaction dirty flags now that we have finished + * recording the transaction's inode modifications in the CIL and are + * about to release and (maybe) unlock the inode. + */ + iip->ili_dirty_flags = 0; + return xfs_inode_item_release(lip); } @@ -1052,7 +1089,6 @@ xfs_iflush_abort_clean( { iip->ili_last_fields = 0; iip->ili_fields = 0; - iip->ili_fsync_fields = 0; iip->ili_flush_lsn = 0; iip->ili_item.li_buf = NULL; list_del_init(&iip->ili_item.li_bio_list); @@ -1089,13 +1125,7 @@ xfs_iflush_abort( * state. Whilst the inode is in the AIL, it should have a valid buffer * pointer for push operations to access - it is only safe to remove the * inode from the buffer once it has been removed from the AIL. - * - * We also clear the failed bit before removing the item from the AIL - * as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer - * references the inode item owns and needs to hold until we've fully - * aborted the inode log item and detached it from the buffer. */ - clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags); xfs_trans_ail_delete(&iip->ili_item, 0); /* @@ -1185,12 +1215,12 @@ xfs_iflush_shutdown_abort( */ int xfs_inode_item_format_convert( - struct xfs_log_iovec *buf, + struct kvec *buf, struct xfs_inode_log_format *in_f) { - struct xfs_inode_log_format_32 *in_f32 = buf->i_addr; + struct xfs_inode_log_format_32 *in_f32 = buf->iov_base; - if (buf->i_len != sizeof(*in_f32)) { + if (buf->iov_len != sizeof(*in_f32)) { XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 377e06007804..2ddcca41714f 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -32,9 +32,17 @@ struct xfs_inode_log_item { spinlock_t ili_lock; /* flush state lock */ unsigned int ili_last_fields; /* fields when flushed */ unsigned int ili_fields; /* fields to be logged */ - unsigned int ili_fsync_fields; /* logged since last fsync */ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ + + /* + * We record the sequence number for every inode modification, as + * well as those that only require fdatasync operations for data + * integrity. This allows optimisation of the O_DSYNC/fdatasync path + * without needing to track what modifications the journal is currently + * carrying for the inode. These are protected by the above ili_lock. + */ xfs_csn_t ili_commit_seq; /* last transaction commit */ + xfs_csn_t ili_datasync_seq; /* for datasync optimisation */ }; static inline int xfs_inode_clean(struct xfs_inode *ip) @@ -46,8 +54,8 @@ extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); extern void xfs_iflush_abort(struct xfs_inode *); extern void xfs_iflush_shutdown_abort(struct xfs_inode *); -extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, - struct xfs_inode_log_format *); +int xfs_inode_item_format_convert(struct kvec *buf, + struct xfs_inode_log_format *in_f); extern struct kmem_cache *xfs_ili_cache; diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 7205fd14f6b3..9d1999d41be1 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -30,13 +30,13 @@ xlog_recover_inode_ra_pass2( struct xlog *log, struct xlog_recover_item *item) { - if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { - struct xfs_inode_log_format *ilfp = item->ri_buf[0].i_addr; + if (item->ri_buf[0].iov_len == sizeof(struct xfs_inode_log_format)) { + struct xfs_inode_log_format *ilfp = item->ri_buf[0].iov_base; xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len, &xfs_inode_buf_ra_ops); } else { - struct xfs_inode_log_format_32 *ilfp = item->ri_buf[0].i_addr; + struct xfs_inode_log_format_32 *ilfp = item->ri_buf[0].iov_base; xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len, &xfs_inode_buf_ra_ops); @@ -326,8 +326,8 @@ xlog_recover_inode_commit_pass2( int need_free = 0; xfs_failaddr_t fa; - if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { - in_f = item->ri_buf[0].i_addr; + if (item->ri_buf[0].iov_len == sizeof(struct xfs_inode_log_format)) { + in_f = item->ri_buf[0].iov_base; } else { in_f = kmalloc(sizeof(struct xfs_inode_log_format), GFP_KERNEL | __GFP_NOFAIL); @@ -366,7 +366,7 @@ xlog_recover_inode_commit_pass2( error = -EFSCORRUPTED; goto out_release; } - ldip = item->ri_buf[1].i_addr; + ldip = item->ri_buf[1].iov_base; if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) { xfs_alert(mp, "%s: Bad inode log record, rec ptr "PTR_FMT", ino %lld", @@ -472,12 +472,12 @@ xlog_recover_inode_commit_pass2( goto out_release; } isize = xfs_log_dinode_size(mp); - if (unlikely(item->ri_buf[1].i_len > isize)) { + if (unlikely(item->ri_buf[1].iov_len > isize)) { XFS_CORRUPTION_ERROR("Bad log dinode size", XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); xfs_alert(mp, - "Bad inode 0x%llx log dinode size 0x%x", - in_f->ilf_ino, item->ri_buf[1].i_len); + "Bad inode 0x%llx log dinode size 0x%zx", + in_f->ilf_ino, item->ri_buf[1].iov_len); error = -EFSCORRUPTED; goto out_release; } @@ -500,8 +500,8 @@ xlog_recover_inode_commit_pass2( if (in_f->ilf_size == 2) goto out_owner_change; - len = item->ri_buf[2].i_len; - src = item->ri_buf[2].i_addr; + len = item->ri_buf[2].iov_len; + src = item->ri_buf[2].iov_base; ASSERT(in_f->ilf_size <= 4); ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); ASSERT(!(fields & XFS_ILOG_DFORK) || @@ -538,8 +538,8 @@ xlog_recover_inode_commit_pass2( } else { attr_index = 2; } - len = item->ri_buf[attr_index].i_len; - src = item->ri_buf[attr_index].i_addr; + len = item->ri_buf[attr_index].iov_len; + src = item->ri_buf[attr_index].iov_base; ASSERT(len == xlog_calc_iovec_len(in_f->ilf_asize)); switch (in_f->ilf_fields & XFS_ILOG_AFORK) { diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index d250f7f74e3b..59eaad774371 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -219,7 +219,7 @@ xfs_bulk_ireq_setup( else if (XFS_INO_TO_AGNO(mp, breq->startino) < hdr->agno) return -EINVAL; - breq->flags |= XFS_IBULK_SAME_AG; + breq->iwalk_flags |= XFS_IWALK_SAME_AG; /* Asking for an inode past the end of the AG? We're done! */ if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno) @@ -444,7 +444,7 @@ static void xfs_fill_fsxattr( struct xfs_inode *ip, int whichfork, - struct fileattr *fa) + struct file_kattr *fa) { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); @@ -496,7 +496,7 @@ xfs_ioc_fsgetxattra( xfs_inode_t *ip, void __user *arg) { - struct fileattr fa; + struct file_kattr fa; xfs_ilock(ip, XFS_ILOCK_SHARED); xfs_fill_fsxattr(ip, XFS_ATTR_FORK, &fa); @@ -508,13 +508,10 @@ xfs_ioc_fsgetxattra( int xfs_fileattr_get( struct dentry *dentry, - struct fileattr *fa) + struct file_kattr *fa) { struct xfs_inode *ip = XFS_I(d_inode(dentry)); - if (d_is_special(dentry)) - return -ENOTTY; - xfs_ilock(ip, XFS_ILOCK_SHARED); xfs_fill_fsxattr(ip, XFS_DATA_FORK, fa); xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -526,7 +523,7 @@ static int xfs_ioctl_setattr_xflags( struct xfs_trans *tp, struct xfs_inode *ip, - struct fileattr *fa) + struct file_kattr *fa) { struct xfs_mount *mp = ip->i_mount; bool rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME); @@ -582,7 +579,7 @@ xfs_ioctl_setattr_xflags( static void xfs_ioctl_setattr_prepare_dax( struct xfs_inode *ip, - struct fileattr *fa) + struct file_kattr *fa) { struct xfs_mount *mp = ip->i_mount; struct inode *inode = VFS_I(ip); @@ -642,7 +639,7 @@ out_error: static int xfs_ioctl_setattr_check_extsize( struct xfs_inode *ip, - struct fileattr *fa) + struct file_kattr *fa) { struct xfs_mount *mp = ip->i_mount; xfs_failaddr_t failaddr; @@ -684,7 +681,7 @@ xfs_ioctl_setattr_check_extsize( static int xfs_ioctl_setattr_check_cowextsize( struct xfs_inode *ip, - struct fileattr *fa) + struct file_kattr *fa) { struct xfs_mount *mp = ip->i_mount; xfs_failaddr_t failaddr; @@ -709,7 +706,7 @@ xfs_ioctl_setattr_check_cowextsize( static int xfs_ioctl_setattr_check_projid( struct xfs_inode *ip, - struct fileattr *fa) + struct file_kattr *fa) { if (!fa->fsx_valid) return 0; @@ -725,7 +722,7 @@ int xfs_fileattr_set( struct mnt_idmap *idmap, struct dentry *dentry, - struct fileattr *fa) + struct file_kattr *fa) { struct xfs_inode *ip = XFS_I(d_inode(dentry)); struct xfs_mount *mp = ip->i_mount; @@ -736,9 +733,6 @@ xfs_fileattr_set( trace_xfs_ioctl_setattr(ip); - if (d_is_special(dentry)) - return -ENOTTY; - if (!fa->fsx_valid) { if (fa->flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL | FS_NODUMP_FL | @@ -990,9 +984,8 @@ xfs_ioc_getlabel( BUILD_BUG_ON(sizeof(sbp->sb_fname) > FSLABEL_MAX); /* 1 larger than sb_fname, so this ensures a trailing NUL char */ - memset(label, 0, sizeof(label)); spin_lock(&mp->m_sb_lock); - strncpy(label, sbp->sb_fname, XFSLABEL_MAX); + memtostr_pad(label, sbp->sb_fname); spin_unlock(&mp->m_sb_lock); if (copy_to_user(user_label, label, sizeof(label))) @@ -1210,21 +1203,21 @@ xfs_file_ioctl( current->comm); return -ENOTTY; case XFS_IOC_DIOINFO: { - struct xfs_buftarg *target = xfs_inode_buftarg(ip); + struct kstat st; struct dioattr da; - da.d_mem = target->bt_logical_sectorsize; + error = vfs_getattr(&filp->f_path, &st, STATX_DIOALIGN, 0); + if (error) + return error; /* - * See xfs_report_dioalign() for an explanation about why this - * reports a value larger than the sector size for COW inodes. + * Some userspace directly feeds the return value to + * posix_memalign, which fails for values that are smaller than + * the pointer size. Round up the value to not break userspace. */ - if (xfs_is_cow_inode(ip)) - da.d_miniosz = xfs_inode_alloc_unitsize(ip); - else - da.d_miniosz = target->bt_logical_sectorsize; + da.d_mem = roundup(st.dio_mem_align, sizeof(void *)); + da.d_miniosz = st.dio_offset_align; da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); - if (copy_to_user(arg, &da, sizeof(da))) return -EFAULT; return 0; @@ -1415,10 +1408,8 @@ xfs_file_ioctl( trace_xfs_ioc_free_eofblocks(mp, &icw, _RET_IP_); - sb_start_write(mp->m_super); - error = xfs_blockgc_free_space(mp, &icw); - sb_end_write(mp->m_super); - return error; + guard(super_write)(mp->m_super); + return xfs_blockgc_free_space(mp, &icw); } case XFS_IOC_EXCHANGE_RANGE: diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h index 12124946f347..f5ed5cf9d3df 100644 --- a/fs/xfs/xfs_ioctl.h +++ b/fs/xfs/xfs_ioctl.h @@ -17,13 +17,13 @@ xfs_ioc_swapext( extern int xfs_fileattr_get( struct dentry *dentry, - struct fileattr *fa); + struct file_kattr *fa); extern int xfs_fileattr_set( struct mnt_idmap *idmap, struct dentry *dentry, - struct fileattr *fa); + struct file_kattr *fa); extern long xfs_file_ioctl( diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index cb23c8871f81..04f39ea15898 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -79,6 +79,9 @@ xfs_iomap_valid( { struct xfs_inode *ip = XFS_I(inode); + if (iomap->type == IOMAP_HOLE) + return true; + if (iomap->validity_cookie != xfs_iomap_inode_sequence(ip, iomap->flags)) { trace_xfs_iomap_invalid(ip, iomap); @@ -89,7 +92,7 @@ xfs_iomap_valid( return true; } -static const struct iomap_folio_ops xfs_iomap_folio_ops = { +const struct iomap_write_ops xfs_iomap_write_ops = { .iomap_valid = xfs_iomap_valid, }; @@ -146,12 +149,20 @@ xfs_bmbt_to_iomap( iomap->bdev = target->bt_bdev; iomap->flags = iomap_flags; - if (xfs_ipincount(ip) && - (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) - iomap->flags |= IOMAP_F_DIRTY; + /* + * If the inode is dirty for datasync purposes, let iomap know so it + * doesn't elide the IO completion journal flushes on O_DSYNC IO. + */ + if (ip->i_itemp) { + struct xfs_inode_log_item *iip = ip->i_itemp; + + spin_lock(&iip->ili_lock); + if (iip->ili_datasync_seq) + iomap->flags |= IOMAP_F_DIRTY; + spin_unlock(&iip->ili_lock); + } iomap->validity_cookie = sequence_cookie; - iomap->folio_ops = &xfs_iomap_folio_ops; return 0; } @@ -798,6 +809,38 @@ imap_spans_range( return true; } +static bool +xfs_bmap_hw_atomic_write_possible( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + xfs_fileoff_t offset_fsb, + xfs_fileoff_t end_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fsize_t len = XFS_FSB_TO_B(mp, end_fsb - offset_fsb); + + /* + * atomic writes are required to be naturally aligned for disk blocks, + * which ensures that we adhere to block layer rules that we won't + * straddle any boundary or violate write alignment requirement. + */ + if (!IS_ALIGNED(imap->br_startblock, imap->br_blockcount)) + return false; + + /* + * Spanning multiple extents would mean that multiple BIOs would be + * issued, and so would lose atomicity required for REQ_ATOMIC-based + * atomics. + */ + if (!imap_spans_range(imap, offset_fsb, end_fsb)) + return false; + + /* + * The ->iomap_begin caller should ensure this, but check anyway. + */ + return len <= xfs_inode_buftarg(ip)->bt_awu_max; +} + static int xfs_direct_write_iomap_begin( struct inode *inode, @@ -812,9 +855,11 @@ xfs_direct_write_iomap_begin( struct xfs_bmbt_irec imap, cmap; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); + xfs_fileoff_t orig_end_fsb = end_fsb; int nimaps = 1, error = 0; bool shared = false; u16 iomap_flags = 0; + bool needs_alloc; unsigned int lockmode; u64 seq; @@ -875,13 +920,37 @@ relock: (flags & IOMAP_DIRECT) || IS_DAX(inode)); if (error) goto out_unlock; - if (shared) + if (shared) { + if ((flags & IOMAP_ATOMIC) && + !xfs_bmap_hw_atomic_write_possible(ip, &cmap, + offset_fsb, end_fsb)) { + error = -ENOPROTOOPT; + goto out_unlock; + } goto out_found_cow; + } end_fsb = imap.br_startoff + imap.br_blockcount; length = XFS_FSB_TO_B(mp, end_fsb) - offset; } - if (imap_needs_alloc(inode, flags, &imap, nimaps)) + needs_alloc = imap_needs_alloc(inode, flags, &imap, nimaps); + + if (flags & IOMAP_ATOMIC) { + error = -ENOPROTOOPT; + /* + * If we allocate less than what is required for the write + * then we may end up with multiple extents, which means that + * REQ_ATOMIC-based cannot be used, so avoid this possibility. + */ + if (needs_alloc && orig_end_fsb - offset_fsb > 1) + goto out_unlock; + + if (!xfs_bmap_hw_atomic_write_possible(ip, &imap, offset_fsb, + orig_end_fsb)) + goto out_unlock; + } + + if (needs_alloc) goto allocate_blocks; /* @@ -1022,6 +1091,190 @@ const struct iomap_ops xfs_zoned_direct_write_iomap_ops = { }; #endif /* CONFIG_XFS_RT */ +#ifdef DEBUG +static void +xfs_check_atomic_cow_conversion( + struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, + xfs_filblks_t count_fsb, + const struct xfs_bmbt_irec *cmap) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec cmap2 = { }; + + if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap2)) + xfs_trim_extent(&cmap2, offset_fsb, count_fsb); + + ASSERT(cmap2.br_startoff == cmap->br_startoff); + ASSERT(cmap2.br_blockcount == cmap->br_blockcount); + ASSERT(cmap2.br_startblock == cmap->br_startblock); + ASSERT(cmap2.br_state == cmap->br_state); +} +#else +# define xfs_check_atomic_cow_conversion(...) ((void)0) +#endif + +static int +xfs_atomic_write_cow_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap, + struct iomap *srcmap) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + const xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + const xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); + const xfs_filblks_t count_fsb = end_fsb - offset_fsb; + xfs_filblks_t hole_count_fsb; + int nmaps = 1; + xfs_filblks_t resaligned; + struct xfs_bmbt_irec cmap; + struct xfs_iext_cursor icur; + struct xfs_trans *tp; + unsigned int dblocks = 0, rblocks = 0; + int error; + u64 seq; + + ASSERT(flags & IOMAP_WRITE); + ASSERT(flags & IOMAP_DIRECT); + + if (xfs_is_shutdown(mp)) + return -EIO; + + if (!xfs_can_sw_atomic_write(mp)) { + ASSERT(xfs_can_sw_atomic_write(mp)); + return -EINVAL; + } + + /* blocks are always allocated in this path */ + if (flags & IOMAP_NOWAIT) + return -EAGAIN; + + trace_xfs_iomap_atomic_write_cow(ip, offset, length); +retry: + xfs_ilock(ip, XFS_ILOCK_EXCL); + + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } + + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) + cmap.br_startoff = end_fsb; + if (cmap.br_startoff <= offset_fsb) { + if (isnullstartblock(cmap.br_startblock)) + goto convert_delay; + + /* + * cmap could extend outside the write range due to previous + * speculative preallocations. We must trim cmap to the write + * range because the cow fork treats written mappings to mean + * "write in progress". + */ + xfs_trim_extent(&cmap, offset_fsb, count_fsb); + goto found; + } + + hole_count_fsb = cmap.br_startoff - offset_fsb; + + resaligned = xfs_aligned_fsb_count(offset_fsb, hole_count_fsb, + xfs_get_cowextsz_hint(ip)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + if (XFS_IS_REALTIME_INODE(ip)) { + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + rblocks = resaligned; + } else { + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + rblocks = 0; + } + + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks, + rblocks, false, &tp); + if (error) + return error; + + /* extent layout could have changed since the unlock, so check again */ + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) + cmap.br_startoff = end_fsb; + if (cmap.br_startoff <= offset_fsb) { + xfs_trans_cancel(tp); + if (isnullstartblock(cmap.br_startblock)) + goto convert_delay; + xfs_trim_extent(&cmap, offset_fsb, count_fsb); + goto found; + } + + /* + * Allocate the entire reservation as unwritten blocks. + * + * Use XFS_BMAPI_EXTSZALIGN to hint at aligning new extents according to + * extszhint, such that there will be a greater chance that future + * atomic writes to that same range will be aligned (and don't require + * this COW-based method). + */ + error = xfs_bmapi_write(tp, ip, offset_fsb, hole_count_fsb, + XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC | + XFS_BMAPI_EXTSZALIGN, 0, &cmap, &nmaps); + if (error) { + xfs_trans_cancel(tp); + goto out_unlock; + } + + xfs_inode_set_cowblocks_tag(ip); + error = xfs_trans_commit(tp); + if (error) + goto out_unlock; + + /* + * cmap could map more blocks than the range we passed into bmapi_write + * because of EXTSZALIGN or adjacent pre-existing unwritten mappings + * that were merged. Trim cmap to the original write range so that we + * don't convert more than we were asked to do for this write. + */ + xfs_trim_extent(&cmap, offset_fsb, count_fsb); + +found: + if (cmap.br_state != XFS_EXT_NORM) { + error = xfs_reflink_convert_cow_locked(ip, cmap.br_startoff, + cmap.br_blockcount); + if (error) + goto out_unlock; + cmap.br_state = XFS_EXT_NORM; + xfs_check_atomic_cow_conversion(ip, offset_fsb, count_fsb, + &cmap); + } + + trace_xfs_iomap_found(ip, offset, length, XFS_COW_FORK, &cmap); + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); + +convert_delay: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + error = xfs_bmapi_convert_delalloc(ip, XFS_COW_FORK, offset, iomap, + NULL); + if (error) + return error; + + /* + * Try the lookup again, because the delalloc conversion might have + * turned the COW mapping into unwritten, but we need it to be in + * written state. + */ + goto retry; +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +const struct iomap_ops xfs_atomic_write_cow_iomap_ops = { + .iomap_begin = xfs_atomic_write_cow_iomap_begin, +}; + static int xfs_dax_write_iomap_end( struct inode *inode, @@ -1366,7 +1619,7 @@ xfs_zoned_buffered_write_iomap_begin( return error; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, XFS_DATA_FORK); error = -EFSCORRUPTED; goto out_unlock; @@ -1505,6 +1758,8 @@ xfs_buffered_write_iomap_begin( struct iomap *iomap, struct iomap *srcmap) { + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, + iomap); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); @@ -1540,7 +1795,7 @@ xfs_buffered_write_iomap_begin( return error; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + XFS_TEST_ERROR(mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, XFS_DATA_FORK); error = -EFSCORRUPTED; goto out_unlock; @@ -1570,21 +1825,41 @@ xfs_buffered_write_iomap_begin( } /* - * For zeroing, trim a delalloc extent that extends beyond the EOF - * block. If it starts beyond the EOF block, convert it to an + * For zeroing, trim extents that extend beyond the EOF block. If a + * delalloc extent starts beyond the EOF block, convert it to an * unwritten extent. */ - if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb && - isnullstartblock(imap.br_startblock)) { + if (flags & IOMAP_ZERO) { xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); + u64 end; - if (offset_fsb >= eof_fsb) + if (isnullstartblock(imap.br_startblock) && + offset_fsb >= eof_fsb) goto convert_delay; - if (end_fsb > eof_fsb) { + if (offset_fsb < eof_fsb && end_fsb > eof_fsb) end_fsb = eof_fsb; - xfs_trim_extent(&imap, offset_fsb, - end_fsb - offset_fsb); + + /* + * Look up dirty folios for unwritten mappings within EOF. + * Providing this bypasses the flush iomap uses to trigger + * extent conversion when unwritten mappings have dirty + * pagecache in need of zeroing. + * + * Trim the mapping to the end pos of the lookup, which in turn + * was trimmed to the end of the batch if it became full before + * the end of the mapping. + */ + if (imap.br_state == XFS_EXT_UNWRITTEN && + offset_fsb < eof_fsb) { + loff_t len = min(count, + XFS_FSB_TO_B(mp, imap.br_blockcount)); + + end = iomap_fill_dirty_folios(iter, offset, len); + end_fsb = min_t(xfs_fileoff_t, end_fsb, + XFS_B_TO_FSB(mp, end)); } + + xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); } /* @@ -2012,7 +2287,8 @@ xfs_zero_range( return dax_zero_range(inode, pos, len, did_zero, &xfs_dax_write_iomap_ops); return iomap_zero_range(inode, pos, len, did_zero, - &xfs_buffered_write_iomap_ops, ac); + &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops, + ac); } int @@ -2028,5 +2304,6 @@ xfs_truncate_page( return dax_truncate_page(inode, pos, did_zero, &xfs_dax_write_iomap_ops); return iomap_truncate_page(inode, pos, did_zero, - &xfs_buffered_write_iomap_ops, ac); + &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops, + ac); } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index d330c4a581b1..ebcce7d49446 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -56,5 +56,7 @@ extern const struct iomap_ops xfs_read_iomap_ops; extern const struct iomap_ops xfs_seek_iomap_ops; extern const struct iomap_ops xfs_xattr_iomap_ops; extern const struct iomap_ops xfs_dax_write_iomap_ops; +extern const struct iomap_ops xfs_atomic_write_cow_iomap_ops; +extern const struct iomap_write_ops xfs_iomap_write_ops; #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 756bd3ca8e00..ad94fbf55014 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -431,14 +431,12 @@ xfs_vn_symlink( struct dentry *dentry, const char *symname) { - struct inode *inode; - struct xfs_inode *cip = NULL; - struct xfs_name name; - int error; - umode_t mode; + struct inode *inode; + struct xfs_inode *cip = NULL; + struct xfs_name name; + int error; + umode_t mode = S_IFLNK | S_IRWXUGO; - mode = S_IFLNK | - (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); error = xfs_dentry_mode_to_name(&name, dentry, mode); if (unlikely(error)) goto out; @@ -601,16 +599,83 @@ xfs_report_dioalign( stat->dio_offset_align = stat->dio_read_offset_align; } +unsigned int +xfs_get_atomic_write_min( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + /* + * If we can complete an atomic write via atomic out of place writes, + * then advertise a minimum size of one fsblock. Without this + * mechanism, we can only guarantee atomic writes up to a single LBA. + * + * If out of place writes are not available, we can guarantee an atomic + * write of exactly one single fsblock if the bdev will make that + * guarantee for us. + */ + if (xfs_inode_can_hw_atomic_write(ip) || + xfs_inode_can_sw_atomic_write(ip)) + return mp->m_sb.sb_blocksize; + + return 0; +} + +unsigned int +xfs_get_atomic_write_max( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + /* + * If out of place writes are not available, we can guarantee an atomic + * write of exactly one single fsblock if the bdev will make that + * guarantee for us. + */ + if (!xfs_inode_can_sw_atomic_write(ip)) { + if (xfs_inode_can_hw_atomic_write(ip)) + return mp->m_sb.sb_blocksize; + return 0; + } + + /* + * If we can complete an atomic write via atomic out of place writes, + * then advertise a maximum size of whatever we can complete through + * that means. Hardware support is reported via max_opt, not here. + */ + if (XFS_IS_REALTIME_INODE(ip)) + return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].awu_max); + return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_AG].awu_max); +} + +unsigned int +xfs_get_atomic_write_max_opt( + struct xfs_inode *ip) +{ + unsigned int awu_max = xfs_get_atomic_write_max(ip); + + /* if the max is 1x block, then just keep behaviour that opt is 0 */ + if (awu_max <= ip->i_mount->m_sb.sb_blocksize) + return 0; + + /* + * Advertise the maximum size of an atomic write that we can tell the + * block device to perform for us. In general the bdev limit will be + * less than our out of place write limit, but we don't want to exceed + * the awu_max. + */ + return min(awu_max, xfs_inode_buftarg(ip)->bt_awu_max); +} + static void xfs_report_atomic_write( struct xfs_inode *ip, struct kstat *stat) { - unsigned int unit_min = 0, unit_max = 0; - - if (xfs_inode_can_atomicwrite(ip)) - unit_min = unit_max = ip->i_mount->m_sb.sb_blocksize; - generic_fill_statx_atomic_writes(stat, unit_min, unit_max); + generic_fill_statx_atomic_writes(stat, + xfs_get_atomic_write_min(ip), + xfs_get_atomic_write_max(ip), + xfs_get_atomic_write_max_opt(ip)); } STATIC int @@ -904,7 +969,7 @@ xfs_setattr_size( * change. */ if (xfs_is_zoned_inode(ip)) { - error = xfs_zoned_space_reserve(ip, 1, + error = xfs_zoned_space_reserve(mp, 1, XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac); if (error) { if (error == -EAGAIN) @@ -932,7 +997,7 @@ xfs_setattr_size( } if (xfs_is_zoned_inode(ip)) - xfs_zoned_space_unreserve(ip, &ac); + xfs_zoned_space_unreserve(mp, &ac); if (error) return error; @@ -1268,6 +1333,8 @@ static const struct inode_operations xfs_symlink_inode_operations = { .setattr = xfs_vn_setattr, .listxattr = xfs_vn_listxattr, .update_time = xfs_vn_update_time, + .fileattr_get = xfs_fileattr_get, + .fileattr_set = xfs_fileattr_set, }; /* Figure out if this file actually supports DAX. */ @@ -1353,7 +1420,7 @@ xfs_setup_inode( bool is_meta = xfs_is_internal_inode(ip); inode->i_ino = ip->i_ino; - inode->i_state |= I_NEW; + inode_state_set_raw(inode, I_NEW); inode_sb_list_add(inode); /* make the inode look hashed for the writeback code */ diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index 3c1a2605ffd2..0896f6b8b3b8 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -19,5 +19,8 @@ int xfs_inode_init_security(struct inode *inode, struct inode *dir, extern void xfs_setup_inode(struct xfs_inode *ip); extern void xfs_setup_iops(struct xfs_inode *ip); extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init); +unsigned int xfs_get_atomic_write_min(struct xfs_inode *ip); +unsigned int xfs_get_atomic_write_max(struct xfs_inode *ip); +unsigned int xfs_get_atomic_write_max_opt(struct xfs_inode *ip); #endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 1fa1c0564b0c..2aa37a4d2706 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -239,14 +239,10 @@ xfs_bulkstat_one( * Grab an empty transaction so that we can use its recursive buffer * locking abilities to detect cycles in the inobt without deadlocking. */ - error = xfs_trans_alloc_empty(breq->mp, &tp); - if (error) - goto out; - + tp = xfs_trans_alloc_empty(breq->mp); error = xfs_bulkstat_one_int(breq->mp, breq->idmap, tp, breq->startino, &bc); xfs_trans_cancel(tp); -out: kfree(bc.buf); /* @@ -311,7 +307,6 @@ xfs_bulkstat( .breq = breq, }; struct xfs_trans *tp; - unsigned int iwalk_flags = 0; int error; if (breq->idmap != &nop_mnt_idmap) { @@ -331,17 +326,10 @@ xfs_bulkstat( * Grab an empty transaction so that we can use its recursive buffer * locking abilities to detect cycles in the inobt without deadlocking. */ - error = xfs_trans_alloc_empty(breq->mp, &tp); - if (error) - goto out; - - if (breq->flags & XFS_IBULK_SAME_AG) - iwalk_flags |= XFS_IWALK_SAME_AG; - - error = xfs_iwalk(breq->mp, tp, breq->startino, iwalk_flags, + tp = xfs_trans_alloc_empty(breq->mp); + error = xfs_iwalk(breq->mp, tp, breq->startino, breq->iwalk_flags, xfs_bulkstat_iwalk, breq->icount, &bc); xfs_trans_cancel(tp); -out: kfree(bc.buf); /* @@ -464,14 +452,10 @@ xfs_inumbers( * Grab an empty transaction so that we can use its recursive buffer * locking abilities to detect cycles in the inobt without deadlocking. */ - error = xfs_trans_alloc_empty(breq->mp, &tp); - if (error) - goto out; - - error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->flags, + tp = xfs_trans_alloc_empty(breq->mp); + error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->iwalk_flags, xfs_inumbers_walk, breq->icount, &ic); xfs_trans_cancel(tp); -out: /* * We found some inode groups, so clear the error status and return diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index f10e8f8f2335..2d0612f14d6e 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -13,17 +13,15 @@ struct xfs_ibulk { xfs_ino_t startino; /* start with this inode */ unsigned int icount; /* number of elements in ubuffer */ unsigned int ocount; /* number of records returned */ - unsigned int flags; /* see XFS_IBULK_FLAG_* */ + unsigned int flags; /* XFS_IBULK_FLAG_* */ + unsigned int iwalk_flags; /* XFS_IWALK_FLAG_* */ }; -/* Only iterate within the same AG as startino */ -#define XFS_IBULK_SAME_AG (1U << 0) - /* Fill out the bs_extents64 field if set. */ -#define XFS_IBULK_NREXT64 (1U << 1) +#define XFS_IBULK_NREXT64 (1U << 0) /* Signal that we can return metadata directories. */ -#define XFS_IBULK_METADIR (1U << 2) +#define XFS_IBULK_METADIR (1U << 1) /* * Advance the user buffer pointer by one record of the given size. If the diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 7db3ece370b1..c1c31d1a8e21 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -377,11 +377,8 @@ xfs_iwalk_run_callbacks( if (!has_more) return 0; - if (iwag->drop_trans) { - error = xfs_trans_alloc_empty(mp, &iwag->tp); - if (error) - return error; - } + if (iwag->drop_trans) + iwag->tp = xfs_trans_alloc_empty(mp); /* ...and recreate the cursor just past where we left off. */ error = xfs_ialloc_read_agi(iwag->pag, iwag->tp, 0, agi_bpp); @@ -617,9 +614,7 @@ xfs_iwalk_ag_work( * Grab an empty transaction so that we can use its recursive buffer * locking abilities to detect cycles in the inobt without deadlocking. */ - error = xfs_trans_alloc_empty(mp, &iwag->tp); - if (error) - goto out; + iwag->tp = xfs_trans_alloc_empty(mp); iwag->drop_trans = 1; error = xfs_iwalk_ag(iwag); diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 9a2221b4aa21..4dd747bdbcca 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -89,8 +89,6 @@ typedef __u32 xfs_nlink_t; #undef XFS_NATIVE_HOST #endif -#define irix_sgid_inherit xfs_params.sgid_inherit.val -#define irix_symlink_mode xfs_params.symlink_mode.val #define xfs_panic_mask xfs_params.panic_mask.val #define xfs_error_level xfs_params.error_level.val #define xfs_syncd_centisecs xfs_params.syncd_timer.val diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 6493bdb57351..a311385b23d8 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -109,14 +109,14 @@ xlog_prepare_iovec( vec = &lv->lv_iovecp[0]; } - len = lv->lv_buf_len + sizeof(struct xlog_op_header); + len = lv->lv_buf_used + sizeof(struct xlog_op_header); if (!IS_ALIGNED(len, sizeof(uint64_t))) { - lv->lv_buf_len = round_up(len, sizeof(uint64_t)) - + lv->lv_buf_used = round_up(len, sizeof(uint64_t)) - sizeof(struct xlog_op_header); } vec->i_type = type; - vec->i_addr = lv->lv_buf + lv->lv_buf_len; + vec->i_addr = lv->lv_buf + lv->lv_buf_used; oph = vec->i_addr; oph->oh_clientid = XFS_TRANSACTION; @@ -534,8 +534,8 @@ xlog_state_release_iclog( */ if ((iclog->ic_state == XLOG_STATE_WANT_SYNC || (iclog->ic_flags & XLOG_ICL_NEED_FUA)) && - !iclog->ic_header.h_tail_lsn) { - iclog->ic_header.h_tail_lsn = + !iclog->ic_header->h_tail_lsn) { + iclog->ic_header->h_tail_lsn = cpu_to_be64(atomic64_read(&log->l_tail_lsn)); } @@ -969,8 +969,8 @@ xfs_log_unmount_write( * counters will be recalculated. Refer to xlog_check_unmount_rec for * more details. */ - if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp, - XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { + if (xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS) || + XFS_TEST_ERROR(mp, XFS_ERRTAG_FORCE_SUMMARY_RECALC)) { xfs_alert(mp, "%s: will fix summary counters at next mount", __func__); return; @@ -1240,7 +1240,7 @@ xlog_ioend_work( /* * Race to shutdown the filesystem if we see an error. */ - if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { + if (error || XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { xfs_alert(log->l_mp, "log I/O error %d", error); xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } @@ -1279,11 +1279,12 @@ xlog_get_iclog_buffer_size( log->l_iclog_size = mp->m_logbsize; /* - * # headers = size / 32k - one header holds cycles from 32k of data. + * Combined size of the log record headers. The first 32k cycles + * are stored directly in the xlog_rec_header, the rest in the + * variable number of xlog_rec_ext_headers at its end. */ - log->l_iclog_heads = - DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE); - log->l_iclog_hsize = log->l_iclog_heads << BBSHIFT; + log->l_iclog_hsize = struct_size(log->l_iclog->ic_header, h_ext, + DIV_ROUND_UP(mp->m_logbsize, XLOG_HEADER_CYCLE_SIZE) - 1); } void @@ -1367,9 +1368,8 @@ xlog_alloc_log( int num_bblks) { struct xlog *log; - xlog_rec_header_t *head; - xlog_in_core_t **iclogp; - xlog_in_core_t *iclog, *prev_iclog=NULL; + struct xlog_in_core **iclogp; + struct xlog_in_core *iclog, *prev_iclog = NULL; int i; int error = -ENOMEM; uint log2_size = 0; @@ -1436,13 +1436,6 @@ xlog_alloc_log( init_waitqueue_head(&log->l_flush_wait); iclogp = &log->l_iclog; - /* - * The amount of memory to allocate for the iclog structure is - * rather funky due to the way the structure is defined. It is - * done this way so that we can use different sizes for machines - * with different amounts of memory. See the definition of - * xlog_in_core_t in xfs_log_priv.h for details. - */ ASSERT(log->l_iclog_size >= 4096); for (i = 0; i < log->l_iclog_bufs; i++) { size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * @@ -1457,26 +1450,25 @@ xlog_alloc_log( iclog->ic_prev = prev_iclog; prev_iclog = iclog; - iclog->ic_data = kvzalloc(log->l_iclog_size, + iclog->ic_header = kvzalloc(log->l_iclog_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL); - if (!iclog->ic_data) + if (!iclog->ic_header) goto out_free_iclog; - head = &iclog->ic_header; - memset(head, 0, sizeof(xlog_rec_header_t)); - head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); - head->h_version = cpu_to_be32( + iclog->ic_header->h_magicno = + cpu_to_be32(XLOG_HEADER_MAGIC_NUM); + iclog->ic_header->h_version = cpu_to_be32( xfs_has_logv2(log->l_mp) ? 2 : 1); - head->h_size = cpu_to_be32(log->l_iclog_size); - /* new fields */ - head->h_fmt = cpu_to_be32(XLOG_FMT); - memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); + iclog->ic_header->h_size = cpu_to_be32(log->l_iclog_size); + iclog->ic_header->h_fmt = cpu_to_be32(XLOG_FMT); + memcpy(&iclog->ic_header->h_fs_uuid, &mp->m_sb.sb_uuid, + sizeof(iclog->ic_header->h_fs_uuid)); + iclog->ic_datap = (void *)iclog->ic_header + log->l_iclog_hsize; iclog->ic_size = log->l_iclog_size - log->l_iclog_hsize; iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_log = log; atomic_set(&iclog->ic_refcnt, 0); INIT_LIST_HEAD(&iclog->ic_callbacks); - iclog->ic_datap = (void *)iclog->ic_data + log->l_iclog_hsize; init_waitqueue_head(&iclog->ic_force_wait); init_waitqueue_head(&iclog->ic_write_wait); @@ -1489,8 +1481,7 @@ xlog_alloc_log( log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ log->l_ioend_workqueue = alloc_workqueue("xfs-log/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | - WQ_HIGHPRI), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_PERCPU), 0, mp->m_super->s_id); if (!log->l_ioend_workqueue) goto out_free_iclog; @@ -1505,7 +1496,7 @@ out_destroy_workqueue: out_free_iclog: for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { prev_iclog = iclog->ic_next; - kvfree(iclog->ic_data); + kvfree(iclog->ic_header); kfree(iclog); if (prev_iclog == log->l_iclog) break; @@ -1525,36 +1516,19 @@ xlog_pack_data( struct xlog_in_core *iclog, int roundoff) { - int i, j, k; - int size = iclog->ic_offset + roundoff; - __be32 cycle_lsn; - char *dp; - - cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); + struct xlog_rec_header *rhead = iclog->ic_header; + __be32 cycle_lsn = CYCLE_LSN_DISK(rhead->h_lsn); + char *dp = iclog->ic_datap; + int i; - dp = iclog->ic_datap; - for (i = 0; i < BTOBB(size); i++) { - if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) - break; - iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; + for (i = 0; i < BTOBB(iclog->ic_offset + roundoff); i++) { + *xlog_cycle_data(rhead, i) = *(__be32 *)dp; *(__be32 *)dp = cycle_lsn; dp += BBSIZE; } - if (xfs_has_logv2(log->l_mp)) { - xlog_in_core_2_t *xhdr = iclog->ic_data; - - for ( ; i < BTOBB(size); i++) { - j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; - *(__be32 *)dp = cycle_lsn; - dp += BBSIZE; - } - - for (i = 1; i < log->l_iclog_heads; i++) - xhdr[i].hic_xheader.xh_cycle = cycle_lsn; - } + for (i = 0; i < (log->l_iclog_hsize >> BBSHIFT) - 1; i++) + rhead->h_ext[i].xh_cycle = cycle_lsn; } /* @@ -1568,27 +1542,22 @@ xlog_cksum( struct xlog *log, struct xlog_rec_header *rhead, char *dp, - int size) + unsigned int hdrsize, + unsigned int size) { uint32_t crc; /* first generate the crc for the record header ... */ - crc = xfs_start_cksum_update((char *)rhead, - sizeof(struct xlog_rec_header), + crc = xfs_start_cksum_update((char *)rhead, hdrsize, offsetof(struct xlog_rec_header, h_crc)); /* ... then for additional cycle data for v2 logs ... */ if (xfs_has_logv2(log->l_mp)) { - union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; - int i; - int xheads; - - xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE); + int xheads, i; - for (i = 1; i < xheads; i++) { - crc = crc32c(crc, &xhdr[i].hic_xheader, - sizeof(struct xlog_rec_ext_header)); - } + xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE) - 1; + for (i = 0; i < xheads; i++) + crc = crc32c(crc, &rhead->h_ext[i], XLOG_REC_EXT_SIZE); } /* ... and finally for the payload */ @@ -1607,27 +1576,6 @@ xlog_bio_end_io( &iclog->ic_end_io_work); } -static int -xlog_map_iclog_data( - struct bio *bio, - void *data, - size_t count) -{ - do { - struct page *page = kmem_to_page(data); - unsigned int off = offset_in_page(data); - size_t len = min_t(size_t, count, PAGE_SIZE - off); - - if (bio_add_page(bio, page, len, off) != len) - return -EIO; - - data += len; - count -= len; - } while (count); - - return 0; -} - STATIC void xlog_write_iclog( struct xlog *log, @@ -1693,11 +1641,12 @@ xlog_write_iclog( iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); - if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) - goto shutdown; - - if (is_vmalloc_addr(iclog->ic_data)) - flush_kernel_vmap_range(iclog->ic_data, count); + if (is_vmalloc_addr(iclog->ic_header)) { + if (!bio_add_vmalloc(&iclog->ic_bio, iclog->ic_header, count)) + goto shutdown; + } else { + bio_add_virt_nofail(&iclog->ic_bio, iclog->ic_header, count); + } /* * If this log buffer would straddle the end of the log we will have @@ -1825,20 +1774,20 @@ xlog_sync( size = iclog->ic_offset; if (xfs_has_logv2(log->l_mp)) size += roundoff; - iclog->ic_header.h_len = cpu_to_be32(size); + iclog->ic_header->h_len = cpu_to_be32(size); XFS_STATS_INC(log->l_mp, xs_log_writes); XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count)); - bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)); + bno = BLOCK_LSN(be64_to_cpu(iclog->ic_header->h_lsn)); /* Do we need to split this write into 2 parts? */ if (bno + BTOBB(count) > log->l_logBBsize) - xlog_split_iclog(log, &iclog->ic_header, bno, count); + xlog_split_iclog(log, iclog->ic_header, bno, count); /* calculcate the checksum */ - iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, - iclog->ic_datap, size); + iclog->ic_header->h_crc = xlog_cksum(log, iclog->ic_header, + iclog->ic_datap, XLOG_REC_SIZE, size); /* * Intentionally corrupt the log record CRC based on the error injection * frequency, if defined. This facilitates testing log recovery in the @@ -1847,12 +1796,12 @@ xlog_sync( * detects the bad CRC and attempts to recover. */ #ifdef DEBUG - if (XFS_TEST_ERROR(false, log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) { - iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA); + if (XFS_TEST_ERROR(log->l_mp, XFS_ERRTAG_LOG_BAD_CRC)) { + iclog->ic_header->h_crc &= cpu_to_le32(0xAAAAAAAA); iclog->ic_fail_crc = true; xfs_warn(log->l_mp, "Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.", - be64_to_cpu(iclog->ic_header.h_lsn)); + be64_to_cpu(iclog->ic_header->h_lsn)); } #endif xlog_verify_iclog(log, iclog, count); @@ -1864,10 +1813,10 @@ xlog_sync( */ STATIC void xlog_dealloc_log( - struct xlog *log) + struct xlog *log) { - xlog_in_core_t *iclog, *next_iclog; - int i; + struct xlog_in_core *iclog, *next_iclog; + int i; /* * Destroy the CIL after waiting for iclog IO completion because an @@ -1879,7 +1828,7 @@ xlog_dealloc_log( iclog = log->l_iclog; for (i = 0; i < log->l_iclog_bufs; i++) { next_iclog = iclog->ic_next; - kvfree(iclog->ic_data); + kvfree(iclog->ic_header); kfree(iclog); iclog = next_iclog; } @@ -1901,7 +1850,7 @@ xlog_state_finish_copy( { lockdep_assert_held(&log->l_icloglock); - be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt); + be32_add_cpu(&iclog->ic_header->h_num_logops, record_cnt); iclog->ic_offset += copy_bytes; } @@ -1951,9 +1900,9 @@ xlog_print_trans( if (!lv) continue; xfs_warn(mp, " niovecs = %d", lv->lv_niovecs); - xfs_warn(mp, " size = %d", lv->lv_size); + xfs_warn(mp, " alloc_size = %d", lv->lv_alloc_size); xfs_warn(mp, " bytes = %d", lv->lv_bytes); - xfs_warn(mp, " buf len = %d", lv->lv_buf_len); + xfs_warn(mp, " buf used= %d", lv->lv_buf_used); /* dump each iovec for the log item */ vec = lv->lv_iovecp; @@ -2324,7 +2273,7 @@ xlog_state_activate_iclog( * We don't need to cover the dummy. */ if (*iclogs_changed == 0 && - iclog->ic_header.h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) { + iclog->ic_header->h_num_logops == cpu_to_be32(XLOG_COVER_OPS)) { *iclogs_changed = 1; } else { /* @@ -2336,11 +2285,11 @@ xlog_state_activate_iclog( iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_offset = 0; - iclog->ic_header.h_num_logops = 0; - memset(iclog->ic_header.h_cycle_data, 0, - sizeof(iclog->ic_header.h_cycle_data)); - iclog->ic_header.h_lsn = 0; - iclog->ic_header.h_tail_lsn = 0; + iclog->ic_header->h_num_logops = 0; + memset(iclog->ic_header->h_cycle_data, 0, + sizeof(iclog->ic_header->h_cycle_data)); + iclog->ic_header->h_lsn = 0; + iclog->ic_header->h_tail_lsn = 0; } /* @@ -2432,7 +2381,7 @@ xlog_get_lowest_lsn( iclog->ic_state == XLOG_STATE_DIRTY) continue; - lsn = be64_to_cpu(iclog->ic_header.h_lsn); + lsn = be64_to_cpu(iclog->ic_header->h_lsn); if ((lsn && !lowest_lsn) || XFS_LSN_CMP(lsn, lowest_lsn) < 0) lowest_lsn = lsn; } while ((iclog = iclog->ic_next) != log->l_iclog); @@ -2467,7 +2416,7 @@ xlog_state_iodone_process_iclog( * If this is not the lowest lsn iclog, then we will leave it * for another completion to process. */ - header_lsn = be64_to_cpu(iclog->ic_header.h_lsn); + header_lsn = be64_to_cpu(iclog->ic_header->h_lsn); lowest_lsn = xlog_get_lowest_lsn(log); if (lowest_lsn && XFS_LSN_CMP(lowest_lsn, header_lsn) < 0) return false; @@ -2630,9 +2579,9 @@ xlog_state_get_iclog_space( struct xlog_ticket *ticket, int *logoffsetp) { - int log_offset; - xlog_rec_header_t *head; - xlog_in_core_t *iclog; + int log_offset; + struct xlog_rec_header *head; + struct xlog_in_core *iclog; restart: spin_lock(&log->l_icloglock); @@ -2650,7 +2599,7 @@ restart: goto restart; } - head = &iclog->ic_header; + head = iclog->ic_header; atomic_inc(&iclog->ic_refcnt); /* prevents sync */ log_offset = iclog->ic_offset; @@ -2676,10 +2625,11 @@ restart: * until you know exactly how many bytes get copied. Therefore, wait * until later to update ic_offset. * - * xlog_write() algorithm assumes that at least 2 xlog_op_header_t's + * xlog_write() algorithm assumes that at least 2 xlog_op_header's * can fit into remaining data section. */ - if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { + if (iclog->ic_size - iclog->ic_offset < + 2 * sizeof(struct xlog_op_header)) { int error = 0; xlog_state_switch_iclogs(log, iclog, iclog->ic_size); @@ -2814,7 +2764,7 @@ xlog_state_switch_iclogs( if (!eventual_size) eventual_size = iclog->ic_offset; iclog->ic_state = XLOG_STATE_WANT_SYNC; - iclog->ic_header.h_prev_block = cpu_to_be32(log->l_prev_block); + iclog->ic_header->h_prev_block = cpu_to_be32(log->l_prev_block); log->l_prev_block = log->l_curr_block; log->l_prev_cycle = log->l_curr_cycle; @@ -2858,7 +2808,7 @@ xlog_force_and_check_iclog( struct xlog_in_core *iclog, bool *completed) { - xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn); + xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header->h_lsn); int error; *completed = false; @@ -2870,7 +2820,7 @@ xlog_force_and_check_iclog( * If the iclog has already been completed and reused the header LSN * will have been rewritten by completion */ - if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) + if (be64_to_cpu(iclog->ic_header->h_lsn) != lsn) *completed = true; return 0; } @@ -2888,7 +2838,7 @@ xlog_force_and_check_iclog( * * 1. the current iclog is active and has no data; the previous iclog * is in the active or dirty state. - * 2. the current iclog is drity, and the previous iclog is in the + * 2. the current iclog is dirty, and the previous iclog is in the * active or dirty state. * * We may sleep if: @@ -3003,7 +2953,7 @@ xlog_force_lsn( goto out_error; iclog = log->l_iclog; - while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { + while (be64_to_cpu(iclog->ic_header->h_lsn) != lsn) { trace_xlog_iclog_force_lsn(iclog, _RET_IP_); iclog = iclog->ic_next; if (iclog == log->l_iclog) @@ -3112,16 +3062,16 @@ xfs_log_force_seq( */ void xfs_log_ticket_put( - xlog_ticket_t *ticket) + struct xlog_ticket *ticket) { ASSERT(atomic_read(&ticket->t_ref) > 0); if (atomic_dec_and_test(&ticket->t_ref)) kmem_cache_free(xfs_log_ticket_cache, ticket); } -xlog_ticket_t * +struct xlog_ticket * xfs_log_ticket_get( - xlog_ticket_t *ticket) + struct xlog_ticket *ticket) { ASSERT(atomic_read(&ticket->t_ref) > 0); atomic_inc(&ticket->t_ref); @@ -3173,11 +3123,11 @@ xlog_calc_unit_res( */ /* for trans header */ - unit_bytes += sizeof(xlog_op_header_t); - unit_bytes += sizeof(xfs_trans_header_t); + unit_bytes += sizeof(struct xlog_op_header); + unit_bytes += sizeof(struct xfs_trans_header); /* for start-rec */ - unit_bytes += sizeof(xlog_op_header_t); + unit_bytes += sizeof(struct xlog_op_header); /* * for LR headers - the space for data in an iclog is the size minus @@ -3200,12 +3150,12 @@ xlog_calc_unit_res( num_headers = howmany(unit_bytes, iclog_space); /* for split-recs - ophdrs added when data split over LRs */ - unit_bytes += sizeof(xlog_op_header_t) * num_headers; + unit_bytes += sizeof(struct xlog_op_header) * num_headers; /* add extra header reservations if we overrun */ while (!num_headers || howmany(unit_bytes, iclog_space) > num_headers) { - unit_bytes += sizeof(xlog_op_header_t); + unit_bytes += sizeof(struct xlog_op_header); num_headers++; } unit_bytes += log->l_iclog_hsize * num_headers; @@ -3269,7 +3219,7 @@ xlog_verify_dump_tail( { xfs_alert(log->l_mp, "ran out of log space tail 0x%llx/0x%llx, head lsn 0x%llx, head 0x%x/0x%x, prev head 0x%x/0x%x", - iclog ? be64_to_cpu(iclog->ic_header.h_tail_lsn) : -1, + iclog ? be64_to_cpu(iclog->ic_header->h_tail_lsn) : -1, atomic64_read(&log->l_tail_lsn), log->l_ailp->ail_head_lsn, log->l_curr_cycle, log->l_curr_block, @@ -3288,7 +3238,7 @@ xlog_verify_tail_lsn( struct xlog *log, struct xlog_in_core *iclog) { - xfs_lsn_t tail_lsn = be64_to_cpu(iclog->ic_header.h_tail_lsn); + xfs_lsn_t tail_lsn = be64_to_cpu(iclog->ic_header->h_tail_lsn); int blocks; if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) { @@ -3342,13 +3292,12 @@ xlog_verify_iclog( struct xlog_in_core *iclog, int count) { - xlog_op_header_t *ophead; - xlog_in_core_t *icptr; - xlog_in_core_2_t *xhdr; - void *base_ptr, *ptr, *p; + struct xlog_rec_header *rhead = iclog->ic_header; + struct xlog_in_core *icptr; + void *base_ptr, *ptr; ptrdiff_t field_offset; uint8_t clientid; - int len, i, j, k, op_len; + int len, i, op_len; int idx; /* check validity of iclog pointers */ @@ -3362,11 +3311,10 @@ xlog_verify_iclog( spin_unlock(&log->l_icloglock); /* check log magic numbers */ - if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) + if (rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) xfs_emerg(log->l_mp, "%s: invalid magic num", __func__); - base_ptr = ptr = &iclog->ic_header; - p = &iclog->ic_header; + base_ptr = ptr = rhead; for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) { if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) xfs_emerg(log->l_mp, "%s: unexpected magic num", @@ -3374,29 +3322,19 @@ xlog_verify_iclog( } /* check fields */ - len = be32_to_cpu(iclog->ic_header.h_num_logops); + len = be32_to_cpu(rhead->h_num_logops); base_ptr = ptr = iclog->ic_datap; - ophead = ptr; - xhdr = iclog->ic_data; for (i = 0; i < len; i++) { - ophead = ptr; + struct xlog_op_header *ophead = ptr; + void *p = &ophead->oh_clientid; /* clientid is only 1 byte */ - p = &ophead->oh_clientid; field_offset = p - base_ptr; if (field_offset & 0x1ff) { clientid = ophead->oh_clientid; } else { idx = BTOBBT((void *)&ophead->oh_clientid - iclog->ic_datap); - if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { - j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - clientid = xlog_get_client_id( - xhdr[j].hic_xheader.xh_cycle_data[k]); - } else { - clientid = xlog_get_client_id( - iclog->ic_header.h_cycle_data[idx]); - } + clientid = xlog_get_client_id(*xlog_cycle_data(rhead, idx)); } if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) { xfs_warn(log->l_mp, @@ -3412,15 +3350,9 @@ xlog_verify_iclog( op_len = be32_to_cpu(ophead->oh_len); } else { idx = BTOBBT((void *)&ophead->oh_len - iclog->ic_datap); - if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { - j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]); - } else { - op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]); - } + op_len = be32_to_cpu(*xlog_cycle_data(rhead, idx)); } - ptr += sizeof(xlog_op_header_t) + op_len; + ptr += sizeof(struct xlog_op_header) + op_len; } } #endif @@ -3549,19 +3481,19 @@ xlog_force_shutdown( STATIC int xlog_iclogs_empty( - struct xlog *log) + struct xlog *log) { - xlog_in_core_t *iclog; + struct xlog_in_core *iclog = log->l_iclog; - iclog = log->l_iclog; do { /* endianness does not matter here, zero is zero in * any language. */ - if (iclog->ic_header.h_num_logops) + if (iclog->ic_header->h_num_logops) return 0; iclog = iclog->ic_next; } while (iclog != log->l_iclog); + return 1; } diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 13455854365f..dcc1f44ed68f 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -16,10 +16,47 @@ struct xfs_log_vec { struct xfs_log_item *lv_item; /* owner */ char *lv_buf; /* formatted buffer */ int lv_bytes; /* accounted space in buffer */ - int lv_buf_len; /* aligned size of buffer */ - int lv_size; /* size of allocated lv */ + int lv_buf_used; /* buffer space used so far */ + int lv_alloc_size; /* size of allocated lv */ }; +/* Region types for iovec's i_type */ +#define XLOG_REG_TYPE_BFORMAT 1 +#define XLOG_REG_TYPE_BCHUNK 2 +#define XLOG_REG_TYPE_EFI_FORMAT 3 +#define XLOG_REG_TYPE_EFD_FORMAT 4 +#define XLOG_REG_TYPE_IFORMAT 5 +#define XLOG_REG_TYPE_ICORE 6 +#define XLOG_REG_TYPE_IEXT 7 +#define XLOG_REG_TYPE_IBROOT 8 +#define XLOG_REG_TYPE_ILOCAL 9 +#define XLOG_REG_TYPE_IATTR_EXT 10 +#define XLOG_REG_TYPE_IATTR_BROOT 11 +#define XLOG_REG_TYPE_IATTR_LOCAL 12 +#define XLOG_REG_TYPE_QFORMAT 13 +#define XLOG_REG_TYPE_DQUOT 14 +#define XLOG_REG_TYPE_QUOTAOFF 15 +#define XLOG_REG_TYPE_LRHEADER 16 +#define XLOG_REG_TYPE_UNMOUNT 17 +#define XLOG_REG_TYPE_COMMIT 18 +#define XLOG_REG_TYPE_TRANSHDR 19 +#define XLOG_REG_TYPE_ICREATE 20 +#define XLOG_REG_TYPE_RUI_FORMAT 21 +#define XLOG_REG_TYPE_RUD_FORMAT 22 +#define XLOG_REG_TYPE_CUI_FORMAT 23 +#define XLOG_REG_TYPE_CUD_FORMAT 24 +#define XLOG_REG_TYPE_BUI_FORMAT 25 +#define XLOG_REG_TYPE_BUD_FORMAT 26 +#define XLOG_REG_TYPE_ATTRI_FORMAT 27 +#define XLOG_REG_TYPE_ATTRD_FORMAT 28 +#define XLOG_REG_TYPE_ATTR_NAME 29 +#define XLOG_REG_TYPE_ATTR_VALUE 30 +#define XLOG_REG_TYPE_XMI_FORMAT 31 +#define XLOG_REG_TYPE_XMD_FORMAT 32 +#define XLOG_REG_TYPE_ATTR_NEWNAME 33 +#define XLOG_REG_TYPE_ATTR_NEWVALUE 34 +#define XLOG_REG_TYPE_MAX 34 + #define XFS_LOG_VEC_ORDERED (-1) /* @@ -64,12 +101,13 @@ xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, oph->oh_len = cpu_to_be32(len); len += sizeof(struct xlog_op_header); - lv->lv_buf_len += len; + lv->lv_buf_used += len; lv->lv_bytes += len; vec->i_len = len; /* Catch buffer overruns */ - ASSERT((void *)lv->lv_buf + lv->lv_bytes <= (void *)lv + lv->lv_size); + ASSERT((void *)lv->lv_buf + lv->lv_bytes <= + (void *)lv + lv->lv_alloc_size); } /* @@ -87,13 +125,6 @@ xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, return buf; } -static inline void * -xlog_copy_from_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, - const struct xfs_log_iovec *src) -{ - return xlog_copy_iovec(lv, vecp, src->i_type, src->i_addr, src->i_len); -} - /* * By comparing each component, we don't have to worry about extra * endian issues in treating two 32 bit numbers as one 64 bit number diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 1ca406ec1b40..778ac47adb8c 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -275,7 +275,7 @@ xlog_cil_alloc_shadow_bufs( struct xfs_log_vec *lv; int niovecs = 0; int nbytes = 0; - int buf_size; + int alloc_size; bool ordered = false; /* Skip items which aren't dirty in this transaction. */ @@ -309,23 +309,21 @@ xlog_cil_alloc_shadow_bufs( * Then round nbytes up to 64-bit alignment so that the initial * buffer alignment is easy to calculate and verify. */ - nbytes += niovecs * - (sizeof(uint64_t) + sizeof(struct xlog_op_header)); - nbytes = round_up(nbytes, sizeof(uint64_t)); + nbytes = xlog_item_space(niovecs, nbytes); /* * The data buffer needs to start 64-bit aligned, so round up * that space to ensure we can align it appropriately and not * overrun the buffer. */ - buf_size = nbytes + xlog_cil_iovec_space(niovecs); + alloc_size = nbytes + xlog_cil_iovec_space(niovecs); /* * if we have no shadow buffer, or it is too small, we need to * reallocate it. */ if (!lip->li_lv_shadow || - buf_size > lip->li_lv_shadow->lv_size) { + alloc_size > lip->li_lv_shadow->lv_alloc_size) { /* * We free and allocate here as a realloc would copy * unnecessary data. We don't use kvzalloc() for the @@ -334,15 +332,15 @@ xlog_cil_alloc_shadow_bufs( * storage. */ kvfree(lip->li_lv_shadow); - lv = xlog_kvmalloc(buf_size); + lv = xlog_kvmalloc(alloc_size); memset(lv, 0, xlog_cil_iovec_space(niovecs)); INIT_LIST_HEAD(&lv->lv_list); lv->lv_item = lip; - lv->lv_size = buf_size; + lv->lv_alloc_size = alloc_size; if (ordered) - lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + lv->lv_buf_used = XFS_LOG_VEC_ORDERED; else lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; lip->li_lv_shadow = lv; @@ -350,9 +348,9 @@ xlog_cil_alloc_shadow_bufs( /* same or smaller, optimise common overwrite case */ lv = lip->li_lv_shadow; if (ordered) - lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + lv->lv_buf_used = XFS_LOG_VEC_ORDERED; else - lv->lv_buf_len = 0; + lv->lv_buf_used = 0; lv->lv_bytes = 0; } @@ -372,30 +370,30 @@ xlog_cil_alloc_shadow_bufs( STATIC void xfs_cil_prepare_item( struct xlog *log, + struct xfs_log_item *lip, struct xfs_log_vec *lv, - struct xfs_log_vec *old_lv, int *diff_len) { /* Account for the new LV being passed in */ - if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) + if (lv->lv_buf_used != XFS_LOG_VEC_ORDERED) *diff_len += lv->lv_bytes; /* * If there is no old LV, this is the first time we've seen the item in * this CIL context and so we need to pin it. If we are replacing the - * old_lv, then remove the space it accounts for and make it the shadow + * old lv, then remove the space it accounts for and make it the shadow * buffer for later freeing. In both cases we are now switching to the * shadow buffer, so update the pointer to it appropriately. */ - if (!old_lv) { + if (!lip->li_lv) { if (lv->lv_item->li_ops->iop_pin) lv->lv_item->li_ops->iop_pin(lv->lv_item); lv->lv_item->li_lv_shadow = NULL; - } else if (old_lv != lv) { - ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); + } else if (lip->li_lv != lv) { + ASSERT(lv->lv_buf_used != XFS_LOG_VEC_ORDERED); - *diff_len -= old_lv->lv_bytes; - lv->lv_item->li_lv_shadow = old_lv; + *diff_len -= lip->li_lv->lv_bytes; + lv->lv_item->li_lv_shadow = lip->li_lv; } /* attach new log vector to log item */ @@ -454,10 +452,8 @@ xlog_cil_insert_format_items( } list_for_each_entry(lip, &tp->t_items, li_trans) { - struct xfs_log_vec *lv; - struct xfs_log_vec *old_lv = NULL; - struct xfs_log_vec *shadow; - bool ordered = false; + struct xfs_log_vec *lv = lip->li_lv; + struct xfs_log_vec *shadow = lip->li_lv_shadow; /* Skip items which aren't dirty in this transaction. */ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) @@ -467,22 +463,23 @@ xlog_cil_insert_format_items( * The formatting size information is already attached to * the shadow lv on the log item. */ - shadow = lip->li_lv_shadow; - if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED) - ordered = true; + if (shadow->lv_buf_used == XFS_LOG_VEC_ORDERED) { + if (!lv) { + lv = shadow; + lv->lv_item = lip; + } + ASSERT(shadow->lv_alloc_size == lv->lv_alloc_size); + xfs_cil_prepare_item(log, lip, lv, diff_len); + continue; + } /* Skip items that do not have any vectors for writing */ - if (!shadow->lv_niovecs && !ordered) + if (!shadow->lv_niovecs) continue; /* compare to existing item size */ - old_lv = lip->li_lv; - if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) { + if (lv && shadow->lv_alloc_size <= lv->lv_alloc_size) { /* same or smaller, optimise common overwrite case */ - lv = lip->li_lv; - - if (ordered) - goto insert; /* * set the item up as though it is a new insertion so @@ -494,7 +491,7 @@ xlog_cil_insert_format_items( lv->lv_niovecs = shadow->lv_niovecs; /* reset the lv buffer information for new formatting */ - lv->lv_buf_len = 0; + lv->lv_buf_used = 0; lv->lv_bytes = 0; lv->lv_buf = (char *)lv + xlog_cil_iovec_space(lv->lv_niovecs); @@ -502,17 +499,11 @@ xlog_cil_insert_format_items( /* switch to shadow buffer! */ lv = shadow; lv->lv_item = lip; - if (ordered) { - /* track as an ordered logvec */ - ASSERT(lip->li_lv == NULL); - goto insert; - } } ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); lip->li_ops->iop_format(lip, lv); -insert: - xfs_cil_prepare_item(log, lv, old_lv, diff_len); + xfs_cil_prepare_item(log, lip, lv, diff_len); } } @@ -795,8 +786,10 @@ xlog_cil_ail_insert( struct xfs_log_item *lip = lv->lv_item; xfs_lsn_t item_lsn; - if (aborted) + if (aborted) { + trace_xlog_ail_insert_abort(lip); set_bit(XFS_LI_ABORTED, &lip->li_flags); + } if (lip->li_ops->flags & XFS_ITEM_RELEASE_WHEN_COMMITTED) { lip->li_ops->iop_release(lip); @@ -947,7 +940,7 @@ xlog_cil_set_ctx_write_state( struct xlog_in_core *iclog) { struct xfs_cil *cil = ctx->cil; - xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn); + xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header->h_lsn); ASSERT(!ctx->commit_lsn); if (!ctx->start_lsn) { @@ -1245,7 +1238,7 @@ xlog_cil_build_lv_chain( lv->lv_order_id = item->li_order_id; /* we don't write ordered log vectors */ - if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) + if (lv->lv_buf_used != XFS_LOG_VEC_ORDERED) *num_bytes += lv->lv_bytes; *num_iovecs += lv->lv_niovecs; list_add_tail(&lv->lv_list, &ctx->lv_chain); @@ -1465,9 +1458,9 @@ xlog_cil_push_work( */ spin_lock(&log->l_icloglock); if (ctx->start_lsn != ctx->commit_lsn) { - xfs_lsn_t plsn; + xfs_lsn_t plsn = be64_to_cpu( + ctx->commit_iclog->ic_prev->ic_header->h_lsn); - plsn = be64_to_cpu(ctx->commit_iclog->ic_prev->ic_header.h_lsn); if (plsn && XFS_LSN_CMP(plsn, ctx->commit_lsn) < 0) { /* * Waiting on ic_force_wait orders the completion of diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index f3d78869e5e5..0fe59f0525aa 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -144,7 +144,7 @@ enum xlog_iclog_state { #define XLOG_COVER_OPS 5 -typedef struct xlog_ticket { +struct xlog_ticket { struct list_head t_queue; /* reserve/write queue */ struct task_struct *t_task; /* task that owns this ticket */ xlog_tid_t t_tid; /* transaction identifier */ @@ -155,13 +155,11 @@ typedef struct xlog_ticket { char t_cnt; /* current unit count */ uint8_t t_flags; /* properties of reservation */ int t_iclog_hdrs; /* iclog hdrs in t_curr_res */ -} xlog_ticket_t; +}; /* - * - A log record header is 512 bytes. There is plenty of room to grow the - * xlog_rec_header_t into the reserved space. - * - ic_data follows, so a write to disk can start at the beginning of - * the iclog. + * In-core log structure. + * * - ic_forcewait is used to implement synchronous forcing of the iclog to disk. * - ic_next is the pointer to the next iclog in the ring. * - ic_log is a pointer back to the global log structure. @@ -183,7 +181,7 @@ typedef struct xlog_ticket { * We'll put all the read-only and l_icloglock fields in the first cacheline, * and move everything else out to subsequent cachelines. */ -typedef struct xlog_in_core { +struct xlog_in_core { wait_queue_head_t ic_force_wait; wait_queue_head_t ic_write_wait; struct xlog_in_core *ic_next; @@ -198,8 +196,7 @@ typedef struct xlog_in_core { /* reference counts need their own cacheline */ atomic_t ic_refcnt ____cacheline_aligned_in_smp; - xlog_in_core_2_t *ic_data; -#define ic_header ic_data->hic_header + struct xlog_rec_header *ic_header; #ifdef DEBUG bool ic_fail_crc : 1; #endif @@ -207,7 +204,7 @@ typedef struct xlog_in_core { struct work_struct ic_end_io_work; struct bio ic_bio; struct bio_vec ic_bvec[]; -} xlog_in_core_t; +}; /* * The CIL context is used to aggregate per-transaction details as well be @@ -409,7 +406,6 @@ struct xlog { struct list_head *l_buf_cancel_table; struct list_head r_dfops; /* recovered log intent items */ int l_iclog_hsize; /* size of iclog header */ - int l_iclog_heads; /* # of iclog header sectors */ uint l_sectBBsize; /* sector size in BBs (2^n) */ int l_iclog_size; /* size of log in bytes */ int l_iclog_bufs; /* number of iclog buffers */ @@ -422,7 +418,7 @@ struct xlog { /* waiting for iclog flush */ int l_covered_state;/* state of "covering disk * log entries" */ - xlog_in_core_t *l_iclog; /* head log queue */ + struct xlog_in_core *l_iclog; /* head log queue */ spinlock_t l_icloglock; /* grab to change iclog state */ int l_curr_cycle; /* Cycle number of log writes */ int l_prev_cycle; /* Cycle number before last @@ -499,8 +495,8 @@ xlog_recover_finish( extern void xlog_recover_cancel(struct xlog *); -extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, - char *dp, int size); +__le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, + char *dp, unsigned int hdrsize, unsigned int size); extern struct kmem_cache *xfs_log_ticket_cache; struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes, @@ -698,4 +694,34 @@ xlog_kvmalloc( return p; } +/* + * Given a count of iovecs and space for a log item, compute the space we need + * in the log to store that data plus the log headers. + */ +static inline unsigned int +xlog_item_space( + unsigned int niovecs, + unsigned int nbytes) +{ + nbytes += niovecs * (sizeof(uint64_t) + sizeof(struct xlog_op_header)); + return round_up(nbytes, sizeof(uint64_t)); +} + +/* + * Cycles over XLOG_CYCLE_DATA_SIZE overflow into the extended header that was + * added for v2 logs. Addressing for the cycles array there is off by one, + * because the first batch of cycles is in the original header. + */ +static inline __be32 *xlog_cycle_data(struct xlog_rec_header *rhead, unsigned i) +{ + if (i >= XLOG_CYCLE_DATA_SIZE) { + unsigned j = i / XLOG_CYCLE_DATA_SIZE; + unsigned k = i % XLOG_CYCLE_DATA_SIZE; + + return &rhead->h_ext[j - 1].xh_cycle_data[k]; + } + + return &rhead->h_cycle_data[i]; +} + #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 2f76531842f8..03e42c7dab56 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -190,8 +190,8 @@ xlog_bwrite( */ STATIC void xlog_header_check_dump( - xfs_mount_t *mp, - xlog_rec_header_t *head) + struct xfs_mount *mp, + struct xlog_rec_header *head) { xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d", __func__, &mp->m_sb.sb_uuid, XLOG_FMT); @@ -207,8 +207,8 @@ xlog_header_check_dump( */ STATIC int xlog_header_check_recover( - xfs_mount_t *mp, - xlog_rec_header_t *head) + struct xfs_mount *mp, + struct xlog_rec_header *head) { ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); @@ -238,8 +238,8 @@ xlog_header_check_recover( */ STATIC int xlog_header_check_mount( - xfs_mount_t *mp, - xlog_rec_header_t *head) + struct xfs_mount *mp, + struct xlog_rec_header *head) { ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); @@ -400,7 +400,7 @@ xlog_find_verify_log_record( xfs_daddr_t i; char *buffer; char *offset = NULL; - xlog_rec_header_t *head = NULL; + struct xlog_rec_header *head = NULL; int error = 0; int smallmem = 0; int num_blks = *last_blk - start_blk; @@ -437,7 +437,7 @@ xlog_find_verify_log_record( goto out; } - head = (xlog_rec_header_t *)offset; + head = (struct xlog_rec_header *)offset; if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) break; @@ -1237,7 +1237,7 @@ xlog_find_tail( xfs_daddr_t *head_blk, xfs_daddr_t *tail_blk) { - xlog_rec_header_t *rhead; + struct xlog_rec_header *rhead; char *offset = NULL; char *buffer; int error; @@ -1487,7 +1487,7 @@ xlog_add_record( int tail_cycle, int tail_block) { - xlog_rec_header_t *recp = (xlog_rec_header_t *)buf; + struct xlog_rec_header *recp = (struct xlog_rec_header *)buf; memset(buf, 0, BBSIZE); recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); @@ -2131,15 +2131,15 @@ xlog_recover_add_to_cont_trans( item = list_entry(trans->r_itemq.prev, struct xlog_recover_item, ri_list); - old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; - old_len = item->ri_buf[item->ri_cnt-1].i_len; + old_ptr = item->ri_buf[item->ri_cnt-1].iov_base; + old_len = item->ri_buf[item->ri_cnt-1].iov_len; ptr = kvrealloc(old_ptr, len + old_len, GFP_KERNEL); if (!ptr) return -ENOMEM; memcpy(&ptr[old_len], dp, len); - item->ri_buf[item->ri_cnt-1].i_len += len; - item->ri_buf[item->ri_cnt-1].i_addr = ptr; + item->ri_buf[item->ri_cnt-1].iov_len += len; + item->ri_buf[item->ri_cnt-1].iov_base = ptr; trace_xfs_log_recover_item_add_cont(log, trans, item, 0); return 0; } @@ -2223,7 +2223,7 @@ xlog_recover_add_to_trans( } item->ri_total = in_f->ilf_size; - item->ri_buf = kzalloc(item->ri_total * sizeof(xfs_log_iovec_t), + item->ri_buf = kcalloc(item->ri_total, sizeof(*item->ri_buf), GFP_KERNEL | __GFP_NOFAIL); } @@ -2237,8 +2237,8 @@ xlog_recover_add_to_trans( } /* Description region is ri_buf[0] */ - item->ri_buf[item->ri_cnt].i_addr = ptr; - item->ri_buf[item->ri_cnt].i_len = len; + item->ri_buf[item->ri_cnt].iov_base = ptr; + item->ri_buf[item->ri_cnt].iov_len = len; item->ri_cnt++; trace_xfs_log_recover_item_add(log, trans, item, 0); return 0; @@ -2262,7 +2262,7 @@ xlog_recover_free_trans( /* Free the regions in the item. */ list_del(&item->ri_list); for (i = 0; i < item->ri_cnt; i++) - kvfree(item->ri_buf[i].i_addr); + kvfree(item->ri_buf[i].iov_base); /* Free the item itself */ kfree(item->ri_buf); kfree(item); @@ -2863,23 +2863,12 @@ xlog_unpack_data( char *dp, struct xlog *log) { - int i, j, k; + int i; - for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && - i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { - *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; + for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { + *(__be32 *)dp = *xlog_cycle_data(rhead, i); dp += BBSIZE; } - - if (xfs_has_logv2(log->l_mp)) { - xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; - for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { - j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; - dp += BBSIZE; - } - } } /* @@ -2894,20 +2883,34 @@ xlog_recover_process( int pass, struct list_head *buffer_list) { - __le32 old_crc = rhead->h_crc; - __le32 crc; + __le32 expected_crc = rhead->h_crc, crc, other_crc; + + crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE, + be32_to_cpu(rhead->h_len)); - crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); + /* + * Look at the end of the struct xlog_rec_header definition in + * xfs_log_format.h for the glory details. + */ + if (expected_crc && crc != expected_crc) { + other_crc = xlog_cksum(log, rhead, dp, XLOG_REC_SIZE_OTHER, + be32_to_cpu(rhead->h_len)); + if (other_crc == expected_crc) { + xfs_notice_once(log->l_mp, + "Fixing up incorrect CRC due to padding."); + crc = other_crc; + } + } /* * Nothing else to do if this is a CRC verification pass. Just return * if this a record with a non-zero crc. Unfortunately, mkfs always - * sets old_crc to 0 so we must consider this valid even on v5 supers. - * Otherwise, return EFSBADCRC on failure so the callers up the stack - * know precisely what failed. + * sets expected_crc to 0 so we must consider this valid even on v5 + * supers. Otherwise, return EFSBADCRC on failure so the callers up the + * stack know precisely what failed. */ if (pass == XLOG_RECOVER_CRCPASS) { - if (old_crc && crc != old_crc) + if (expected_crc && crc != expected_crc) return -EFSBADCRC; return 0; } @@ -2918,11 +2921,11 @@ xlog_recover_process( * zero CRC check prevents warnings from being emitted when upgrading * the kernel from one that does not add CRCs by default. */ - if (crc != old_crc) { - if (old_crc || xfs_has_crc(log->l_mp)) { + if (crc != expected_crc) { + if (expected_crc || xfs_has_crc(log->l_mp)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.", - le32_to_cpu(old_crc), + le32_to_cpu(expected_crc), le32_to_cpu(crc)); xfs_hex_dump(dp, 32); } @@ -2994,7 +2997,7 @@ xlog_do_recovery_pass( int pass, xfs_daddr_t *first_bad) /* out: first bad log rec */ { - xlog_rec_header_t *rhead; + struct xlog_rec_header *rhead; xfs_daddr_t blk_no, rblk_no; xfs_daddr_t rhead_blk; char *offset; @@ -3031,7 +3034,7 @@ xlog_do_recovery_pass( if (error) goto bread_err1; - rhead = (xlog_rec_header_t *)offset; + rhead = (struct xlog_rec_header *)offset; /* * xfsprogs has a bug where record length is based on lsunit but @@ -3138,7 +3141,7 @@ xlog_do_recovery_pass( if (error) goto bread_err2; } - rhead = (xlog_rec_header_t *)offset; + rhead = (struct xlog_rec_header *)offset; error = xlog_valid_rec_header(log, rhead, split_hblks ? blk_no : 0, h_size); if (error) @@ -3220,7 +3223,7 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - rhead = (xlog_rec_header_t *)offset; + rhead = (struct xlog_rec_header *)offset; error = xlog_valid_rec_header(log, rhead, blk_no, h_size); if (error) goto bread_err2; diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 15d410d16bb2..19aba2c3d525 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -141,14 +141,6 @@ xfs_warn_experimental( const char *name; long opstate; } features[] = { - [XFS_EXPERIMENTAL_PNFS] = { - .opstate = XFS_OPSTATE_WARNED_PNFS, - .name = "pNFS", - }, - [XFS_EXPERIMENTAL_SCRUB] = { - .opstate = XFS_OPSTATE_WARNED_SCRUB, - .name = "online scrub", - }, [XFS_EXPERIMENTAL_SHRINK] = { .opstate = XFS_OPSTATE_WARNED_SHRINK, .name = "online shrink", @@ -161,14 +153,6 @@ xfs_warn_experimental( .opstate = XFS_OPSTATE_WARNED_LBS, .name = "large block size", }, - [XFS_EXPERIMENTAL_EXCHRANGE] = { - .opstate = XFS_OPSTATE_WARNED_EXCHRANGE, - .name = "exchange range", - }, - [XFS_EXPERIMENTAL_PPTR] = { - .opstate = XFS_OPSTATE_WARNED_PPTR, - .name = "parent pointer", - }, [XFS_EXPERIMENTAL_METADIR] = { .opstate = XFS_OPSTATE_WARNED_METADIR, .name = "metadata directory tree", diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index a92a4d09c8e9..d68e72379f9d 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -91,13 +91,9 @@ void xfs_buf_alert_ratelimited(struct xfs_buf *bp, const char *rlmsg, const char *fmt, ...); enum xfs_experimental_feat { - XFS_EXPERIMENTAL_PNFS, - XFS_EXPERIMENTAL_SCRUB, XFS_EXPERIMENTAL_SHRINK, XFS_EXPERIMENTAL_LARP, XFS_EXPERIMENTAL_LBS, - XFS_EXPERIMENTAL_EXCHRANGE, - XFS_EXPERIMENTAL_PPTR, XFS_EXPERIMENTAL_METADIR, XFS_EXPERIMENTAL_ZONED, diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 00b53f479ece..0953f6ae94ab 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -171,19 +171,16 @@ xfs_readsb( ASSERT(mp->m_ddev_targp != NULL); /* - * For the initial read, we must guess at the sector - * size based on the block device. It's enough to - * get the sb_sectsize out of the superblock and - * then reread with the proper length. - * We don't verify it yet, because it may not be complete. + * In the first pass, use the device sector size to just read enough + * of the superblock to extract the XFS sector size. + * + * The device sector size must be smaller than or equal to the XFS + * sector size and thus we can always read the superblock. Once we know + * the XFS sector size, re-read it and run the buffer verifier. */ - sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); + sector_size = mp->m_ddev_targp->bt_logical_sectorsize; buf_ops = NULL; - /* - * Allocate a (locked) buffer to hold the superblock. This will be kept - * around at all times to optimize access to the superblock. - */ reread: error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), &bp, buf_ops); @@ -247,6 +244,10 @@ reread: /* no need to be quiet anymore, so reset the buf ops */ bp->b_ops = &xfs_sb_buf_ops; + /* + * Keep a pointer of the sb buffer around instead of caching it in the + * buffer cache because we access it frequently. + */ mp->m_sb_bp = bp; xfs_buf_unlock(bp); return 0; @@ -666,6 +667,152 @@ xfs_agbtree_compute_maxlevels( mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels); } +/* Maximum atomic write IO size that the kernel allows. */ +static inline xfs_extlen_t xfs_calc_atomic_write_max(struct xfs_mount *mp) +{ + return rounddown_pow_of_two(XFS_B_TO_FSB(mp, MAX_RW_COUNT)); +} + +/* + * If the underlying device advertises atomic write support, limit the size of + * atomic writes to the greatest power-of-two factor of the group size so + * that every atomic write unit aligns with the start of every group. This is + * required so that the allocations for an atomic write will always be + * aligned compatibly with the alignment requirements of the storage. + * + * If the device doesn't advertise atomic writes, then there are no alignment + * restrictions and the largest out-of-place write we can do ourselves is the + * number of blocks that user files can allocate from any group. + */ +static xfs_extlen_t +xfs_calc_group_awu_max( + struct xfs_mount *mp, + enum xfs_group_type type) +{ + struct xfs_groups *g = &mp->m_groups[type]; + struct xfs_buftarg *btp = xfs_group_type_buftarg(mp, type); + + if (g->blocks == 0) + return 0; + if (btp && btp->bt_awu_min > 0) + return max_pow_of_two_factor(g->blocks); + return rounddown_pow_of_two(g->blocks); +} + +/* Compute the maximum atomic write unit size for each section. */ +static inline void +xfs_calc_atomic_write_unit_max( + struct xfs_mount *mp, + enum xfs_group_type type) +{ + struct xfs_groups *g = &mp->m_groups[type]; + + const xfs_extlen_t max_write = xfs_calc_atomic_write_max(mp); + const xfs_extlen_t max_ioend = xfs_reflink_max_atomic_cow(mp); + const xfs_extlen_t max_gsize = xfs_calc_group_awu_max(mp, type); + + g->awu_max = min3(max_write, max_ioend, max_gsize); + trace_xfs_calc_atomic_write_unit_max(mp, type, max_write, max_ioend, + max_gsize, g->awu_max); +} + +/* + * Try to set the atomic write maximum to a new value that we got from + * userspace via mount option. + */ +int +xfs_set_max_atomic_write_opt( + struct xfs_mount *mp, + unsigned long long new_max_bytes) +{ + const xfs_filblks_t new_max_fsbs = XFS_B_TO_FSBT(mp, new_max_bytes); + const xfs_extlen_t max_write = xfs_calc_atomic_write_max(mp); + const xfs_extlen_t max_group = + max(mp->m_groups[XG_TYPE_AG].blocks, + mp->m_groups[XG_TYPE_RTG].blocks); + const xfs_extlen_t max_group_write = + max(xfs_calc_group_awu_max(mp, XG_TYPE_AG), + xfs_calc_group_awu_max(mp, XG_TYPE_RTG)); + int error; + + if (new_max_bytes == 0) + goto set_limit; + + ASSERT(max_write <= U32_MAX); + + /* generic_atomic_write_valid enforces power of two length */ + if (!is_power_of_2(new_max_bytes)) { + xfs_warn(mp, + "max atomic write size of %llu bytes is not a power of 2", + new_max_bytes); + return -EINVAL; + } + + if (new_max_bytes & mp->m_blockmask) { + xfs_warn(mp, + "max atomic write size of %llu bytes not aligned with fsblock", + new_max_bytes); + return -EINVAL; + } + + if (new_max_fsbs > max_write) { + xfs_warn(mp, + "max atomic write size of %lluk cannot be larger than max write size %lluk", + new_max_bytes >> 10, + XFS_FSB_TO_B(mp, max_write) >> 10); + return -EINVAL; + } + + if (new_max_fsbs > max_group) { + xfs_warn(mp, + "max atomic write size of %lluk cannot be larger than allocation group size %lluk", + new_max_bytes >> 10, + XFS_FSB_TO_B(mp, max_group) >> 10); + return -EINVAL; + } + + if (new_max_fsbs > max_group_write) { + xfs_warn(mp, + "max atomic write size of %lluk cannot be larger than max allocation group write size %lluk", + new_max_bytes >> 10, + XFS_FSB_TO_B(mp, max_group_write) >> 10); + return -EINVAL; + } + + if (xfs_has_reflink(mp)) + goto set_limit; + + if (new_max_fsbs == 1) { + if (mp->m_ddev_targp->bt_awu_max || + (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_awu_max)) { + } else { + xfs_warn(mp, + "cannot support atomic writes of size %lluk with no reflink or HW support", + new_max_bytes >> 10); + return -EINVAL; + } + } else { + xfs_warn(mp, + "cannot support atomic writes of size %lluk with no reflink support", + new_max_bytes >> 10); + return -EINVAL; + } + +set_limit: + error = xfs_calc_atomic_write_reservation(mp, new_max_fsbs); + if (error) { + xfs_warn(mp, + "cannot support completing atomic writes of %lluk", + new_max_bytes >> 10); + return error; + } + + xfs_calc_atomic_write_unit_max(mp, XG_TYPE_AG); + xfs_calc_atomic_write_unit_max(mp, XG_TYPE_RTG); + mp->m_awu_max_bytes = new_max_bytes; + return 0; +} + /* Compute maximum possible height for realtime btree types for this fs. */ static inline void xfs_rtbtree_compute_maxlevels( @@ -910,19 +1057,6 @@ xfs_mountfs( xfs_inodegc_start(mp); xfs_blockgc_start(mp); - /* - * Now that we've recovered any pending superblock feature bit - * additions, we can finish setting up the attr2 behaviour for the - * mount. The noattr2 option overrides the superblock flag, so only - * check the superblock feature flag if the mount option is not set. - */ - if (xfs_has_noattr2(mp)) { - mp->m_features &= ~XFS_FEAT_ATTR2; - } else if (!xfs_has_attr2(mp) && - (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)) { - mp->m_features |= XFS_FEAT_ATTR2; - } - if (xfs_has_metadir(mp)) { error = xfs_mount_setup_metadir(mp); if (error) @@ -1082,6 +1216,15 @@ xfs_mountfs( xfs_zone_gc_start(mp); } + /* + * Pre-calculate atomic write unit max. This involves computations + * derived from transaction reservations, so we must do this after the + * log is fully initialized. + */ + error = xfs_set_max_atomic_write_opt(mp, mp->m_awu_max_bytes); + if (error) + goto out_agresv; + return 0; out_agresv: diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 799b84220ebb..b871dfde372b 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -119,6 +119,12 @@ struct xfs_groups { * SMR hard drives. */ xfs_fsblock_t start_fsb; + + /* + * Maximum length of an atomic write for files stored in this + * collection of allocation groups, in fsblocks. + */ + xfs_extlen_t awu_max; }; struct xfs_freecounter { @@ -229,6 +235,10 @@ typedef struct xfs_mount { bool m_finobt_nores; /* no per-AG finobt resv. */ bool m_update_sb; /* sb needs update in mount */ unsigned int m_max_open_zones; + unsigned int m_zonegc_low_space; + + /* max_atomic_write mount option value */ + unsigned long long m_awu_max_bytes; /* * Bitsets of per-fs metadata that have been checked and/or are sick. @@ -352,7 +362,6 @@ typedef struct xfs_mount { #define XFS_FEAT_EXTFLG (1ULL << 7) /* unwritten extents */ #define XFS_FEAT_ASCIICI (1ULL << 8) /* ASCII only case-insens. */ #define XFS_FEAT_LAZYSBCOUNT (1ULL << 9) /* Superblk counters */ -#define XFS_FEAT_ATTR2 (1ULL << 10) /* dynamic attr fork */ #define XFS_FEAT_PARENT (1ULL << 11) /* parent pointers */ #define XFS_FEAT_PROJID32 (1ULL << 12) /* 32 bit project id */ #define XFS_FEAT_CRC (1ULL << 13) /* metadata CRCs */ @@ -375,7 +384,6 @@ typedef struct xfs_mount { /* Mount features */ #define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */ -#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ #define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */ #define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */ #define XFS_FEAT_LARGE_IOSIZE (1ULL << 51) /* report large preferred @@ -385,7 +393,6 @@ typedef struct xfs_mount { #define XFS_FEAT_DISCARD (1ULL << 54) /* discard unused blocks */ #define XFS_FEAT_GRPID (1ULL << 55) /* group-ID assigned from directory */ #define XFS_FEAT_SMALL_INUMS (1ULL << 56) /* user wants 32bit inodes */ -#define XFS_FEAT_IKEEP (1ULL << 57) /* keep empty inode clusters*/ #define XFS_FEAT_SWALLOC (1ULL << 58) /* stripe width allocation */ #define XFS_FEAT_FILESTREAMS (1ULL << 59) /* use filestreams allocator */ #define XFS_FEAT_DAX_ALWAYS (1ULL << 60) /* DAX always enabled */ @@ -463,6 +470,11 @@ static inline bool xfs_has_nonzoned(const struct xfs_mount *mp) return !xfs_has_zoned(mp); } +static inline bool xfs_can_sw_atomic_write(struct xfs_mount *mp) +{ + return xfs_has_reflink(mp); +} + /* * Some features are always on for v5 file systems, allow the compiler to * eliminiate dead code when building without v4 support. @@ -488,12 +500,17 @@ __XFS_HAS_V4_FEAT(align, ALIGN) __XFS_HAS_V4_FEAT(logv2, LOGV2) __XFS_HAS_V4_FEAT(extflg, EXTFLG) __XFS_HAS_V4_FEAT(lazysbcount, LAZYSBCOUNT) -__XFS_ADD_V4_FEAT(attr2, ATTR2) __XFS_ADD_V4_FEAT(projid32, PROJID32) __XFS_HAS_V4_FEAT(v3inodes, V3INODES) __XFS_HAS_V4_FEAT(crc, CRC) __XFS_HAS_V4_FEAT(pquotino, PQUOTINO) +static inline void xfs_add_attr2(struct xfs_mount *mp) +{ + if (IS_ENABLED(CONFIG_XFS_SUPPORT_V4)) + xfs_sb_version_addattr2(&mp->m_sb); +} + /* * Mount features * @@ -501,7 +518,6 @@ __XFS_HAS_V4_FEAT(pquotino, PQUOTINO) * bit inodes and read-only state, are kept as operational state rather than * features. */ -__XFS_HAS_FEAT(noattr2, NOATTR2) __XFS_HAS_FEAT(noalign, NOALIGN) __XFS_HAS_FEAT(allocsize, ALLOCSIZE) __XFS_HAS_FEAT(large_iosize, LARGE_IOSIZE) @@ -510,7 +526,6 @@ __XFS_HAS_FEAT(dirsync, DIRSYNC) __XFS_HAS_FEAT(discard, DISCARD) __XFS_HAS_FEAT(grpid, GRPID) __XFS_HAS_FEAT(small_inums, SMALL_INUMS) -__XFS_HAS_FEAT(ikeep, IKEEP) __XFS_HAS_FEAT(swalloc, SWALLOC) __XFS_HAS_FEAT(filestreams, FILESTREAMS) __XFS_HAS_FEAT(dax_always, DAX_ALWAYS) @@ -542,10 +557,6 @@ __XFS_HAS_FEAT(nouuid, NOUUID) */ #define XFS_OPSTATE_BLOCKGC_ENABLED 6 -/* Kernel has logged a warning about pNFS being used on this fs. */ -#define XFS_OPSTATE_WARNED_PNFS 7 -/* Kernel has logged a warning about online fsck being used on this fs. */ -#define XFS_OPSTATE_WARNED_SCRUB 8 /* Kernel has logged a warning about shrink being used on this fs. */ #define XFS_OPSTATE_WARNED_SHRINK 9 /* Kernel has logged a warning about logged xattr updates being used. */ @@ -558,10 +569,6 @@ __XFS_HAS_FEAT(nouuid, NOUUID) #define XFS_OPSTATE_USE_LARP 13 /* Kernel has logged a warning about blocksize > pagesize on this fs. */ #define XFS_OPSTATE_WARNED_LBS 14 -/* Kernel has logged a warning about exchange-range being used on this fs. */ -#define XFS_OPSTATE_WARNED_EXCHRANGE 15 -/* Kernel has logged a warning about parent pointers being used on this fs. */ -#define XFS_OPSTATE_WARNED_PPTR 16 /* Kernel has logged a warning about metadata dirs being used on this fs. */ #define XFS_OPSTATE_WARNED_METADIR 17 /* Filesystem should use qflags to determine quotaon status */ @@ -630,7 +637,6 @@ xfs_should_warn(struct xfs_mount *mp, long nr) { (1UL << XFS_OPSTATE_READONLY), "read_only" }, \ { (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" }, \ { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \ - { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \ { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \ { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \ { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" }, \ @@ -792,4 +798,24 @@ static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta) percpu_counter_add(&mp->m_delalloc_blks, delta); } +int xfs_set_max_atomic_write_opt(struct xfs_mount *mp, + unsigned long long new_max_bytes); + +static inline struct xfs_buftarg * +xfs_group_type_buftarg( + struct xfs_mount *mp, + enum xfs_group_type type) +{ + switch (type) { + case XG_TYPE_AG: + return mp->m_ddev_targp; + case XG_TYPE_RTG: + return mp->m_rtdev_targp; + default: + ASSERT(0); + break; + } + return NULL; +} + #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index d0f5b403bdbe..73b7e72944e4 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -293,7 +293,8 @@ int xfs_mru_cache_init(void) { xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", - XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 1); + XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU), + 1); if (!xfs_mru_reap_wq) return -ENOMEM; return 0; @@ -320,7 +321,7 @@ xfs_mru_cache_create( xfs_mru_cache_free_func_t free_func) { struct xfs_mru_cache *mru = NULL; - int err = 0, grp; + int grp; unsigned int grp_time; if (mrup) @@ -341,8 +342,8 @@ xfs_mru_cache_create( mru->lists = kzalloc(mru->grp_count * sizeof(*mru->lists), GFP_KERNEL | __GFP_NOFAIL); if (!mru->lists) { - err = -ENOMEM; - goto exit; + kfree(mru); + return -ENOMEM; } for (grp = 0; grp < mru->grp_count; grp++) @@ -361,14 +362,7 @@ xfs_mru_cache_create( mru->free_func = free_func; mru->data = data; *mrup = mru; - -exit: - if (err && mru && mru->lists) - kfree(mru->lists); - if (err && mru) - kfree(mru); - - return err; + return 0; } /* @@ -414,6 +408,8 @@ xfs_mru_cache_destroy( * To insert an element, call xfs_mru_cache_insert() with the data store, the * element's key and the client data pointer. This function returns 0 on * success or ENOMEM if memory for the data element couldn't be allocated. + * + * The passed in elem is freed through the per-cache free_func on failure. */ int xfs_mru_cache_insert( @@ -421,14 +417,11 @@ xfs_mru_cache_insert( unsigned long key, struct xfs_mru_cache_elem *elem) { - int error; - - ASSERT(mru && mru->lists); - if (!mru || !mru->lists) - return -EINVAL; + int error = -EINVAL; + error = -ENOMEM; if (radix_tree_preload(GFP_KERNEL)) - return -ENOMEM; + goto out_free; INIT_LIST_HEAD(&elem->list_node); elem->key = key; @@ -440,6 +433,12 @@ xfs_mru_cache_insert( _xfs_mru_cache_list_insert(mru, elem); spin_unlock(&mru->lock); + if (error) + goto out_free; + return 0; + +out_free: + mru->free_func(mru->data, elem); return error; } diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index ed8d8ed42f0a..b17672889942 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -127,7 +127,7 @@ xfs_dax_notify_failure_freeze( struct super_block *sb = mp->m_super; int error; - error = freeze_super(sb, FREEZE_HOLDER_KERNEL); + error = freeze_super(sb, FREEZE_HOLDER_KERNEL, NULL); if (error) xfs_emerg(mp, "already frozen by kernel, err=%d", error); @@ -143,7 +143,7 @@ xfs_dax_notify_failure_thaw( int error; if (kernel_frozen) { - error = thaw_super(sb, FREEZE_HOLDER_KERNEL); + error = thaw_super(sb, FREEZE_HOLDER_KERNEL, NULL); if (error) xfs_emerg(mp, "still frozen after notify failure, err=%d", error); @@ -153,7 +153,7 @@ xfs_dax_notify_failure_thaw( * Also thaw userspace call anyway because the device is about to be * removed immediately. */ - thaw_super(sb, FREEZE_HOLDER_USERSPACE); + thaw_super(sb, FREEZE_HOLDER_USERSPACE, NULL); } static int @@ -165,7 +165,7 @@ xfs_dax_translate_range( uint64_t *bblen) { u64 dev_start = btp->bt_dax_part_off; - u64 dev_len = bdev_nr_bytes(btp->bt_bdev); + u64 dev_len = BBTOB(btp->bt_nr_sectors); u64 dev_end = dev_start + dev_len - 1; /* Notify failure on the whole device. */ @@ -253,8 +253,7 @@ xfs_dax_notify_dev_failure( return -EOPNOTSUPP; } - error = xfs_dax_translate_range(type == XG_TYPE_RTG ? - mp->m_rtdev_targp : mp->m_ddev_targp, + error = xfs_dax_translate_range(xfs_group_type_buftarg(mp, type), offset, len, &daddr, &bblen); if (error) return error; @@ -280,10 +279,7 @@ xfs_dax_notify_dev_failure( kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0; } - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - goto out; - + tp = xfs_trans_alloc_empty(mp); start_gno = xfs_fsb_to_gno(mp, start_bno, type); end_gno = xfs_fsb_to_gno(mp, end_bno, type); while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) { @@ -354,7 +350,6 @@ xfs_dax_notify_dev_failure( error = -EFSCORRUPTED; } -out: /* Thaw the fs if it has been frozen before. */ if (mf_flags & MF_MEM_PRE_REMOVE) xfs_dax_notify_failure_thaw(mp, kernel_frozen); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 6f4479deac6d..afe7497012d4 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -58,8 +58,6 @@ xfs_fs_get_uuid( { struct xfs_mount *mp = XFS_M(sb); - xfs_warn_experimental(mp, XFS_EXPERIMENTAL_PNFS); - if (*len < sizeof(uuid_t)) return -EINVAL; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 417439b58785..95be67ac6eb4 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -126,14 +126,17 @@ xfs_qm_dqpurge( void *data) { struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; - int error = -EAGAIN; - xfs_dqlock(dqp); - if ((dqp->q_flags & XFS_DQFLAG_FREEING) || dqp->q_nrefs != 0) - goto out_unlock; - - dqp->q_flags |= XFS_DQFLAG_FREEING; + spin_lock(&dqp->q_lockref.lock); + if (dqp->q_lockref.count > 0 || __lockref_is_dead(&dqp->q_lockref)) { + spin_unlock(&dqp->q_lockref.lock); + return -EAGAIN; + } + lockref_mark_dead(&dqp->q_lockref); + spin_unlock(&dqp->q_lockref.lock); + mutex_lock(&dqp->q_qlock); + xfs_qm_dqunpin_wait(dqp); xfs_dqflock(dqp); /* @@ -143,6 +146,7 @@ xfs_qm_dqpurge( */ if (XFS_DQ_IS_DIRTY(dqp)) { struct xfs_buf *bp = NULL; + int error; /* * We don't care about getting disk errors here. We need @@ -150,9 +154,9 @@ xfs_qm_dqpurge( */ error = xfs_dquot_use_attached_buf(dqp, &bp); if (error == -EAGAIN) { - xfs_dqfunlock(dqp); - dqp->q_flags &= ~XFS_DQFLAG_FREEING; - goto out_unlock; + /* resurrect the refcount from the dead. */ + dqp->q_lockref.count = 0; + goto out_funlock; } if (!bp) goto out_funlock; @@ -176,7 +180,7 @@ out_funlock: !test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags)); xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); radix_tree_delete(xfs_dquot_tree(qi, xfs_dquot_type(dqp)), dqp->q_id); qi->qi_dquots--; @@ -191,10 +195,6 @@ out_funlock: xfs_qm_dqdestroy(dqp); return 0; - -out_unlock: - xfs_dqunlock(dqp); - return error; } /* @@ -287,51 +287,6 @@ xfs_qm_unmount_quotas( xfs_qm_destroy_quotainos(mp->m_quotainfo); } -STATIC int -xfs_qm_dqattach_one( - struct xfs_inode *ip, - xfs_dqtype_t type, - bool doalloc, - struct xfs_dquot **IO_idqpp) -{ - struct xfs_dquot *dqp; - int error; - - xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - error = 0; - - /* - * See if we already have it in the inode itself. IO_idqpp is &i_udquot - * or &i_gdquot. This made the code look weird, but made the logic a lot - * simpler. - */ - dqp = *IO_idqpp; - if (dqp) { - trace_xfs_dqattach_found(dqp); - return 0; - } - - /* - * Find the dquot from somewhere. This bumps the reference count of - * dquot and returns it locked. This can return ENOENT if dquot didn't - * exist on disk and we didn't ask it to allocate; ESRCH if quotas got - * turned off suddenly. - */ - error = xfs_qm_dqget_inode(ip, type, doalloc, &dqp); - if (error) - return error; - - trace_xfs_dqattach_get(dqp); - - /* - * dqget may have dropped and re-acquired the ilock, but it guarantees - * that the dquot returned is the one that should go in the inode. - */ - *IO_idqpp = dqp; - xfs_dqunlock(dqp); - return 0; -} - static bool xfs_qm_need_dqattach( struct xfs_inode *ip) @@ -371,7 +326,7 @@ xfs_qm_dqattach_locked( ASSERT(!xfs_is_metadir_inode(ip)); if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { - error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_USER, + error = xfs_qm_dqget_inode(ip, XFS_DQTYPE_USER, doalloc, &ip->i_udquot); if (error) goto done; @@ -379,7 +334,7 @@ xfs_qm_dqattach_locked( } if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { - error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_GROUP, + error = xfs_qm_dqget_inode(ip, XFS_DQTYPE_GROUP, doalloc, &ip->i_gdquot); if (error) goto done; @@ -387,7 +342,7 @@ xfs_qm_dqattach_locked( } if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { - error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_PROJ, + error = xfs_qm_dqget_inode(ip, XFS_DQTYPE_PROJ, doalloc, &ip->i_pdquot); if (error) goto done; @@ -465,8 +420,9 @@ xfs_qm_dquot_isolate( struct xfs_dquot *dqp = container_of(item, struct xfs_dquot, q_lru); struct xfs_qm_isolate *isol = arg; + enum lru_status ret = LRU_SKIP; - if (!xfs_dqlock_nowait(dqp)) + if (!spin_trylock(&dqp->q_lockref.lock)) goto out_miss_busy; /* @@ -474,15 +430,24 @@ xfs_qm_dquot_isolate( * from the LRU, leave it for the freeing task to complete the freeing * process rather than risk it being free from under us here. */ - if (dqp->q_flags & XFS_DQFLAG_FREEING) + if (__lockref_is_dead(&dqp->q_lockref)) + goto out_miss_unlock; + + /* + * If the dquot is pinned or dirty, rotate it to the end of the LRU to + * give some time for it to be cleaned before we try to isolate it + * again. + */ + ret = LRU_ROTATE; + if (XFS_DQ_IS_DIRTY(dqp) || atomic_read(&dqp->q_pincount) > 0) goto out_miss_unlock; /* * This dquot has acquired a reference in the meantime remove it from * the freelist and try again. */ - if (dqp->q_nrefs) { - xfs_dqunlock(dqp); + if (dqp->q_lockref.count) { + spin_unlock(&dqp->q_lockref.lock); XFS_STATS_INC(dqp->q_mount, xs_qm_dqwants); trace_xfs_dqreclaim_want(dqp); @@ -492,51 +457,23 @@ xfs_qm_dquot_isolate( } /* - * If the dquot is dirty, flush it. If it's already being flushed, just - * skip it so there is time for the IO to complete before we try to - * reclaim it again on the next LRU pass. + * The dquot may still be under IO, in which case the flush lock will be + * held. If we can't get the flush lock now, just skip over the dquot as + * if it was dirty. */ if (!xfs_dqflock_nowait(dqp)) goto out_miss_unlock; - if (XFS_DQ_IS_DIRTY(dqp)) { - struct xfs_buf *bp = NULL; - int error; - - trace_xfs_dqreclaim_dirty(dqp); - - /* we have to drop the LRU lock to flush the dquot */ - spin_unlock(&lru->lock); - - error = xfs_dquot_use_attached_buf(dqp, &bp); - if (!bp || error == -EAGAIN) { - xfs_dqfunlock(dqp); - goto out_unlock_dirty; - } - - /* - * dqflush completes dqflock on error, and the delwri ioend - * does it on success. - */ - error = xfs_qm_dqflush(dqp, bp); - if (error) - goto out_unlock_dirty; - - xfs_buf_delwri_queue(bp, &isol->buffers); - xfs_buf_relse(bp); - goto out_unlock_dirty; - } - + ASSERT(!XFS_DQ_IS_DIRTY(dqp)); xfs_dquot_detach_buf(dqp); xfs_dqfunlock(dqp); /* * Prevent lookups now that we are past the point of no return. */ - dqp->q_flags |= XFS_DQFLAG_FREEING; - xfs_dqunlock(dqp); + lockref_mark_dead(&dqp->q_lockref); + spin_unlock(&dqp->q_lockref.lock); - ASSERT(dqp->q_nrefs == 0); list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused); trace_xfs_dqreclaim_done(dqp); @@ -544,17 +481,11 @@ xfs_qm_dquot_isolate( return LRU_REMOVED; out_miss_unlock: - xfs_dqunlock(dqp); + spin_unlock(&dqp->q_lockref.lock); out_miss_busy: trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); - return LRU_SKIP; - -out_unlock_dirty: - trace_xfs_dqreclaim_busy(dqp); - XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); - xfs_dqunlock(dqp); - return LRU_RETRY; + return ret; } static unsigned long @@ -681,10 +612,7 @@ xfs_qm_load_metadir_qinos( struct xfs_trans *tp; int error; - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - return error; - + tp = xfs_trans_alloc_empty(mp); error = xfs_dqinode_load_parent(tp, &qi->qi_dirip); if (error == -ENOENT) { /* no quota dir directory, but we'll create one later */ @@ -1340,9 +1268,10 @@ xfs_qm_quotacheck_dqadjust( return error; } + mutex_lock(&dqp->q_qlock); error = xfs_dquot_attach_buf(NULL, dqp); if (error) - return error; + goto out_unlock; trace_xfs_dqadjust(dqp); @@ -1372,8 +1301,10 @@ xfs_qm_quotacheck_dqadjust( } dqp->q_flags |= XFS_DQFLAG_DIRTY; - xfs_qm_dqput(dqp); - return 0; +out_unlock: + mutex_unlock(&dqp->q_qlock); + xfs_qm_dqrele(dqp); + return error; } /* @@ -1486,45 +1417,19 @@ xfs_qm_flush_one( struct xfs_dquot *dqp, void *data) { - struct xfs_mount *mp = dqp->q_mount; struct list_head *buffer_list = data; struct xfs_buf *bp = NULL; int error = 0; - xfs_dqlock(dqp); - if (dqp->q_flags & XFS_DQFLAG_FREEING) - goto out_unlock; + if (!lockref_get_not_dead(&dqp->q_lockref)) + return 0; + + mutex_lock(&dqp->q_qlock); if (!XFS_DQ_IS_DIRTY(dqp)) goto out_unlock; - /* - * The only way the dquot is already flush locked by the time quotacheck - * gets here is if reclaim flushed it before the dqadjust walk dirtied - * it for the final time. Quotacheck collects all dquot bufs in the - * local delwri queue before dquots are dirtied, so reclaim can't have - * possibly queued it for I/O. The only way out is to push the buffer to - * cycle the flush lock. - */ - if (!xfs_dqflock_nowait(dqp)) { - /* buf is pinned in-core by delwri list */ - error = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0, &bp); - if (error) - goto out_unlock; - - if (!(bp->b_flags & _XBF_DELWRI_Q)) { - error = -EAGAIN; - xfs_buf_relse(bp); - goto out_unlock; - } - xfs_buf_unlock(bp); - - xfs_buf_delwri_pushbuf(bp, buffer_list); - xfs_buf_rele(bp); - - error = -EAGAIN; - goto out_unlock; - } + xfs_qm_dqunpin_wait(dqp); + xfs_dqflock(dqp); error = xfs_dquot_use_attached_buf(dqp, &bp); if (error) @@ -1539,7 +1444,8 @@ xfs_qm_flush_one( xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); out_unlock: - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); + xfs_qm_dqrele(dqp); return error; } @@ -1803,10 +1709,7 @@ xfs_qm_qino_load( struct xfs_inode *dp = NULL; int error; - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - return error; - + tp = xfs_trans_alloc_empty(mp); if (xfs_has_metadir(mp)) { error = xfs_dqinode_load_parent(tp, &dp); if (error) @@ -1958,16 +1861,12 @@ xfs_qm_vop_dqalloc( struct xfs_dquot *gq = NULL; struct xfs_dquot *pq = NULL; int error; - uint lockflags; if (!XFS_IS_QUOTA_ON(mp)) return 0; ASSERT(!xfs_is_metadir_inode(ip)); - lockflags = XFS_ILOCK_EXCL; - xfs_ilock(ip, lockflags); - if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip)) gid = inode->i_gid; @@ -1976,38 +1875,22 @@ xfs_qm_vop_dqalloc( * if necessary. The dquot(s) will not be locked. */ if (XFS_NOT_DQATTACHED(mp, ip)) { + xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_qm_dqattach_locked(ip, true); - if (error) { - xfs_iunlock(ip, lockflags); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) return error; - } } if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { ASSERT(O_udqpp); if (!uid_eq(inode->i_uid, uid)) { - /* - * What we need is the dquot that has this uid, and - * if we send the inode to dqget, the uid of the inode - * takes priority over what's sent in the uid argument. - * We must unlock inode here before calling dqget if - * we're not sending the inode, because otherwise - * we'll deadlock by doing trans_reserve while - * holding ilock. - */ - xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, from_kuid(user_ns, uid), XFS_DQTYPE_USER, true, &uq); if (error) { ASSERT(error != -ENOENT); return error; } - /* - * Get the ilock in the right order. - */ - xfs_dqunlock(uq); - lockflags = XFS_ILOCK_SHARED; - xfs_ilock(ip, lockflags); } else { /* * Take an extra reference, because we'll return @@ -2020,16 +1903,12 @@ xfs_qm_vop_dqalloc( if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { ASSERT(O_gdqpp); if (!gid_eq(inode->i_gid, gid)) { - xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, from_kgid(user_ns, gid), XFS_DQTYPE_GROUP, true, &gq); if (error) { ASSERT(error != -ENOENT); goto error_rele; } - xfs_dqunlock(gq); - lockflags = XFS_ILOCK_SHARED; - xfs_ilock(ip, lockflags); } else { ASSERT(ip->i_gdquot); gq = xfs_qm_dqhold(ip->i_gdquot); @@ -2038,16 +1917,12 @@ xfs_qm_vop_dqalloc( if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { ASSERT(O_pdqpp); if (ip->i_projid != prid) { - xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, prid, XFS_DQTYPE_PROJ, true, &pq); if (error) { ASSERT(error != -ENOENT); goto error_rele; } - xfs_dqunlock(pq); - lockflags = XFS_ILOCK_SHARED; - xfs_ilock(ip, lockflags); } else { ASSERT(ip->i_pdquot); pq = xfs_qm_dqhold(ip->i_pdquot); @@ -2055,7 +1930,6 @@ xfs_qm_vop_dqalloc( } trace_xfs_dquot_dqalloc(ip); - xfs_iunlock(ip, lockflags); if (O_udqpp) *O_udqpp = uq; else @@ -2132,7 +2006,7 @@ xfs_qm_vop_chown( * back now. */ tp->t_flags |= XFS_TRANS_DIRTY; - xfs_dqlock(prevdq); + mutex_lock(&prevdq->q_qlock); if (isrt) { ASSERT(prevdq->q_rtb.reserved >= ip->i_delayed_blks); prevdq->q_rtb.reserved -= ip->i_delayed_blks; @@ -2140,7 +2014,7 @@ xfs_qm_vop_chown( ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks); prevdq->q_blk.reserved -= ip->i_delayed_blks; } - xfs_dqunlock(prevdq); + mutex_unlock(&prevdq->q_qlock); /* * Take an extra reference, because the inode is going to keep diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 35b64bc3a7a8..e88ed6ad0e65 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -57,7 +57,7 @@ struct xfs_quotainfo { struct xfs_inode *qi_pquotaip; /* project quota inode */ struct xfs_inode *qi_dirip; /* quota metadir */ struct list_lru qi_lru; - int qi_dquots; + uint64_t qi_dquots; struct mutex qi_quotaofflock;/* to serialize quotaoff */ xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ uint qi_dqperchunk; /* # ondisk dq in above chunk */ diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 245d754f382a..edc0aef3cf34 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -73,8 +73,10 @@ xfs_qm_statvfs( struct xfs_dquot *dqp; if (!xfs_qm_dqget(mp, ip->i_projid, XFS_DQTYPE_PROJ, false, &dqp)) { + mutex_lock(&dqp->q_qlock); xfs_fill_statvfs_from_dquot(statp, ip, dqp); - xfs_qm_dqput(dqp); + mutex_unlock(&dqp->q_qlock); + xfs_qm_dqrele(dqp); } } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 0c78f30fa4a3..022e2179c06b 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -303,13 +303,12 @@ xfs_qm_scall_setqlim( } defq = xfs_get_defquota(q, xfs_dquot_type(dqp)); - xfs_dqunlock(dqp); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_setqlim, 0, 0, 0, &tp); if (error) goto out_rele; - xfs_dqlock(dqp); + mutex_lock(&dqp->q_qlock); xfs_trans_dqjoin(tp, dqp); /* @@ -459,6 +458,7 @@ xfs_qm_scall_getquota( * If everything's NULL, this dquot doesn't quite exist as far as * our utility programs are concerned. */ + mutex_lock(&dqp->q_qlock); if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { error = -ENOENT; goto out_put; @@ -467,7 +467,8 @@ xfs_qm_scall_getquota( xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst); out_put: - xfs_qm_dqput(dqp); + mutex_unlock(&dqp->q_qlock); + xfs_qm_dqrele(dqp); return error; } @@ -497,7 +498,8 @@ xfs_qm_scall_getquota_next( *id = dqp->q_id; xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst); + mutex_unlock(&dqp->q_qlock); - xfs_qm_dqput(dqp); + xfs_qm_dqrele(dqp); return error; } diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 4c7f7ce4fd2f..94fbe3d99ec7 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -65,7 +65,7 @@ xfs_fs_get_quota_state( memset(state, 0, sizeof(*state)); if (!XFS_IS_QUOTA_ON(mp)) return 0; - state->s_incoredqs = q->qi_dquots; + state->s_incoredqs = min_t(uint64_t, q->qi_dquots, UINT_MAX); if (XFS_IS_UQUOTA_ON(mp)) state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED; if (XFS_IS_UQUOTA_ENFORCED(mp)) diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index fe2d7aab8554..3728234699a2 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -78,6 +78,11 @@ xfs_cui_item_size( *nbytes += xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents); } +unsigned int xfs_cui_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_cui_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given cui log item. We use only 1 iovec, and we point that @@ -179,6 +184,11 @@ xfs_cud_item_size( *nbytes += sizeof(struct xfs_cud_log_format); } +unsigned int xfs_cud_log_space(void) +{ + return xlog_item_space(1, sizeof(struct xfs_cud_log_format)); +} + /* * This is called to fill in the vector of log iovecs for the * given cud log item. We use only 1 iovec, and we point that @@ -707,18 +717,18 @@ xlog_recover_cui_commit_pass2( struct xfs_cui_log_format *cui_formatp; size_t len; - cui_formatp = item->ri_buf[0].i_addr; + cui_formatp = item->ri_buf[0].iov_base; - if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) { + if (item->ri_buf[0].iov_len < xfs_cui_log_format_sizeof(0)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents); - if (item->ri_buf[0].i_len != len) { + if (item->ri_buf[0].iov_len != len) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } @@ -749,18 +759,18 @@ xlog_recover_rtcui_commit_pass2( struct xfs_cui_log_format *cui_formatp; size_t len; - cui_formatp = item->ri_buf[0].i_addr; + cui_formatp = item->ri_buf[0].iov_base; - if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) { + if (item->ri_buf[0].iov_len < xfs_cui_log_format_sizeof(0)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents); - if (item->ri_buf[0].i_len != len) { + if (item->ri_buf[0].iov_len != len) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } @@ -781,7 +791,7 @@ xlog_recover_rtcui_commit_pass2( xfs_lsn_t lsn) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } #endif @@ -807,10 +817,10 @@ xlog_recover_cud_commit_pass2( { struct xfs_cud_log_format *cud_formatp; - cud_formatp = item->ri_buf[0].i_addr; - if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) { + cud_formatp = item->ri_buf[0].iov_base; + if (item->ri_buf[0].iov_len != sizeof(struct xfs_cud_log_format)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } @@ -833,10 +843,10 @@ xlog_recover_rtcud_commit_pass2( { struct xfs_cud_log_format *cud_formatp; - cud_formatp = item->ri_buf[0].i_addr; - if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) { + cud_formatp = item->ri_buf[0].iov_base; + if (item->ri_buf[0].iov_len != sizeof(struct xfs_cud_log_format)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h index bfee8f30c63c..0fc3f493342b 100644 --- a/fs/xfs/xfs_refcount_item.h +++ b/fs/xfs/xfs_refcount_item.h @@ -76,4 +76,7 @@ struct xfs_refcount_intent; void xfs_refcount_defer_add(struct xfs_trans *tp, struct xfs_refcount_intent *ri); +unsigned int xfs_cui_log_space(unsigned int nr); +unsigned int xfs_cud_log_space(void); + #endif /* __XFS_REFCOUNT_ITEM_H__ */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index cc3b4df88110..3f177b4ec131 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -293,7 +293,7 @@ xfs_bmap_trim_cow( return xfs_reflink_trim_around_shared(ip, imap, shared); } -static int +int xfs_reflink_convert_cow_locked( struct xfs_inode *ip, xfs_fileoff_t offset_fsb, @@ -786,35 +786,19 @@ xfs_reflink_update_quota( * requirements as low as possible. */ STATIC int -xfs_reflink_end_cow_extent( +xfs_reflink_end_cow_extent_locked( + struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *offset_fsb, xfs_fileoff_t end_fsb) { struct xfs_iext_cursor icur; struct xfs_bmbt_irec got, del, data; - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); - unsigned int resblks; int nmaps; bool isrt = XFS_IS_REALTIME_INODE(ip); int error; - resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, - XFS_TRANS_RESERVE, &tp); - if (error) - return error; - - /* - * Lock the inode. We have to ijoin without automatic unlock because - * the lead transaction is the refcountbt record deletion; the data - * fork update follows as a deferred log item. - */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - /* * In case of racing, overlapping AIO writes no COW extents might be * left by the time I/O completes for the loser of the race. In that @@ -823,7 +807,7 @@ xfs_reflink_end_cow_extent( if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) || got.br_startoff >= end_fsb) { *offset_fsb = end_fsb; - goto out_cancel; + return 0; } /* @@ -837,7 +821,7 @@ xfs_reflink_end_cow_extent( if (!xfs_iext_next_extent(ifp, &icur, &got) || got.br_startoff >= end_fsb) { *offset_fsb = end_fsb; - goto out_cancel; + return 0; } } del = got; @@ -846,14 +830,14 @@ xfs_reflink_end_cow_extent( error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, XFS_IEXT_REFLINK_END_COW_CNT); if (error) - goto out_cancel; + return error; /* Grab the corresponding mapping in the data fork. */ nmaps = 1; error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data, &nmaps, 0); if (error) - goto out_cancel; + return error; /* We can only remap the smaller of the two extent sizes. */ data.br_blockcount = min(data.br_blockcount, del.br_blockcount); @@ -882,7 +866,7 @@ xfs_reflink_end_cow_extent( error = xfs_bunmapi(NULL, ip, data.br_startoff, data.br_blockcount, 0, 1, &done); if (error) - goto out_cancel; + return error; ASSERT(done); } @@ -899,17 +883,45 @@ xfs_reflink_end_cow_extent( /* Remove the mapping from the CoW fork. */ xfs_bmap_del_extent_cow(ip, &icur, &got, &del); - error = xfs_trans_commit(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - return error; - /* Update the caller about how much progress we made. */ *offset_fsb = del.br_startoff + del.br_blockcount; return 0; +} -out_cancel: - xfs_trans_cancel(tp); +/* + * Remap part of the CoW fork into the data fork. + * + * We aim to remap the range starting at @offset_fsb and ending at @end_fsb + * into the data fork; this function will remap what it can (at the end of the + * range) and update @end_fsb appropriately. Each remap gets its own + * transaction because we can end up merging and splitting bmbt blocks for + * every remap operation and we'd like to keep the block reservation + * requirements as low as possible. + */ +STATIC int +xfs_reflink_end_cow_extent( + struct xfs_inode *ip, + xfs_fileoff_t *offset_fsb, + xfs_fileoff_t end_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + unsigned int resblks; + int error; + + resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, + XFS_TRANS_RESERVE, &tp); + if (error) + return error; + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + error = xfs_reflink_end_cow_extent_locked(tp, ip, offset_fsb, end_fsb); + if (error) + xfs_trans_cancel(tp); + else + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -973,6 +985,78 @@ xfs_reflink_end_cow( } /* + * Fully remap all of the file's data fork at once, which is the critical part + * in achieving atomic behaviour. + * The regular CoW end path does not use function as to keep the block + * reservation per transaction as low as possible. + */ +int +xfs_reflink_end_atomic_cow( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + xfs_fileoff_t offset_fsb; + xfs_fileoff_t end_fsb; + int error = 0; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + unsigned int resblks; + + trace_xfs_reflink_end_cow(ip, offset, count); + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + end_fsb = XFS_B_TO_FSB(mp, offset + count); + + /* + * Each remapping operation could cause a btree split, so in the worst + * case that's one for each block. + */ + resblks = (end_fsb - offset_fsb) * + XFS_NEXTENTADD_SPACE_RES(mp, 1, XFS_DATA_FORK); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_atomic_ioend, resblks, 0, + XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + while (end_fsb > offset_fsb && !error) { + error = xfs_reflink_end_cow_extent_locked(tp, ip, &offset_fsb, + end_fsb); + } + if (error) { + trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); + goto out_cancel; + } + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +out_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* Compute the largest atomic write that we can complete through software. */ +xfs_extlen_t +xfs_reflink_max_atomic_cow( + struct xfs_mount *mp) +{ + /* We cannot do any atomic writes without out of place writes. */ + if (!xfs_can_sw_atomic_write(mp)) + return 0; + + /* + * Atomic write limits must always be a power-of-2, according to + * generic_atomic_write_valid. + */ + return rounddown_pow_of_two(xfs_calc_max_atomic_write_fsblocks(mp)); +} + +/* * Free all CoW staging blocks that are still referenced by the ondisk refcount * metadata. The ondisk metadata does not track which inode created the * staging extent, so callers must ensure that there are no cached inodes with @@ -1797,7 +1881,8 @@ xfs_reflink_unshare( &xfs_dax_write_iomap_ops); else error = iomap_file_unshare(inode, offset, len, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, + &xfs_iomap_write_ops); if (error) goto out; diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index cc4e92278279..9d1ed9bb0bee 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -17,7 +17,7 @@ xfs_can_free_cowblocks(struct xfs_inode *ip) { struct inode *inode = VFS_I(ip); - if ((inode->i_state & I_DIRTY_PAGES) || + if ((inode_state_read_once(inode) & I_DIRTY_PAGES) || mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) || mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK) || atomic_read(&inode->i_dio_count)) @@ -35,6 +35,8 @@ int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool convert_now); extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); +int xfs_reflink_convert_cow_locked(struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, xfs_filblks_t count_fsb); extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip, struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, @@ -43,6 +45,8 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count, bool cancel_real); extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); +int xfs_reflink_end_atomic_cow(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t count); extern int xfs_reflink_recover_cow(struct xfs_mount *mp); extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, @@ -64,4 +68,6 @@ extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen, bool xfs_reflink_supports_rextsize(struct xfs_mount *mp, unsigned int rextsize); +xfs_extlen_t xfs_reflink_max_atomic_cow(struct xfs_mount *mp); + #endif /* __XFS_REFLINK_H */ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 89decffe76c8..15f0903f6fd4 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -77,6 +77,11 @@ xfs_rui_item_size( *nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents); } +unsigned int xfs_rui_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_rui_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given rui log item. We use only 1 iovec, and we point that @@ -180,6 +185,11 @@ xfs_rud_item_size( *nbytes += sizeof(struct xfs_rud_log_format); } +unsigned int xfs_rud_log_space(void) +{ + return xlog_item_space(1, sizeof(struct xfs_rud_log_format)); +} + /* * This is called to fill in the vector of log iovecs for the * given rud log item. We use only 1 iovec, and we point that @@ -736,18 +746,18 @@ xlog_recover_rui_commit_pass2( struct xfs_rui_log_format *rui_formatp; size_t len; - rui_formatp = item->ri_buf[0].i_addr; + rui_formatp = item->ri_buf[0].iov_base; - if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) { + if (item->ri_buf[0].iov_len < xfs_rui_log_format_sizeof(0)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents); - if (item->ri_buf[0].i_len != len) { + if (item->ri_buf[0].iov_len != len) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } @@ -778,18 +788,18 @@ xlog_recover_rtrui_commit_pass2( struct xfs_rui_log_format *rui_formatp; size_t len; - rui_formatp = item->ri_buf[0].i_addr; + rui_formatp = item->ri_buf[0].iov_base; - if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) { + if (item->ri_buf[0].iov_len < xfs_rui_log_format_sizeof(0)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents); - if (item->ri_buf[0].i_len != len) { + if (item->ri_buf[0].iov_len != len) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } @@ -810,7 +820,7 @@ xlog_recover_rtrui_commit_pass2( xfs_lsn_t lsn) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, - item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + item->ri_buf[0].iov_base, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } #endif @@ -836,10 +846,10 @@ xlog_recover_rud_commit_pass2( { struct xfs_rud_log_format *rud_formatp; - rud_formatp = item->ri_buf[0].i_addr; - if (item->ri_buf[0].i_len != sizeof(struct xfs_rud_log_format)) { + rud_formatp = item->ri_buf[0].iov_base; + if (item->ri_buf[0].iov_len != sizeof(struct xfs_rud_log_format)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, - rud_formatp, item->ri_buf[0].i_len); + rud_formatp, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } @@ -862,10 +872,10 @@ xlog_recover_rtrud_commit_pass2( { struct xfs_rud_log_format *rud_formatp; - rud_formatp = item->ri_buf[0].i_addr; - if (item->ri_buf[0].i_len != sizeof(struct xfs_rud_log_format)) { + rud_formatp = item->ri_buf[0].iov_base; + if (item->ri_buf[0].iov_len != sizeof(struct xfs_rud_log_format)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp, - rud_formatp, item->ri_buf[0].i_len); + rud_formatp, item->ri_buf[0].iov_len); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h index 40d331555675..3a99f0117f2d 100644 --- a/fs/xfs/xfs_rmap_item.h +++ b/fs/xfs/xfs_rmap_item.h @@ -75,4 +75,7 @@ struct xfs_rmap_intent; void xfs_rmap_defer_add(struct xfs_trans *tp, struct xfs_rmap_intent *ri); +unsigned int xfs_rui_log_space(unsigned int nr); +unsigned int xfs_rud_log_space(void); + #endif /* __XFS_RMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 6484c596ecea..e063f4f2f2e6 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -729,9 +729,7 @@ xfs_rtginode_ensure( if (rtg->rtg_inodes[type]) return 0; - error = xfs_trans_alloc_empty(rtg_mount(rtg), &tp); - if (error) - return error; + tp = xfs_trans_alloc_empty(rtg_mount(rtg)); error = xfs_rtginode_load(rtg, type, tp); xfs_trans_cancel(tp); @@ -1257,10 +1255,10 @@ xfs_growfs_check_rtgeom( min_logfsbs = min_t(xfs_extlen_t, xfs_log_calc_minimum_size(nmp), nmp->m_rsumblocks * 2); - kfree(nmp); + trace_xfs_growfs_check_rtgeom(mp, min_logfsbs); if (min_logfsbs > mp->m_sb.sb_logblocks) - return -EINVAL; + goto out_inval; if (xfs_has_zoned(mp)) { uint32_t gblocks = mp->m_groups[XG_TYPE_RTG].blocks; @@ -1268,16 +1266,20 @@ xfs_growfs_check_rtgeom( if (rextsize != 1) return -EINVAL; - div_u64_rem(mp->m_sb.sb_rblocks, gblocks, &rem); + div_u64_rem(nmp->m_sb.sb_rblocks, gblocks, &rem); if (rem) { xfs_warn(mp, "new RT volume size (%lld) not aligned to RT group size (%d)", - mp->m_sb.sb_rblocks, gblocks); - return -EINVAL; + nmp->m_sb.sb_rblocks, gblocks); + goto out_inval; } } + kfree(nmp); return 0; +out_inval: + kfree(nmp); + return -EINVAL; } /* @@ -1303,9 +1305,7 @@ xfs_growfs_rt_prep_groups( if (!mp->m_rtdirip) { struct xfs_trans *tp; - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - return error; + tp = xfs_trans_alloc_empty(mp); error = xfs_rtginode_load_parent(tp); xfs_trans_cancel(tp); @@ -1672,10 +1672,7 @@ xfs_rtmount_inodes( struct xfs_rtgroup *rtg = NULL; int error; - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - return error; - + tp = xfs_trans_alloc_empty(mp); if (xfs_has_rtgroups(mp) && mp->m_sb.sb_rgcount > 0) { error = xfs_rtginode_load_parent(tp); if (error) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index b2dd0c0bf509..bc71aa9dcee8 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -102,19 +102,33 @@ static const struct constant_table dax_param_enums[] = { * Table driven mount option parser. */ enum { - Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, + Op_deprecated, Opt_logbufs, Opt_logbsize, Opt_logdev, Opt_rtdev, Opt_wsync, Opt_noalign, Opt_swalloc, Opt_sunit, Opt_swidth, Opt_nouuid, Opt_grpid, Opt_nogrpid, Opt_bsdgroups, Opt_sysvgroups, - Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, Opt_ikeep, - Opt_noikeep, Opt_largeio, Opt_nolargeio, Opt_attr2, Opt_noattr2, + Opt_allocsize, Opt_norecovery, Opt_inode64, Opt_inode32, + Opt_largeio, Opt_nolargeio, Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones, - Opt_lifetime, Opt_nolifetime, + Opt_lifetime, Opt_nolifetime, Opt_max_atomic_write, }; +#define fsparam_dead(NAME) \ + __fsparam(NULL, (NAME), Op_deprecated, fs_param_deprecated, NULL) + static const struct fs_parameter_spec xfs_fs_parameters[] = { + /* + * These mount options were supposed to be deprecated in September 2025 + * but the deprecation warning was buggy, so not all users were + * notified. The deprecation is now obnoxiously loud and postponed to + * September 2030. + */ + fsparam_dead("attr2"), + fsparam_dead("noattr2"), + fsparam_dead("ikeep"), + fsparam_dead("noikeep"), + fsparam_u32("logbufs", Opt_logbufs), fsparam_string("logbsize", Opt_logbsize), fsparam_string("logdev", Opt_logdev), @@ -133,12 +147,8 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = { fsparam_flag("norecovery", Opt_norecovery), fsparam_flag("inode64", Opt_inode64), fsparam_flag("inode32", Opt_inode32), - fsparam_flag("ikeep", Opt_ikeep), - fsparam_flag("noikeep", Opt_noikeep), fsparam_flag("largeio", Opt_largeio), fsparam_flag("nolargeio", Opt_nolargeio), - fsparam_flag("attr2", Opt_attr2), - fsparam_flag("noattr2", Opt_noattr2), fsparam_flag("filestreams", Opt_filestreams), fsparam_flag("quota", Opt_quota), fsparam_flag("noquota", Opt_noquota), @@ -159,6 +169,7 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = { fsparam_u32("max_open_zones", Opt_max_open_zones), fsparam_flag("lifetime", Opt_lifetime), fsparam_flag("nolifetime", Opt_nolifetime), + fsparam_string("max_atomic_write", Opt_max_atomic_write), {} }; @@ -174,13 +185,11 @@ xfs_fs_show_options( { static struct proc_xfs_info xfs_info_set[] = { /* the few simple ones we can get from the mount struct */ - { XFS_FEAT_IKEEP, ",ikeep" }, { XFS_FEAT_WSYNC, ",wsync" }, { XFS_FEAT_NOALIGN, ",noalign" }, { XFS_FEAT_SWALLOC, ",swalloc" }, { XFS_FEAT_NOUUID, ",nouuid" }, { XFS_FEAT_NORECOVERY, ",norecovery" }, - { XFS_FEAT_ATTR2, ",attr2" }, { XFS_FEAT_FILESTREAMS, ",filestreams" }, { XFS_FEAT_GRPID, ",grpid" }, { XFS_FEAT_DISCARD, ",discard" }, @@ -241,6 +250,9 @@ xfs_fs_show_options( if (mp->m_max_open_zones) seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones); + if (mp->m_awu_max_bytes) + seq_printf(m, ",max_atomic_write=%lluk", + mp->m_awu_max_bytes >> 10); return 0; } @@ -380,10 +392,11 @@ xfs_blkdev_get( struct file **bdev_filep) { int error = 0; + blk_mode_t mode; - *bdev_filep = bdev_file_open_by_path(name, - BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, - mp->m_super, &fs_holder_ops); + mode = sb_open_mode(mp->m_super->s_flags); + *bdev_filep = bdev_file_open_by_path(name, mode, + mp->m_super, &fs_holder_ops); if (IS_ERR(*bdev_filep)) { error = PTR_ERR(*bdev_filep); *bdev_filep = NULL; @@ -481,21 +494,29 @@ xfs_open_devices( /* * Setup xfs_mount buffer target pointers */ - error = -ENOMEM; mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_file); - if (!mp->m_ddev_targp) + if (IS_ERR(mp->m_ddev_targp)) { + error = PTR_ERR(mp->m_ddev_targp); + mp->m_ddev_targp = NULL; goto out_close_rtdev; + } if (rtdev_file) { mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_file); - if (!mp->m_rtdev_targp) + if (IS_ERR(mp->m_rtdev_targp)) { + error = PTR_ERR(mp->m_rtdev_targp); + mp->m_rtdev_targp = NULL; goto out_free_ddev_targ; + } } if (logdev_file && file_bdev(logdev_file) != ddev) { mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_file); - if (!mp->m_logdev_targp) + if (IS_ERR(mp->m_logdev_targp)) { + error = PTR_ERR(mp->m_logdev_targp); + mp->m_logdev_targp = NULL; goto out_free_rtdev_targ; + } } else { mp->m_logdev_targp = mp->m_ddev_targp; /* Handle won't be used, drop it */ @@ -528,7 +549,8 @@ xfs_setup_devices( { int error; - error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize); + error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize, + mp->m_sb.sb_dblocks); if (error) return error; @@ -537,8 +559,8 @@ xfs_setup_devices( if (xfs_has_sector(mp)) log_sector_size = mp->m_sb.sb_logsectsize; - error = xfs_setsize_buftarg(mp->m_logdev_targp, - log_sector_size); + error = xfs_configure_buftarg(mp->m_logdev_targp, + log_sector_size, mp->m_sb.sb_logblocks); if (error) return error; } @@ -551,8 +573,8 @@ xfs_setup_devices( } mp->m_rtdev_targp = mp->m_ddev_targp; } else if (mp->m_rtname) { - error = xfs_setsize_buftarg(mp->m_rtdev_targp, - mp->m_sb.sb_sectsize); + error = xfs_configure_buftarg(mp->m_rtdev_targp, + mp->m_sb.sb_sectsize, mp->m_sb.sb_rblocks); if (error) return error; } @@ -565,19 +587,19 @@ xfs_init_mount_workqueues( struct xfs_mount *mp) { mp->m_buf_workqueue = alloc_workqueue("xfs-buf/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), 1, mp->m_super->s_id); if (!mp->m_buf_workqueue) goto out; mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), 0, mp->m_super->s_id); if (!mp->m_unwritten_workqueue) goto out_destroy_buf; mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), 0, mp->m_super->s_id); if (!mp->m_reclaim_workqueue) goto out_destroy_unwritten; @@ -589,13 +611,14 @@ xfs_init_mount_workqueues( goto out_destroy_reclaim; mp->m_inodegc_wq = alloc_workqueue("xfs-inodegc/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_PERCPU), 1, mp->m_super->s_id); if (!mp->m_inodegc_wq) goto out_destroy_blockgc; mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", - XFS_WQFLAGS(WQ_FREEZABLE), 0, mp->m_super->s_id); + XFS_WQFLAGS(WQ_FREEZABLE | WQ_PERCPU), 0, + mp->m_super->s_id); if (!mp->m_sync_workqueue) goto out_destroy_inodegc; @@ -765,7 +788,7 @@ xfs_fs_drop_inode( return 0; } - return generic_drop_inode(inode); + return inode_generic_drop(inode); } STATIC void @@ -777,6 +800,12 @@ xfs_fs_evict_inode( truncate_inode_pages_final(&inode->i_data); clear_inode(inode); + + if (IS_ENABLED(CONFIG_XFS_RT) && + S_ISREG(inode->i_mode) && inode->i_private) { + xfs_open_zone_put(inode->i_private); + inode->i_private = NULL; + } } static void @@ -1075,15 +1104,6 @@ xfs_finish_flags( } /* - * V5 filesystems always use attr2 format for attributes. - */ - if (xfs_has_crc(mp) && xfs_has_noattr2(mp)) { - xfs_warn(mp, "Cannot mount a V5 filesystem as noattr2. " - "attr2 is always enabled for V5 filesystems."); - return -EINVAL; - } - - /* * prohibit r/w mounts of read-only filesystems */ if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !xfs_is_readonly(mp)) { @@ -1149,7 +1169,7 @@ xfs_init_percpu_counters( return 0; free_freecounters: - while (--i > 0) + while (--i >= 0) percpu_counter_destroy(&mp->m_free[i].count); percpu_counter_destroy(&mp->m_delalloc_rtextents); free_delalloc: @@ -1334,19 +1354,64 @@ suffix_kstrtoint( return ret; } +static int +suffix_kstrtoull( + const char *s, + unsigned int base, + unsigned long long *res) +{ + int last, shift_left_factor = 0; + unsigned long long _res; + char *value; + int ret = 0; + + value = kstrdup(s, GFP_KERNEL); + if (!value) + return -ENOMEM; + + last = strlen(value) - 1; + if (value[last] == 'K' || value[last] == 'k') { + shift_left_factor = 10; + value[last] = '\0'; + } + if (value[last] == 'M' || value[last] == 'm') { + shift_left_factor = 20; + value[last] = '\0'; + } + if (value[last] == 'G' || value[last] == 'g') { + shift_left_factor = 30; + value[last] = '\0'; + } + + if (kstrtoull(value, base, &_res)) + ret = -EINVAL; + kfree(value); + *res = _res << shift_left_factor; + return ret; +} + static inline void xfs_fs_warn_deprecated( struct fs_context *fc, - struct fs_parameter *param, - uint64_t flag, - bool value) + struct fs_parameter *param) { - /* Don't print the warning if reconfiguring and current mount point - * already had the flag set + /* + * Always warn about someone passing in a deprecated mount option. + * Previously we wouldn't print the warning if we were reconfiguring + * and current mount point already had the flag set, but that was not + * the right thing to do. + * + * Many distributions mount the root filesystem with no options in the + * initramfs and rely on mount -a to remount the root fs with the + * options in fstab. However, the old behavior meant that there would + * never be a warning about deprecated mount options for the root fs in + * /etc/fstab. On a single-fs system, that means no warning at all. + * + * Compounding this problem are distribution scripts that copy + * /proc/mounts to fstab, which means that we can't remove mount + * options unless we're 100% sure they have only ever been advertised + * in /proc/mounts in response to explicitly provided mount options. */ - if ((fc->purpose & FS_CONTEXT_FOR_RECONFIGURE) && - !!(XFS_M(fc->root->d_sb)->m_features & flag) == value) - return; xfs_warn(fc->s_fs_info, "%s mount option is deprecated.", param->key); } @@ -1372,6 +1437,9 @@ xfs_fs_parse_param( return opt; switch (opt) { + case Op_deprecated: + xfs_fs_warn_deprecated(fc, param); + return 0; case Opt_logbufs: parsing_mp->m_logbufs = result.uint_32; return 0; @@ -1492,23 +1560,6 @@ xfs_fs_parse_param( xfs_mount_set_dax_mode(parsing_mp, result.uint_32); return 0; #endif - /* Following mount options will be removed in September 2025 */ - case Opt_ikeep: - xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, true); - parsing_mp->m_features |= XFS_FEAT_IKEEP; - return 0; - case Opt_noikeep: - xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, false); - parsing_mp->m_features &= ~XFS_FEAT_IKEEP; - return 0; - case Opt_attr2: - xfs_fs_warn_deprecated(fc, param, XFS_FEAT_ATTR2, true); - parsing_mp->m_features |= XFS_FEAT_ATTR2; - return 0; - case Opt_noattr2: - xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true); - parsing_mp->m_features |= XFS_FEAT_NOATTR2; - return 0; case Opt_max_open_zones: parsing_mp->m_max_open_zones = result.uint_32; return 0; @@ -1518,6 +1569,14 @@ xfs_fs_parse_param( case Opt_nolifetime: parsing_mp->m_features |= XFS_FEAT_NOLIFETIME; return 0; + case Opt_max_atomic_write: + if (suffix_kstrtoull(param->string, 10, + &parsing_mp->m_awu_max_bytes)) { + xfs_warn(parsing_mp, + "max atomic write size must be positive integer"); + return -EINVAL; + } + return 0; default: xfs_warn(parsing_mp, "unknown mount option [%s].", param->key); return -EINVAL; @@ -1536,16 +1595,6 @@ xfs_fs_validate_params( return -EINVAL; } - /* - * We have not read the superblock at this point, so only the attr2 - * mount option can set the attr2 feature by this stage. - */ - if (xfs_has_attr2(mp) && xfs_has_noattr2(mp)) { - xfs_warn(mp, "attr2 and noattr2 cannot both be specified."); - return -EINVAL; - } - - if (xfs_has_noalign(mp) && (mp->m_dalign || mp->m_swidth)) { xfs_warn(mp, "sunit and swidth options incompatible with the noalign option"); @@ -1644,7 +1693,10 @@ xfs_fs_fill_super( if (error) return error; - sb_min_blocksize(sb, BBSIZE); + if (!sb_min_blocksize(sb, BBSIZE)) { + xfs_err(mp, "unable to set blocksize"); + return -EINVAL; + } sb->s_xattr = xfs_xattr_handlers; sb->s_export_op = &xfs_export_operations; #ifdef CONFIG_XFS_QUOTA @@ -1897,13 +1949,6 @@ xfs_fs_fill_super( } } - - if (xfs_has_exchange_range(mp)) - xfs_warn_experimental(mp, XFS_EXPERIMENTAL_EXCHRANGE); - - if (xfs_has_parent(mp)) - xfs_warn_experimental(mp, XFS_EXPERIMENTAL_PPTR); - /* * If no quota mount options were provided, maybe we'll try to pick * up the quota accounting and enforcement flags from the ondisk sb. @@ -1969,6 +2014,19 @@ xfs_remount_rw( struct xfs_sb *sbp = &mp->m_sb; int error; + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp && + xfs_readonly_buftarg(mp->m_logdev_targp)) { + xfs_warn(mp, + "ro->rw transition prohibited by read-only logdev"); + return -EACCES; + } + + if (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp)) { + xfs_warn(mp, + "ro->rw transition prohibited by read-only rtdev"); + return -EACCES; + } + if (xfs_has_norecovery(mp)) { xfs_warn(mp, "ro->rw transition prohibited on norecovery mount"); @@ -2114,6 +2172,14 @@ xfs_fs_reconfigure( if (error) return error; + /* Validate new max_atomic_write option before making other changes */ + if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) { + error = xfs_set_max_atomic_write_opt(mp, + new_mp->m_awu_max_bytes); + if (error) + return error; + } + /* inode32 -> inode64 */ if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) { mp->m_features &= ~XFS_FEAT_SMALL_INUMS; @@ -2126,6 +2192,17 @@ xfs_fs_reconfigure( mp->m_maxagi = xfs_set_inode_alloc(mp, mp->m_sb.sb_agcount); } + /* + * Now that mp has been modified according to the remount options, we + * do a final option validation with xfs_finish_flags() just like it is + * just like it is done during mount. We cannot use + * done during mount. We cannot use xfs_finish_flags() on new_mp as it + * contains only the user given options. + */ + error = xfs_finish_flags(mp); + if (error) + return error; + /* ro -> rw */ if (xfs_is_readonly(mp) && !(flags & SB_RDONLY)) { error = xfs_remount_rw(mp); @@ -2178,7 +2255,7 @@ xfs_init_fs_context( struct xfs_mount *mp; int i; - mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL | __GFP_NOFAIL); + mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); if (!mp) return -ENOMEM; @@ -2499,8 +2576,8 @@ xfs_init_workqueues(void) * AGs in all the filesystems mounted. Hence use the default large * max_active value for this workqueue. */ - xfs_alloc_wq = alloc_workqueue("xfsalloc", - XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE), 0); + xfs_alloc_wq = alloc_workqueue("xfsalloc", XFS_WQFLAGS(WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_PERCPU), + 0); if (!xfs_alloc_wq) return -ENOMEM; diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 751dc74a3067..9918f14b4874 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -50,7 +50,7 @@ xfs_panic_mask_proc_handler( } #endif /* CONFIG_PROC_FS */ -STATIC int +static inline int xfs_deprecated_dointvec_minmax( const struct ctl_table *ctl, int write, @@ -68,24 +68,6 @@ xfs_deprecated_dointvec_minmax( static const struct ctl_table xfs_table[] = { { - .procname = "irix_sgid_inherit", - .data = &xfs_params.sgid_inherit.val, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = xfs_deprecated_dointvec_minmax, - .extra1 = &xfs_params.sgid_inherit.min, - .extra2 = &xfs_params.sgid_inherit.max - }, - { - .procname = "irix_symlink_mode", - .data = &xfs_params.symlink_mode.val, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = xfs_deprecated_dointvec_minmax, - .extra1 = &xfs_params.symlink_mode.min, - .extra2 = &xfs_params.symlink_mode.max - }, - { .procname = "panic_mask", .data = &xfs_params.panic_mask.val, .maxlen = sizeof(int), @@ -185,15 +167,6 @@ static const struct ctl_table xfs_table[] = { .extra1 = &xfs_params.blockgc_timer.min, .extra2 = &xfs_params.blockgc_timer.max, }, - { - .procname = "speculative_cow_prealloc_lifetime", - .data = &xfs_params.blockgc_timer.val, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = xfs_deprecated_dointvec_minmax, - .extra1 = &xfs_params.blockgc_timer.min, - .extra2 = &xfs_params.blockgc_timer.max, - }, /* please keep this the last entry */ #ifdef CONFIG_PROC_FS { diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 276696a07040..ed9d896079c1 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -19,9 +19,6 @@ typedef struct xfs_sysctl_val { } xfs_sysctl_val_t; typedef struct xfs_param { - xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is - * not a member of parent dir GID. */ - xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */ xfs_sysctl_val_t panic_mask; /* bitmask to cause panic on errors. */ xfs_sysctl_val_t error_level; /* Degree of reporting for problems */ xfs_sysctl_val_t syncd_timer; /* Interval between xfssyncd wakeups */ @@ -29,8 +26,6 @@ typedef struct xfs_param { xfs_sysctl_val_t inherit_sync; /* Inherit the "sync" inode flag. */ xfs_sysctl_val_t inherit_nodump;/* Inherit the "nodump" inode flag. */ xfs_sysctl_val_t inherit_noatim;/* Inherit the "noatime" inode flag. */ - xfs_sysctl_val_t xfs_buf_timer; /* Interval between xfsbufd wakeups. */ - xfs_sysctl_val_t xfs_buf_age; /* Metadata buffer age before flush. */ xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */ xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index b7e82d85f043..7a5c5ef2db92 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -718,8 +718,40 @@ max_open_zones_show( } XFS_SYSFS_ATTR_RO(max_open_zones); +static ssize_t +zonegc_low_space_store( + struct kobject *kobj, + const char *buf, + size_t count) +{ + int ret; + unsigned int val; + + ret = kstrtouint(buf, 0, &val); + if (ret) + return ret; + + if (val > 100) + return -EINVAL; + + zoned_to_mp(kobj)->m_zonegc_low_space = val; + + return count; +} + +static ssize_t +zonegc_low_space_show( + struct kobject *kobj, + char *buf) +{ + return sysfs_emit(buf, "%u\n", + zoned_to_mp(kobj)->m_zonegc_low_space); +} +XFS_SYSFS_ATTR_RW(zonegc_low_space); + static struct attribute *xfs_zoned_attrs[] = { ATTR_LIST(max_open_zones), + ATTR_LIST(zonegc_low_space), NULL, }; ATTRIBUTE_GROUPS(xfs_zoned); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e56ba1963160..f70afbf3cb19 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -170,6 +170,96 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list); DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list); +TRACE_EVENT(xfs_calc_atomic_write_unit_max, + TP_PROTO(struct xfs_mount *mp, enum xfs_group_type type, + unsigned int max_write, unsigned int max_ioend, + unsigned int max_gsize, unsigned int awu_max), + TP_ARGS(mp, type, max_write, max_ioend, max_gsize, awu_max), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(enum xfs_group_type, type) + __field(unsigned int, max_write) + __field(unsigned int, max_ioend) + __field(unsigned int, max_gsize) + __field(unsigned int, awu_max) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->type = type; + __entry->max_write = max_write; + __entry->max_ioend = max_ioend; + __entry->max_gsize = max_gsize; + __entry->awu_max = awu_max; + ), + TP_printk("dev %d:%d %s max_write %u max_ioend %u max_gsize %u awu_max %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XG_TYPE_STRINGS), + __entry->max_write, + __entry->max_ioend, + __entry->max_gsize, + __entry->awu_max) +); + +TRACE_EVENT(xfs_calc_max_atomic_write_fsblocks, + TP_PROTO(struct xfs_mount *mp, unsigned int per_intent, + unsigned int step_size, unsigned int logres, + unsigned int blockcount), + TP_ARGS(mp, per_intent, step_size, logres, blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, per_intent) + __field(unsigned int, step_size) + __field(unsigned int, logres) + __field(unsigned int, blockcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->per_intent = per_intent; + __entry->step_size = step_size; + __entry->logres = logres; + __entry->blockcount = blockcount; + ), + TP_printk("dev %d:%d per_intent %u step_size %u logres %u blockcount %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->per_intent, + __entry->step_size, + __entry->logres, + __entry->blockcount) +); + +TRACE_EVENT(xfs_calc_max_atomic_write_log_geometry, + TP_PROTO(struct xfs_mount *mp, unsigned int per_intent, + unsigned int step_size, unsigned int blockcount, + unsigned int min_logblocks, unsigned int logres), + TP_ARGS(mp, per_intent, step_size, blockcount, min_logblocks, logres), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, per_intent) + __field(unsigned int, step_size) + __field(unsigned int, blockcount) + __field(unsigned int, min_logblocks) + __field(unsigned int, cur_logblocks) + __field(unsigned int, logres) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->per_intent = per_intent; + __entry->step_size = step_size; + __entry->blockcount = blockcount; + __entry->min_logblocks = min_logblocks; + __entry->cur_logblocks = mp->m_sb.sb_logblocks; + __entry->logres = logres; + ), + TP_printk("dev %d:%d per_intent %u step_size %u blockcount %u min_logblocks %u logblocks %u logres %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->per_intent, + __entry->step_size, + __entry->blockcount, + __entry->min_logblocks, + __entry->cur_logblocks, + __entry->logres) +); + TRACE_EVENT(xlog_intent_recovery_failed, TP_PROTO(struct xfs_mount *mp, const struct xfs_defer_op_type *ops, int error), @@ -335,8 +425,8 @@ DECLARE_EVENT_CLASS(xfs_zone_alloc_class, __field(dev_t, dev) __field(xfs_rgnumber_t, rgno) __field(xfs_rgblock_t, used) + __field(xfs_rgblock_t, allocated) __field(xfs_rgblock_t, written) - __field(xfs_rgblock_t, write_pointer) __field(xfs_rgblock_t, rgbno) __field(xfs_extlen_t, len) ), @@ -344,17 +434,17 @@ DECLARE_EVENT_CLASS(xfs_zone_alloc_class, __entry->dev = rtg_mount(oz->oz_rtg)->m_super->s_dev; __entry->rgno = rtg_rgno(oz->oz_rtg); __entry->used = rtg_rmap(oz->oz_rtg)->i_used_blocks; + __entry->allocated = oz->oz_allocated; __entry->written = oz->oz_written; - __entry->write_pointer = oz->oz_write_pointer; __entry->rgbno = rgbno; __entry->len = len; ), - TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x rgbno 0x%x len 0x%x", + TP_printk("dev %d:%d rgno 0x%x used 0x%x alloced 0x%x written 0x%x rgbno 0x%x len 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rgno, __entry->used, + __entry->allocated, __entry->written, - __entry->write_pointer, __entry->rgbno, __entry->len) ); @@ -365,6 +455,7 @@ DEFINE_EVENT(xfs_zone_alloc_class, name, \ xfs_extlen_t len), \ TP_ARGS(oz, rgbno, len)) DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks); +DEFINE_ZONE_ALLOC_EVENT(xfs_zone_skip_blocks); DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks); TRACE_EVENT(xfs_zone_gc_select_victim, @@ -685,7 +776,6 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done); DEFINE_BUF_EVENT(xfs_buf_delwri_queue); DEFINE_BUF_EVENT(xfs_buf_delwri_queued); DEFINE_BUF_EVENT(xfs_buf_delwri_split); -DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf); DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_buf_item_relse); DEFINE_BUF_EVENT(xfs_buf_iodone_async); @@ -990,7 +1080,9 @@ DEFINE_INODE_EVENT(xfs_get_acl); #endif DEFINE_INODE_EVENT(xfs_vm_bmap); DEFINE_INODE_EVENT(xfs_file_ioctl); +#ifdef CONFIG_COMPAT DEFINE_INODE_EVENT(xfs_file_compat_ioctl); +#endif DEFINE_INODE_EVENT(xfs_ioctl_setattr); DEFINE_INODE_EVENT(xfs_dir_fsync); DEFINE_INODE_EVENT(xfs_file_fsync); @@ -1054,20 +1146,23 @@ DECLARE_EVENT_CLASS(xfs_iref_class, __field(xfs_ino_t, ino) __field(int, count) __field(int, pincount) + __field(unsigned long, iflags) __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->count = atomic_read(&VFS_I(ip)->i_count); + __entry->count = icount_read(VFS_I(ip)); __entry->pincount = atomic_read(&ip->i_pincount); + __entry->iflags = ip->i_flags; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pS", + TP_printk("dev %d:%d ino 0x%llx count %d pincount %d iflags 0x%lx caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->count, __entry->pincount, + __entry->iflags, (char *)__entry->caller_ip) ) @@ -1157,6 +1252,8 @@ DEFINE_IREF_EVENT(xfs_irele); DEFINE_IREF_EVENT(xfs_inode_pin); DEFINE_IREF_EVENT(xfs_inode_unpin); DEFINE_IREF_EVENT(xfs_inode_unpin_nowait); +DEFINE_IREF_EVENT(xfs_inode_push_pinned); +DEFINE_IREF_EVENT(xfs_inode_push_stale); DECLARE_EVENT_CLASS(xfs_namespace_class, TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name), @@ -1253,7 +1350,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class, __entry->id = dqp->q_id; __entry->type = dqp->q_type; __entry->flags = dqp->q_flags; - __entry->nrefs = dqp->q_nrefs; + __entry->nrefs = data_race(dqp->q_lockref.count); __entry->res_bcount = dqp->q_blk.reserved; __entry->res_rtbcount = dqp->q_rtb.reserved; @@ -1300,10 +1397,8 @@ DEFINE_EVENT(xfs_dquot_class, name, \ TP_ARGS(dqp)) DEFINE_DQUOT_EVENT(xfs_dqadjust); DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); -DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty); DEFINE_DQUOT_EVENT(xfs_dqreclaim_busy); DEFINE_DQUOT_EVENT(xfs_dqreclaim_done); -DEFINE_DQUOT_EVENT(xfs_dqattach_found); DEFINE_DQUOT_EVENT(xfs_dqattach_get); DEFINE_DQUOT_EVENT(xfs_dqalloc); DEFINE_DQUOT_EVENT(xfs_dqtobp_read); @@ -1313,9 +1408,8 @@ DEFINE_DQUOT_EVENT(xfs_dqget_hit); DEFINE_DQUOT_EVENT(xfs_dqget_miss); DEFINE_DQUOT_EVENT(xfs_dqget_freeing); DEFINE_DQUOT_EVENT(xfs_dqget_dup); -DEFINE_DQUOT_EVENT(xfs_dqput); -DEFINE_DQUOT_EVENT(xfs_dqput_free); DEFINE_DQUOT_EVENT(xfs_dqrele); +DEFINE_DQUOT_EVENT(xfs_dqrele_free); DEFINE_DQUOT_EVENT(xfs_dqflush); DEFINE_DQUOT_EVENT(xfs_dqflush_force); DEFINE_DQUOT_EVENT(xfs_dqflush_done); @@ -1505,7 +1599,6 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant); DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_sub); DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_exit); DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait); -DEFINE_LOGGRANT_EVENT(xfs_log_cil_return); DECLARE_EVENT_CLASS(xfs_log_item_class, TP_PROTO(struct xfs_log_item *lip), @@ -1561,6 +1654,8 @@ DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing); DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_mark); DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_skip); DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_unpin); +DEFINE_LOG_ITEM_EVENT(xlog_ail_insert_abort); +DEFINE_LOG_ITEM_EVENT(xfs_trans_free_abort); DECLARE_EVENT_CLASS(xfs_ail_class, TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), @@ -1657,6 +1752,28 @@ DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_dax_write); DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write); +TRACE_EVENT(xfs_iomap_atomic_write_cow, + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), + TP_ARGS(ip, offset, count), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_off_t, offset) + __field(ssize_t, count) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->offset = offset; + __entry->count = count; + ), + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx bytecount 0x%zx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->offset, + __entry->count) +) + DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, int whichfork, struct xfs_bmbt_irec *irec), @@ -1744,8 +1861,6 @@ DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize); DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof); DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write); -DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten); -DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append); DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read); DEFINE_SIMPLE_IO_EVENT(xfs_zoned_map_blocks); @@ -1778,31 +1893,6 @@ DEFINE_EVENT(xfs_itrunc_class, name, \ DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start); DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end); -TRACE_EVENT(xfs_pagecache_inval, - TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish), - TP_ARGS(ip, start, finish), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(xfs_fsize_t, size) - __field(xfs_off_t, start) - __field(xfs_off_t, finish) - ), - TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->size = ip->i_disk_size; - __entry->start = start; - __entry->finish = finish; - ), - TP_printk("dev %d:%d ino 0x%llx disize 0x%llx start 0x%llx finish 0x%llx", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->size, - __entry->start, - __entry->finish) -); - TRACE_EVENT(xfs_bunmap, TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t fileoff, xfs_filblks_t len, int flags, unsigned long caller_ip), @@ -2148,14 +2238,12 @@ DEFINE_EVENT(xfs_alloc_class, name, \ DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound); DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); -DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); DEFINE_ALLOC_EVENT(xfs_alloc_near_first); DEFINE_ALLOC_EVENT(xfs_alloc_cur); DEFINE_ALLOC_EVENT(xfs_alloc_cur_right); DEFINE_ALLOC_EVENT(xfs_alloc_cur_left); DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup); DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup_done); -DEFINE_ALLOC_EVENT(xfs_alloc_near_error); DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry); DEFINE_ALLOC_EVENT(xfs_alloc_near_busy); DEFINE_ALLOC_EVENT(xfs_alloc_size_neither); @@ -2350,13 +2438,8 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall); DEFINE_ATTR_EVENT(xfs_attr_node_addname); DEFINE_ATTR_EVENT(xfs_attr_node_get); DEFINE_ATTR_EVENT(xfs_attr_node_replace); -DEFINE_ATTR_EVENT(xfs_attr_node_removename); - -DEFINE_ATTR_EVENT(xfs_attr_fillstate); -DEFINE_ATTR_EVENT(xfs_attr_refillstate); DEFINE_ATTR_EVENT(xfs_attr_rmtval_get); -DEFINE_ATTR_EVENT(xfs_attr_rmtval_set); #define DEFINE_DA_EVENT(name) \ DEFINE_EVENT(xfs_da_class, name, \ @@ -2774,7 +2857,6 @@ DEFINE_EVENT(xfs_rtdiscard_class, name, \ TP_ARGS(mp, rtbno, len)) DEFINE_RTDISCARD_EVENT(xfs_discard_rtextent); DEFINE_RTDISCARD_EVENT(xfs_discard_rttoosmall); -DEFINE_RTDISCARD_EVENT(xfs_discard_rtrelax); DECLARE_EVENT_CLASS(xfs_btree_cur_class, TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp), @@ -4091,36 +4173,6 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error); DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_src); DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_dest); -/* dedupe tracepoints */ -DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_compare_extents_error); - -/* ioctl tracepoints */ -TRACE_EVENT(xfs_ioctl_clone, - TP_PROTO(struct inode *src, struct inode *dest), - TP_ARGS(src, dest), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(unsigned long, src_ino) - __field(loff_t, src_isize) - __field(unsigned long, dest_ino) - __field(loff_t, dest_isize) - ), - TP_fast_assign( - __entry->dev = src->i_sb->s_dev; - __entry->src_ino = src->i_ino; - __entry->src_isize = i_size_read(src); - __entry->dest_ino = dest->i_ino; - __entry->dest_isize = i_size_read(dest); - ), - TP_printk("dev %d:%d ino 0x%lx isize 0x%llx -> ino 0x%lx isize 0x%llx", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->src_ino, - __entry->src_isize, - __entry->dest_ino, - __entry->dest_isize) -); - /* unshare tracepoints */ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare); DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); @@ -4128,7 +4180,6 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); /* copy on write */ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found); -DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc); DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); @@ -4881,7 +4932,7 @@ DECLARE_EVENT_CLASS(xlog_iclog_class, __entry->refcount = atomic_read(&iclog->ic_refcnt); __entry->offset = iclog->ic_offset; __entry->flags = iclog->ic_flags; - __entry->lsn = be64_to_cpu(iclog->ic_header.h_lsn); + __entry->lsn = be64_to_cpu(iclog->ic_header->h_lsn); __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d state %s refcnt %d offset %u lsn 0x%llx flags %s caller %pS", @@ -4913,7 +4964,6 @@ DEFINE_ICLOG_EVENT(xlog_iclog_switch); DEFINE_ICLOG_EVENT(xlog_iclog_sync); DEFINE_ICLOG_EVENT(xlog_iclog_syncing); DEFINE_ICLOG_EVENT(xlog_iclog_sync_done); -DEFINE_ICLOG_EVENT(xlog_iclog_want_sync); DEFINE_ICLOG_EVENT(xlog_iclog_wait_on); DEFINE_ICLOG_EVENT(xlog_iclog_write); @@ -4962,7 +5012,6 @@ DEFINE_DAS_STATE_EVENT(xfs_attr_sf_addname_return); DEFINE_DAS_STATE_EVENT(xfs_attr_set_iter_return); DEFINE_DAS_STATE_EVENT(xfs_attr_leaf_addname_return); DEFINE_DAS_STATE_EVENT(xfs_attr_node_addname_return); -DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return); DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index c6657072361a..474f5a04ec63 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -134,18 +134,14 @@ xfs_trans_dup( } /* - * This is called to reserve free disk blocks and log space for the - * given transaction. This must be done before allocating any resources - * within the transaction. + * This is called to reserve free disk blocks and log space for the given + * transaction before allocating any resources within the transaction. * * This will return ENOSPC if there are not enough blocks available. * It will sleep waiting for available log space. - * The only valid value for the flags parameter is XFS_RES_LOG_PERM, which - * is used by long running transactions. If any one of the reservations - * fails then they will all be backed out. * - * This does not do quota reservations. That typically is done by the - * caller afterwards. + * This does not do quota reservations. That typically is done by the caller + * afterwards. */ static int xfs_trans_reserve( @@ -158,10 +154,12 @@ xfs_trans_reserve( int error = 0; bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + ASSERT(resp->tr_logres > 0); + /* - * Attempt to reserve the needed disk blocks by decrementing - * the number needed from the number available. This will - * fail if the count would go below zero. + * Attempt to reserve the needed disk blocks by decrementing the number + * needed from the number available. This will fail if the count would + * go below zero. */ if (blocks > 0) { error = xfs_dec_fdblocks(mp, blocks, rsvd); @@ -173,42 +171,20 @@ xfs_trans_reserve( /* * Reserve the log space needed for this transaction. */ - if (resp->tr_logres > 0) { - bool permanent = false; - - ASSERT(tp->t_log_res == 0 || - tp->t_log_res == resp->tr_logres); - ASSERT(tp->t_log_count == 0 || - tp->t_log_count == resp->tr_logcount); - - if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) { - tp->t_flags |= XFS_TRANS_PERM_LOG_RES; - permanent = true; - } else { - ASSERT(tp->t_ticket == NULL); - ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES)); - } - - if (tp->t_ticket != NULL) { - ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES); - error = xfs_log_regrant(mp, tp->t_ticket); - } else { - error = xfs_log_reserve(mp, resp->tr_logres, - resp->tr_logcount, - &tp->t_ticket, permanent); - } - - if (error) - goto undo_blocks; + if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) + tp->t_flags |= XFS_TRANS_PERM_LOG_RES; + error = xfs_log_reserve(mp, resp->tr_logres, resp->tr_logcount, + &tp->t_ticket, (tp->t_flags & XFS_TRANS_PERM_LOG_RES)); + if (error) + goto undo_blocks; - tp->t_log_res = resp->tr_logres; - tp->t_log_count = resp->tr_logcount; - } + tp->t_log_res = resp->tr_logres; + tp->t_log_count = resp->tr_logcount; /* - * Attempt to reserve the needed realtime extents by decrementing - * the number needed from the number available. This will - * fail if the count would go below zero. + * Attempt to reserve the needed realtime extents by decrementing the + * number needed from the number available. This will fail if the + * count would go below zero. */ if (rtextents > 0) { error = xfs_dec_frextents(mp, rtextents); @@ -221,18 +197,11 @@ xfs_trans_reserve( return 0; - /* - * Error cases jump to one of these labels to undo any - * reservations which have already been performed. - */ undo_log: - if (resp->tr_logres > 0) { - xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); - tp->t_ticket = NULL; - tp->t_log_res = 0; - tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES; - } - + xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); + tp->t_ticket = NULL; + tp->t_log_res = 0; + tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES; undo_blocks: if (blocks > 0) { xfs_add_fdblocks(mp, blocks); @@ -241,6 +210,28 @@ undo_blocks: return error; } +static struct xfs_trans * +__xfs_trans_alloc( + struct xfs_mount *mp, + uint flags) +{ + struct xfs_trans *tp; + + ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) || xfs_has_lazysbcount(mp)); + + tp = kmem_cache_zalloc(xfs_trans_cache, GFP_KERNEL | __GFP_NOFAIL); + if (!(flags & XFS_TRANS_NO_WRITECOUNT)) + sb_start_intwrite(mp->m_super); + xfs_trans_set_context(tp); + tp->t_flags = flags; + tp->t_mountp = mp; + INIT_LIST_HEAD(&tp->t_items); + INIT_LIST_HEAD(&tp->t_busy); + INIT_LIST_HEAD(&tp->t_dfops); + tp->t_highest_agno = NULLAGNUMBER; + return tp; +} + int xfs_trans_alloc( struct xfs_mount *mp, @@ -254,33 +245,16 @@ xfs_trans_alloc( bool want_retry = true; int error; + ASSERT(resp->tr_logres > 0); + /* * Allocate the handle before we do our freeze accounting and setting up * GFP_NOFS allocation context so that we avoid lockdep false positives * by doing GFP_KERNEL allocations inside sb_start_intwrite(). */ retry: - tp = kmem_cache_zalloc(xfs_trans_cache, GFP_KERNEL | __GFP_NOFAIL); - if (!(flags & XFS_TRANS_NO_WRITECOUNT)) - sb_start_intwrite(mp->m_super); - xfs_trans_set_context(tp); - - /* - * Zero-reservation ("empty") transactions can't modify anything, so - * they're allowed to run while we're frozen. - */ - WARN_ON(resp->tr_logres > 0 && - mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); - ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) || - xfs_has_lazysbcount(mp)); - - tp->t_flags = flags; - tp->t_mountp = mp; - INIT_LIST_HEAD(&tp->t_items); - INIT_LIST_HEAD(&tp->t_busy); - INIT_LIST_HEAD(&tp->t_dfops); - tp->t_highest_agno = NULLAGNUMBER; - + tp = __xfs_trans_alloc(mp, flags); + WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); error = xfs_trans_reserve(tp, resp, blocks, rtextents); if (error == -ENOSPC && want_retry) { xfs_trans_cancel(tp); @@ -324,14 +298,11 @@ retry: * where we can be grabbing buffers at the same time that freeze is trying to * drain the buffer LRU list. */ -int +struct xfs_trans * xfs_trans_alloc_empty( - struct xfs_mount *mp, - struct xfs_trans **tpp) + struct xfs_mount *mp) { - struct xfs_trans_res resv = {0}; - - return xfs_trans_alloc(mp, &resv, 0, 0, XFS_TRANS_NO_WRITECOUNT, tpp); + return __xfs_trans_alloc(mp, XFS_TRANS_NO_WRITECOUNT); } /* @@ -481,19 +452,17 @@ xfs_trans_mod_sb( */ STATIC void xfs_trans_apply_sb_deltas( - xfs_trans_t *tp) + struct xfs_trans *tp) { - struct xfs_dsb *sbp; - struct xfs_buf *bp; - int whole = 0; - - bp = xfs_trans_getsb(tp); - sbp = bp->b_addr; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *bp = xfs_trans_getsb(tp); + struct xfs_dsb *sbp = bp->b_addr; + int whole = 0; /* * Only update the superblock counters if we are logging them */ - if (!xfs_has_lazysbcount((tp->t_mountp))) { + if (!xfs_has_lazysbcount(mp)) { if (tp->t_icount_delta) be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta); if (tp->t_ifree_delta) @@ -520,8 +489,7 @@ xfs_trans_apply_sb_deltas( * write the correct value ondisk. */ if ((tp->t_frextents_delta || tp->t_res_frextents_delta) && - !xfs_has_rtgroups(tp->t_mountp)) { - struct xfs_mount *mp = tp->t_mountp; + !xfs_has_rtgroups(mp)) { int64_t rtxdelta; rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta; @@ -534,6 +502,8 @@ xfs_trans_apply_sb_deltas( if (tp->t_dblocks_delta) { be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta); + mp->m_ddev_targp->bt_nr_sectors += + XFS_FSB_TO_BB(mp, tp->t_dblocks_delta); whole = 1; } if (tp->t_agcount_delta) { @@ -553,7 +523,7 @@ xfs_trans_apply_sb_deltas( * recompute the ondisk rtgroup block log. The incore values * will be recomputed in xfs_trans_unreserve_and_mod_sb. */ - if (xfs_has_rtgroups(tp->t_mountp)) { + if (xfs_has_rtgroups(mp)) { sbp->sb_rgblklog = xfs_compute_rgblklog( be32_to_cpu(sbp->sb_rgextents), be32_to_cpu(sbp->sb_rextsize)); @@ -566,6 +536,8 @@ xfs_trans_apply_sb_deltas( } if (tp->t_rblocks_delta) { be64_add_cpu(&sbp->sb_rblocks, tp->t_rblocks_delta); + mp->m_rtdev_targp->bt_nr_sectors += + XFS_FSB_TO_BB(mp, tp->t_rblocks_delta); whole = 1; } if (tp->t_rextents_delta) { @@ -742,8 +714,10 @@ xfs_trans_free_items( list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) { xfs_trans_del_item(lip); - if (abort) + if (abort) { + trace_xfs_trans_free_abort(lip); set_bit(XFS_LI_ABORTED, &lip->li_flags); + } if (lip->li_ops->iop_release) lip->li_ops->iop_release(lip); } @@ -1024,51 +998,57 @@ xfs_trans_cancel( } /* - * Roll from one trans in the sequence of PERMANENT transactions to - * the next: permanent transactions are only flushed out when - * committed with xfs_trans_commit(), but we still want as soon - * as possible to let chunks of it go to the log. So we commit the - * chunk we've been working on and get a new transaction to continue. + * Roll from one trans in the sequence of PERMANENT transactions to the next: + * permanent transactions are only flushed out when committed with + * xfs_trans_commit(), but we still want as soon as possible to let chunks of it + * go to the log. So we commit the chunk we've been working on and get a new + * transaction to continue. */ int xfs_trans_roll( struct xfs_trans **tpp) { - struct xfs_trans *trans = *tpp; - struct xfs_trans_res tres; + struct xfs_trans *tp = *tpp; + unsigned int log_res = tp->t_log_res; + unsigned int log_count = tp->t_log_count; int error; - trace_xfs_trans_roll(trans, _RET_IP_); + trace_xfs_trans_roll(tp, _RET_IP_); + + ASSERT(log_res > 0); /* * Copy the critical parameters from one trans to the next. */ - tres.tr_logres = trans->t_log_res; - tres.tr_logcount = trans->t_log_count; - - *tpp = xfs_trans_dup(trans); + *tpp = xfs_trans_dup(tp); /* * Commit the current transaction. - * If this commit failed, then it'd just unlock those items that - * are not marked ihold. That also means that a filesystem shutdown - * is in progress. The caller takes the responsibility to cancel - * the duplicate transaction that gets returned. + * + * If this commit failed, then it'd just unlock those items that are not + * marked ihold. That also means that a filesystem shutdown is in + * progress. The caller takes the responsibility to cancel the + * duplicate transaction that gets returned. */ - error = __xfs_trans_commit(trans, true); + error = __xfs_trans_commit(tp, true); if (error) return error; /* * Reserve space in the log for the next transaction. - * This also pushes items in the "AIL", the list of logged items, - * out to disk if they are taking up space at the tail of the log - * that we want to use. This requires that either nothing be locked - * across this call, or that anything that is locked be logged in - * the prior and the next transactions. + * + * This also pushes items in the AIL out to disk if they are taking up + * space at the tail of the log that we want to use. This requires that + * either nothing be locked across this call, or that anything that is + * locked be logged in the prior and the next transactions. */ - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - return xfs_trans_reserve(*tpp, &tres, 0, 0); + tp = *tpp; + error = xfs_log_regrant(tp->t_mountp, tp->t_ticket); + if (error) + return error; + tp->t_log_res = log_res; + tp->t_log_count = log_count; + return 0; } /* @@ -1144,9 +1124,18 @@ xfs_trans_reserve_more( unsigned int blocks, unsigned int rtextents) { - struct xfs_trans_res resv = { }; - - return xfs_trans_reserve(tp, &resv, blocks, rtextents); + bool rsvd = tp->t_flags & XFS_TRANS_RESERVE; + + if (blocks && xfs_dec_fdblocks(tp->t_mountp, blocks, rsvd)) + return -ENOSPC; + if (rtextents && xfs_dec_frextents(tp->t_mountp, rtextents)) { + if (blocks) + xfs_add_fdblocks(tp->t_mountp, blocks); + return -ENOSPC; + } + tp->t_blk_res += blocks; + tp->t_rtx_res += rtextents; + return 0; } /* @@ -1161,14 +1150,13 @@ xfs_trans_reserve_more_inode( unsigned int rblocks, bool force_quota) { - struct xfs_trans_res resv = { }; struct xfs_mount *mp = ip->i_mount; unsigned int rtx = xfs_extlen_to_rtxlen(mp, rblocks); int error; xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); - error = xfs_trans_reserve(tp, &resv, dblocks, rtx); + error = xfs_trans_reserve_more(tp, dblocks, rtx); if (error) return error; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 2b366851e9a4..7fb860f645a3 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -15,7 +15,6 @@ struct xfs_efd_log_item; struct xfs_efi_log_item; struct xfs_inode; struct xfs_item_ops; -struct xfs_log_iovec; struct xfs_mount; struct xfs_trans; struct xfs_trans_res; @@ -168,8 +167,7 @@ int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp, struct xfs_trans **tpp); int xfs_trans_reserve_more(struct xfs_trans *tp, unsigned int blocks, unsigned int rtextents); -int xfs_trans_alloc_empty(struct xfs_mount *mp, - struct xfs_trans **tpp); +struct xfs_trans *xfs_trans_alloc_empty(struct xfs_mount *mp); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); int xfs_trans_get_buf_map(struct xfs_trans *tp, struct xfs_buftarg *target, diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 0fcb1828e598..38983c6777df 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -315,7 +315,7 @@ xfs_ail_splice( } /* - * Delete the given item from the AIL. Return a pointer to the item. + * Delete the given item from the AIL. */ static void xfs_ail_delete( @@ -374,7 +374,7 @@ xfsaild_push_item( * If log item pinning is enabled, skip the push and track the item as * pinned. This can help induce head-behind-tail conditions. */ - if (XFS_TEST_ERROR(false, ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN)) + if (XFS_TEST_ERROR(ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN)) return XFS_ITEM_PINNED; /* @@ -777,26 +777,28 @@ xfs_ail_update_finish( } /* - * xfs_trans_ail_update - bulk AIL insertion operation. + * xfs_trans_ail_update_bulk - bulk AIL insertion operation. * - * @xfs_trans_ail_update takes an array of log items that all need to be + * @xfs_trans_ail_update_bulk takes an array of log items that all need to be * positioned at the same LSN in the AIL. If an item is not in the AIL, it will - * be added. Otherwise, it will be repositioned by removing it and re-adding - * it to the AIL. If we move the first item in the AIL, update the log tail to - * match the new minimum LSN in the AIL. + * be added. Otherwise, it will be repositioned by removing it and re-adding + * it to the AIL. * - * This function takes the AIL lock once to execute the update operations on - * all the items in the array, and as such should not be called with the AIL - * lock held. As a result, once we have the AIL lock, we need to check each log - * item LSN to confirm it needs to be moved forward in the AIL. + * If we move the first item in the AIL, update the log tail to match the new + * minimum LSN in the AIL. * - * To optimise the insert operation, we delete all the items from the AIL in - * the first pass, moving them into a temporary list, then splice the temporary - * list into the correct position in the AIL. This avoids needing to do an - * insert operation on every item. + * This function should be called with the AIL lock held. * - * This function must be called with the AIL lock held. The lock is dropped - * before returning. + * To optimise the insert operation, we add all items to a temporary list, then + * splice this list into the correct position in the AIL. + * + * Items that are already in the AIL are first deleted from their current + * location before being added to the temporary list. + * + * This avoids needing to do an insert operation on every item. + * + * The AIL lock is dropped by xfs_ail_update_finish() before returning to + * the caller. */ void xfs_trans_ail_update_bulk( @@ -909,10 +911,9 @@ xfs_trans_ail_delete( return; } - /* xfs_ail_update_finish() drops the AIL lock */ - xfs_clear_li_failed(lip); + clear_bit(XFS_LI_FAILED, &lip->li_flags); tail_lsn = xfs_ail_delete_one(ailp, lip); - xfs_ail_update_finish(ailp, tail_lsn); + xfs_ail_update_finish(ailp, tail_lsn); /* drops the AIL lock */ } int diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 765456bf3428..c842ce06acd6 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -393,7 +393,7 @@ xfs_trans_dqlockedjoin( unsigned int i; ASSERT(q[0].qt_dquot != NULL); if (q[1].qt_dquot == NULL) { - xfs_dqlock(q[0].qt_dquot); + mutex_lock(&q[0].qt_dquot->q_qlock); xfs_trans_dqjoin(tp, q[0].qt_dquot); } else if (q[2].qt_dquot == NULL) { xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot); @@ -693,7 +693,7 @@ xfs_trans_unreserve_and_mod_dquots( locked = already_locked; if (qtrx->qt_blk_res) { if (!locked) { - xfs_dqlock(dqp); + mutex_lock(&dqp->q_qlock); locked = true; } dqp->q_blk.reserved -= @@ -701,7 +701,7 @@ xfs_trans_unreserve_and_mod_dquots( } if (qtrx->qt_ino_res) { if (!locked) { - xfs_dqlock(dqp); + mutex_lock(&dqp->q_qlock); locked = true; } dqp->q_ino.reserved -= @@ -710,14 +710,14 @@ xfs_trans_unreserve_and_mod_dquots( if (qtrx->qt_rtblk_res) { if (!locked) { - xfs_dqlock(dqp); + mutex_lock(&dqp->q_qlock); locked = true; } dqp->q_rtb.reserved -= (xfs_qcnt_t)qtrx->qt_rtblk_res; } if (locked && !already_locked) - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); } } @@ -820,7 +820,7 @@ xfs_trans_dqresv( struct xfs_dquot_res *blkres; struct xfs_quota_limits *qlim; - xfs_dqlock(dqp); + mutex_lock(&dqp->q_qlock); defq = xfs_get_defquota(q, xfs_dquot_type(dqp)); @@ -887,16 +887,16 @@ xfs_trans_dqresv( XFS_IS_CORRUPT(mp, dqp->q_ino.reserved < dqp->q_ino.count)) goto error_corrupt; - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); return 0; error_return: - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); if (xfs_dquot_type(dqp) == XFS_DQTYPE_PROJ) return -ENOSPC; return -EDQUOT; error_corrupt: - xfs_dqunlock(dqp); + mutex_unlock(&dqp->q_qlock); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); xfs_fs_mark_sick(mp, XFS_SICK_FS_QUOTACHECK); return -EFSCORRUPTED; diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index bd841df93021..f945f0450b16 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -167,32 +167,4 @@ xfs_trans_ail_copy_lsn( } #endif -static inline void -xfs_clear_li_failed( - struct xfs_log_item *lip) -{ - struct xfs_buf *bp = lip->li_buf; - - ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags)); - lockdep_assert_held(&lip->li_ailp->ail_lock); - - if (test_and_clear_bit(XFS_LI_FAILED, &lip->li_flags)) { - lip->li_buf = NULL; - xfs_buf_rele(bp); - } -} - -static inline void -xfs_set_li_failed( - struct xfs_log_item *lip, - struct xfs_buf *bp) -{ - lockdep_assert_held(&lip->li_ailp->ail_lock); - - if (!test_and_set_bit(XFS_LI_FAILED, &lip->li_flags)) { - xfs_buf_hold(bp); - lip->li_buf = bp; - } -} - #endif /* __XFS_TRANS_PRIV_H__ */ diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 0f641a9091ec..ac5cecec9aa1 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -243,7 +243,7 @@ __xfs_xattr_put_listent( offset = context->buffer + context->count; memcpy(offset, prefix, prefix_len); offset += prefix_len; - strncpy(offset, (char *)name, namelen); /* real name */ + memcpy(offset, (char *)name, namelen); /* real name */ offset += namelen; *offset = '\0'; diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index 52af234936a2..bbcf21704ea0 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -24,15 +24,24 @@ #include "xfs_zone_priv.h" #include "xfs_zones.h" #include "xfs_trace.h" +#include "xfs_mru_cache.h" + +static void +xfs_open_zone_free_rcu( + struct callback_head *cb) +{ + struct xfs_open_zone *oz = container_of(cb, typeof(*oz), oz_rcu); + + xfs_rtgroup_rele(oz->oz_rtg); + kfree(oz); +} void xfs_open_zone_put( struct xfs_open_zone *oz) { - if (atomic_dec_and_test(&oz->oz_ref)) { - xfs_rtgroup_rele(oz->oz_rtg); - kfree(oz); - } + if (atomic_dec_and_test(&oz->oz_ref)) + call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu); } static inline uint32_t @@ -94,9 +103,6 @@ xfs_zone_account_reclaimable( */ trace_xfs_zone_emptied(rtg); - if (!was_full) - xfs_group_clear_mark(xg, XFS_RTG_RECLAIMABLE); - spin_lock(&zi->zi_used_buckets_lock); if (!was_full) xfs_zone_remove_from_bucket(zi, rgno, from_bucket); @@ -118,7 +124,6 @@ xfs_zone_account_reclaimable( xfs_zone_add_to_bucket(zi, rgno, to_bucket); spin_unlock(&zi->zi_used_buckets_lock); - xfs_group_set_mark(xg, XFS_RTG_RECLAIMABLE); if (zi->zi_gc_thread && xfs_zoned_need_gc(mp)) wake_up_process(zi->zi_gc_thread); } else if (to_bucket != from_bucket) { @@ -133,6 +138,28 @@ xfs_zone_account_reclaimable( } } +/* + * Check if we have any zones that can be reclaimed by looking at the entry + * counters for the zone buckets. + */ +bool +xfs_zoned_have_reclaimable( + struct xfs_zone_info *zi) +{ + int i; + + spin_lock(&zi->zi_used_buckets_lock); + for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) { + if (zi->zi_used_bucket_entries[i]) { + spin_unlock(&zi->zi_used_buckets_lock); + return true; + } + } + spin_unlock(&zi->zi_used_buckets_lock); + + return false; +} + static void xfs_open_zone_mark_full( struct xfs_open_zone *oz) @@ -165,10 +192,9 @@ xfs_open_zone_mark_full( static void xfs_zone_record_blocks( struct xfs_trans *tp, - xfs_fsblock_t fsbno, - xfs_filblks_t len, struct xfs_open_zone *oz, - bool used) + xfs_fsblock_t fsbno, + xfs_filblks_t len) { struct xfs_mount *mp = tp->t_mountp; struct xfs_rtgroup *rtg = oz->oz_rtg; @@ -178,18 +204,37 @@ xfs_zone_record_blocks( xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); - if (used) { - rmapip->i_used_blocks += len; - ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg)); - } else { - xfs_add_frextents(mp, len); - } + rmapip->i_used_blocks += len; + ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg)); oz->oz_written += len; if (oz->oz_written == rtg_blocks(rtg)) xfs_open_zone_mark_full(oz); xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); } +/* + * Called for blocks that have been written to disk, but not actually linked to + * an inode, which can happen when garbage collection races with user data + * writes to a file. + */ +static void +xfs_zone_skip_blocks( + struct xfs_open_zone *oz, + xfs_filblks_t len) +{ + struct xfs_rtgroup *rtg = oz->oz_rtg; + + trace_xfs_zone_skip_blocks(oz, 0, len); + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + oz->oz_written += len; + if (oz->oz_written == rtg_blocks(rtg)) + xfs_open_zone_mark_full(oz); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + + xfs_add_frextents(rtg_mount(rtg), len); +} + static int xfs_zoned_map_extent( struct xfs_trans *tp, @@ -219,6 +264,14 @@ xfs_zoned_map_extent( * If a data write raced with this GC write, keep the existing data in * the data fork, mark our newly written GC extent as reclaimable, then * move on to the next extent. + * + * Note that this can also happen when racing with operations that do + * not actually invalidate the data, but just move it to a different + * inode (XFS_IOC_EXCHANGE_RANGE), or to a different offset inside the + * inode (FALLOC_FL_COLLAPSE_RANGE / FALLOC_FL_INSERT_RANGE). If the + * data was just moved around, GC fails to free the zone, but the zone + * becomes a GC candidate again as soon as all previous GC I/O has + * finished and these blocks will be moved out eventually. */ if (old_startblock != NULLFSBLOCK && old_startblock != data.br_startblock) @@ -249,8 +302,7 @@ xfs_zoned_map_extent( } } - xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz, - true); + xfs_zone_record_blocks(tp, oz, new->br_startblock, new->br_blockcount); /* Map the new blocks into the data fork. */ xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new); @@ -258,8 +310,7 @@ xfs_zoned_map_extent( skip: trace_xfs_reflink_cow_remap_skip(ip, new); - xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz, - false); + xfs_zone_skip_blocks(oz, new->br_blockcount); return 0; } @@ -357,44 +408,6 @@ xfs_zone_free_blocks( return 0; } -/* - * Check if the zone containing the data just before the offset we are - * writing to is still open and has space. - */ -static struct xfs_open_zone * -xfs_last_used_zone( - struct iomap_ioend *ioend) -{ - struct xfs_inode *ip = XFS_I(ioend->io_inode); - struct xfs_mount *mp = ip->i_mount; - xfs_fileoff_t offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset); - struct xfs_rtgroup *rtg = NULL; - struct xfs_open_zone *oz = NULL; - struct xfs_iext_cursor icur; - struct xfs_bmbt_irec got; - - xfs_ilock(ip, XFS_ILOCK_SHARED); - if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb, - &icur, &got)) { - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return NULL; - } - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock)); - if (!rtg) - return NULL; - - xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_SHARED); - oz = READ_ONCE(rtg->rtg_open_zone); - if (oz && (oz->oz_is_gc || !atomic_inc_not_zero(&oz->oz_ref))) - oz = NULL; - xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_SHARED); - - xfs_rtgroup_rele(rtg); - return oz; -} - static struct xfs_group * xfs_find_free_zone( struct xfs_mount *mp, @@ -433,7 +446,7 @@ xfs_init_open_zone( spin_lock_init(&oz->oz_alloc_lock); atomic_set(&oz->oz_ref, 1); oz->oz_rtg = rtg; - oz->oz_write_pointer = write_pointer; + oz->oz_allocated = write_pointer; oz->oz_written = write_pointer; oz->oz_write_hint = write_hint; oz->oz_is_gc = is_gc; @@ -514,64 +527,58 @@ xfs_try_open_zone( return oz; } +enum xfs_zone_alloc_score { + /* Any open zone will do it, we're desperate */ + XFS_ZONE_ALLOC_ANY = 0, + + /* It better fit somehow */ + XFS_ZONE_ALLOC_OK = 1, + + /* Only reuse a zone if it fits really well. */ + XFS_ZONE_ALLOC_GOOD = 2, +}; + /* - * For data with short or medium lifetime, try to colocated it into an - * already open zone with a matching temperature. + * Life time hint co-location matrix. Fields not set default to 0 + * aka XFS_ZONE_ALLOC_ANY. */ -static bool -xfs_colocate_eagerly( - enum rw_hint file_hint) -{ - switch (file_hint) { - case WRITE_LIFE_MEDIUM: - case WRITE_LIFE_SHORT: - case WRITE_LIFE_NONE: - return true; - default: - return false; - } -} - -static bool -xfs_good_hint_match( - struct xfs_open_zone *oz, - enum rw_hint file_hint) -{ - switch (oz->oz_write_hint) { - case WRITE_LIFE_LONG: - case WRITE_LIFE_EXTREME: - /* colocate long and extreme */ - if (file_hint == WRITE_LIFE_LONG || - file_hint == WRITE_LIFE_EXTREME) - return true; - break; - case WRITE_LIFE_MEDIUM: - /* colocate medium with medium */ - if (file_hint == WRITE_LIFE_MEDIUM) - return true; - break; - case WRITE_LIFE_SHORT: - case WRITE_LIFE_NONE: - case WRITE_LIFE_NOT_SET: - /* colocate short and none */ - if (file_hint <= WRITE_LIFE_SHORT) - return true; - break; - } - return false; -} +static const unsigned int +xfs_zoned_hint_score[WRITE_LIFE_HINT_NR][WRITE_LIFE_HINT_NR] = { + [WRITE_LIFE_NOT_SET] = { + [WRITE_LIFE_NOT_SET] = XFS_ZONE_ALLOC_OK, + }, + [WRITE_LIFE_NONE] = { + [WRITE_LIFE_NONE] = XFS_ZONE_ALLOC_OK, + }, + [WRITE_LIFE_SHORT] = { + [WRITE_LIFE_SHORT] = XFS_ZONE_ALLOC_GOOD, + }, + [WRITE_LIFE_MEDIUM] = { + [WRITE_LIFE_MEDIUM] = XFS_ZONE_ALLOC_GOOD, + }, + [WRITE_LIFE_LONG] = { + [WRITE_LIFE_LONG] = XFS_ZONE_ALLOC_OK, + [WRITE_LIFE_EXTREME] = XFS_ZONE_ALLOC_OK, + }, + [WRITE_LIFE_EXTREME] = { + [WRITE_LIFE_LONG] = XFS_ZONE_ALLOC_OK, + [WRITE_LIFE_EXTREME] = XFS_ZONE_ALLOC_OK, + }, +}; static bool xfs_try_use_zone( struct xfs_zone_info *zi, enum rw_hint file_hint, struct xfs_open_zone *oz, - bool lowspace) + unsigned int goodness) { - if (oz->oz_write_pointer == rtg_blocks(oz->oz_rtg)) + if (oz->oz_allocated == rtg_blocks(oz->oz_rtg)) return false; - if (!lowspace && !xfs_good_hint_match(oz, file_hint)) + + if (xfs_zoned_hint_score[oz->oz_write_hint][file_hint] < goodness) return false; + if (!atomic_inc_not_zero(&oz->oz_ref)) return false; @@ -602,14 +609,14 @@ static struct xfs_open_zone * xfs_select_open_zone_lru( struct xfs_zone_info *zi, enum rw_hint file_hint, - bool lowspace) + unsigned int goodness) { struct xfs_open_zone *oz; lockdep_assert_held(&zi->zi_open_zones_lock); list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) - if (xfs_try_use_zone(zi, file_hint, oz, lowspace)) + if (xfs_try_use_zone(zi, file_hint, oz, goodness)) return oz; cond_resched_lock(&zi->zi_open_zones_lock); @@ -626,7 +633,7 @@ xfs_select_open_zone_mru( lockdep_assert_held(&zi->zi_open_zones_lock); list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry) - if (xfs_try_use_zone(zi, file_hint, oz, false)) + if (xfs_try_use_zone(zi, file_hint, oz, XFS_ZONE_ALLOC_OK)) return oz; cond_resched_lock(&zi->zi_open_zones_lock); @@ -641,25 +648,29 @@ static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip) } /* - * Try to pack inodes that are written back after they were closed tight instead - * of trying to open new zones for them or spread them to the least recently - * used zone. This optimizes the data layout for workloads that untar or copy - * a lot of small files. Right now this does not separate multiple such + * Try to tightly pack small files that are written back after they were closed + * instead of trying to open new zones for them or spread them to the least + * recently used zone. This optimizes the data layout for workloads that untar + * or copy a lot of small files. Right now this does not separate multiple such * streams. */ static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip) { + struct xfs_mount *mp = ip->i_mount; + size_t zone_capacity = + XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].blocks); + + /* + * Do not pack write files that are already using a full zone to avoid + * fragmentation. + */ + if (i_size_read(VFS_I(ip)) >= zone_capacity) + return false; + return !inode_is_open_for_write(VFS_I(ip)) && !(ip->i_diflags & XFS_DIFLAG_APPEND); } -/* - * Pick a new zone for writes. - * - * If we aren't using up our budget of open zones just open a new one from the - * freelist. Else try to find one that matches the expected data lifetime. If - * we don't find one that is good pick any zone that is available. - */ static struct xfs_open_zone * xfs_select_zone_nowait( struct xfs_mount *mp, @@ -679,31 +690,34 @@ xfs_select_zone_nowait( * data. */ spin_lock(&zi->zi_open_zones_lock); - if (xfs_colocate_eagerly(write_hint)) - oz = xfs_select_open_zone_lru(zi, write_hint, false); - else if (pack_tight) + oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_GOOD); + if (oz) + goto out_unlock; + + if (pack_tight) oz = xfs_select_open_zone_mru(zi, write_hint); if (oz) goto out_unlock; /* - * See if we can open a new zone and use that. + * See if we can open a new zone and use that so that data for different + * files is mixed as little as possible. */ oz = xfs_try_open_zone(mp, write_hint); if (oz) goto out_unlock; /* - * Try to colocate cold data with other cold data if we failed to open a - * new zone for it. + * Try to find an zone that is an ok match to colocate data with. */ - if (write_hint != WRITE_LIFE_NOT_SET && - !xfs_colocate_eagerly(write_hint)) - oz = xfs_select_open_zone_lru(zi, write_hint, false); - if (!oz) - oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false); - if (!oz) - oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true); + oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_OK); + if (oz) + goto out_unlock; + + /* + * Pick the least recently used zone, regardless of hint match + */ + oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_ANY); out_unlock: spin_unlock(&zi->zi_open_zones_lock); return oz; @@ -726,7 +740,7 @@ xfs_select_zone( for (;;) { prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE); oz = xfs_select_zone_nowait(mp, write_hint, pack_tight); - if (oz) + if (oz || xfs_is_shutdown(mp)) break; schedule(); } @@ -743,25 +757,25 @@ xfs_zone_alloc_blocks( { struct xfs_rtgroup *rtg = oz->oz_rtg; struct xfs_mount *mp = rtg_mount(rtg); - xfs_rgblock_t rgbno; + xfs_rgblock_t allocated; spin_lock(&oz->oz_alloc_lock); count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN, - (xfs_filblks_t)rtg_blocks(rtg) - oz->oz_write_pointer); + (xfs_filblks_t)rtg_blocks(rtg) - oz->oz_allocated); if (!count_fsb) { spin_unlock(&oz->oz_alloc_lock); return 0; } - rgbno = oz->oz_write_pointer; - oz->oz_write_pointer += count_fsb; + allocated = oz->oz_allocated; + oz->oz_allocated += count_fsb; spin_unlock(&oz->oz_alloc_lock); - trace_xfs_zone_alloc_blocks(oz, rgbno, count_fsb); + trace_xfs_zone_alloc_blocks(oz, allocated, count_fsb); *sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0); *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *sector); if (!*is_seq) - *sector += XFS_FSB_TO_BB(mp, rgbno); + *sector += XFS_FSB_TO_BB(mp, allocated); return XFS_FSB_TO_B(mp, count_fsb); } @@ -776,6 +790,57 @@ xfs_mark_rtg_boundary( ioend->io_flags |= IOMAP_IOEND_BOUNDARY; } +/* + * Check if we have a cached last open zone available for the inode and + * if yes return a reference to it. + */ +static struct xfs_open_zone * +xfs_get_cached_zone( + struct xfs_inode *ip) +{ + struct xfs_open_zone *oz; + + rcu_read_lock(); + oz = VFS_I(ip)->i_private; + if (oz) { + /* + * GC only steals open zones at mount time, so no GC zones + * should end up in the cache. + */ + ASSERT(!oz->oz_is_gc); + if (!atomic_inc_not_zero(&oz->oz_ref)) + oz = NULL; + } + rcu_read_unlock(); + + return oz; +} + +/* + * Stash our zone in the inode so that is is reused for future allocations. + * + * The open_zone structure will be pinned until either the inode is freed or + * until the cached open zone is replaced with a different one because the + * current one was full when we tried to use it. This means we keep any + * open zone around forever as long as any inode that used it for the last + * write is cached, which slightly increases the memory use of cached inodes + * that were every written to, but significantly simplifies the cached zone + * lookup. Because the open_zone is clearly marked as full when all data + * in the underlying RTG was written, the caching is always safe. + */ +static void +xfs_set_cached_zone( + struct xfs_inode *ip, + struct xfs_open_zone *oz) +{ + struct xfs_open_zone *old_oz; + + atomic_inc(&oz->oz_ref); + old_oz = xchg(&VFS_I(ip)->i_private, oz); + if (old_oz) + xfs_open_zone_put(old_oz); +} + static void xfs_submit_zoned_bio( struct iomap_ioend *ioend, @@ -813,17 +878,18 @@ xfs_zone_alloc_and_submit( goto out_error; /* - * If we don't have a cached zone in this write context, see if the - * last extent before the one we are writing to points to an active - * zone. If so, just continue writing to it. + * If we don't have a locally cached zone in this write context, see if + * the inode is still associated with a zone and use that if so. */ - if (!*oz && ioend->io_offset) - *oz = xfs_last_used_zone(ioend); + if (!*oz) + *oz = xfs_get_cached_zone(ip); + if (!*oz) { select_zone: *oz = xfs_select_zone(mp, write_hint, pack_tight); if (!*oz) goto out_error; + xfs_set_cached_zone(ip, *oz); } alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size), @@ -883,7 +949,7 @@ xfs_zone_rgbno_is_valid( lockdep_assert_held(&rtg_rmap(rtg)->i_lock); if (rtg->rtg_open_zone) - return rgbno < rtg->rtg_open_zone->oz_write_pointer; + return rgbno < rtg->rtg_open_zone->oz_allocated; return !xa_get_mark(&rtg_mount(rtg)->m_groups[XG_TYPE_RTG].xa, rtg_rgno(rtg), XFS_RTG_FREE); } @@ -901,6 +967,12 @@ xfs_free_open_zones( xfs_open_zone_put(oz); } spin_unlock(&zi->zi_open_zones_lock); + + /* + * Wait for all open zones to be freed so that they drop the group + * references: + */ + rcu_barrier(); } struct xfs_init_zones { @@ -917,7 +989,7 @@ xfs_init_zone( { struct xfs_mount *mp = rtg_mount(rtg); struct xfs_zone_info *zi = mp->m_zone_info; - uint64_t used = rtg_rmap(rtg)->i_used_blocks; + uint32_t used = rtg_rmap(rtg)->i_used_blocks; xfs_rgblock_t write_pointer, highest_rgbno; int error; @@ -1014,24 +1086,27 @@ xfs_get_zone_info_cb( } /* - * Calculate the max open zone limit based on the of number of - * backing zones available + * Calculate the max open zone limit based on the of number of backing zones + * available. */ static inline uint32_t xfs_max_open_zones( struct xfs_mount *mp) { unsigned int max_open, max_open_data_zones; + /* - * We need two zones for every open data zone, - * one in reserve as we don't reclaim open zones. One data zone - * and its spare is included in XFS_MIN_ZONES. + * We need two zones for every open data zone, one in reserve as we + * don't reclaim open zones. One data zone and its spare is included + * in XFS_MIN_ZONES to support at least one user data writer. */ max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1; max_open = max_open_data_zones + XFS_OPEN_GC_ZONES; /* - * Cap the max open limit to 1/4 of available space + * Cap the max open limit to 1/4 of available space. Without this we'd + * run out of easy reclaim targets too quickly and storage devices don't + * handle huge numbers of concurrent write streams overly well. */ max_open = min(max_open, mp->m_sb.sb_rgcount / 4); @@ -1063,7 +1138,7 @@ xfs_calc_open_zones( if (bdev_open_zones) mp->m_max_open_zones = bdev_open_zones; else - mp->m_max_open_zones = xfs_max_open_zones(mp); + mp->m_max_open_zones = XFS_DEFAULT_MAX_OPEN_ZONES; } if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) { @@ -1147,6 +1222,7 @@ xfs_mount_zones( .mp = mp, }; struct xfs_buftarg *bt = mp->m_rtdev_targp; + xfs_extlen_t zone_blocks = mp->m_groups[XG_TYPE_RTG].blocks; int error; if (!bt) { @@ -1176,13 +1252,36 @@ xfs_mount_zones( if (!mp->m_zone_info) return -ENOMEM; - xfs_info(mp, "%u zones of %u blocks size (%u max open)", - mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks, - mp->m_max_open_zones); + xfs_info(mp, "%u zones of %u blocks (%u max open zones)", + mp->m_sb.sb_rgcount, zone_blocks, mp->m_max_open_zones); trace_xfs_zones_mount(mp); + /* + * The writeback code switches between inodes regularly to provide + * fairness. The default lower bound is 4MiB, but for zoned file + * systems we want to increase that both to reduce seeks, but also more + * importantly so that workloads that writes files in a multiple of the + * zone size do not get fragmented and require garbage collection when + * they shouldn't. Increase is to the zone size capped by the max + * extent len. + * + * Note that because s_min_writeback_pages is a superblock field, this + * value also get applied to non-zoned files on the data device if + * there are any. On typical zoned setup all data is on the RT device + * because using the more efficient sequential write required zones + * is the reason for using the zone allocator, and either the RT device + * and the (meta)data device are on the same block device, or the + * (meta)data device is on a fast SSD while the data on the RT device + * is on a SMR HDD. In any combination of the above cases enforcing + * the higher min_writeback_pages for non-RT inodes is either a noop + * or beneficial. + */ + mp->m_super->s_min_writeback_pages = + XFS_FSB_TO_B(mp, min(zone_blocks, XFS_MAX_BMBT_EXTLEN)) >> + PAGE_SHIFT; + if (bdev_is_zoned(bt->bt_bdev)) { - error = blkdev_report_zones(bt->bt_bdev, + error = blkdev_report_zones_cached(bt->bt_bdev, XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart), mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, &iz); if (error < 0) @@ -1192,8 +1291,10 @@ xfs_mount_zones( while ((rtg = xfs_rtgroup_next(mp, rtg))) { error = xfs_init_zone(&iz, rtg, NULL); - if (error) + if (error) { + xfs_rtgroup_rele(rtg); goto out_free_zone_info; + } } } @@ -1201,6 +1302,13 @@ xfs_mount_zones( xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, iz.available + iz.reclaimable); + /* + * The user may configure GC to free up a percentage of unused blocks. + * By default this is 0. GC will always trigger at the minimum level + * for keeping max_open_zones available for data placement. + */ + mp->m_zonegc_low_space = 0; + error = xfs_zone_gc_mount(mp); if (error) goto out_free_zone_info; diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h index ecf39106704c..4db02816d0fd 100644 --- a/fs/xfs/xfs_zone_alloc.h +++ b/fs/xfs/xfs_zone_alloc.h @@ -23,9 +23,9 @@ struct xfs_zone_alloc_ctx { */ #define XFS_ZR_RESERVED (1U << 2) -int xfs_zoned_space_reserve(struct xfs_inode *ip, xfs_filblks_t count_fsb, +int xfs_zoned_space_reserve(struct xfs_mount *mp, xfs_filblks_t count_fsb, unsigned int flags, struct xfs_zone_alloc_ctx *ac); -void xfs_zoned_space_unreserve(struct xfs_inode *ip, +void xfs_zoned_space_unreserve(struct xfs_mount *mp, struct xfs_zone_alloc_ctx *ac); void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb); diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index c5136ea9bb1d..3c52cc1497d4 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -114,8 +114,9 @@ struct xfs_gc_bio { /* Open Zone being written to */ struct xfs_open_zone *oz; + struct xfs_rtgroup *victim_rtg; + /* Bio used for reads and writes, including the bvec used by it */ - struct bio_vec bv; struct bio bio; /* must be last */ }; @@ -162,18 +163,35 @@ struct xfs_zone_gc_data { /* * We aim to keep enough zones free in stock to fully use the open zone limit - * for data placement purposes. + * for data placement purposes. Additionally, the m_zonegc_low_space tunable + * can be set to make sure a fraction of the unused blocks are available for + * writing. */ bool xfs_zoned_need_gc( struct xfs_mount *mp) { - if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE)) + s64 available, free, threshold; + s32 remainder; + + if (!xfs_zoned_have_reclaimable(mp->m_zone_info)) return false; - if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) < - mp->m_groups[XG_TYPE_RTG].blocks * - (mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) + + available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE); + + if (available < + xfs_rtgs_to_rfsbs(mp, mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) + return true; + + free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS); + + threshold = div_s64_rem(free, 100, &remainder); + threshold = threshold * mp->m_zonegc_low_space + + remainder * div_s64(mp->m_zonegc_low_space, 100); + + if (available < threshold) return true; + return false; } @@ -246,6 +264,7 @@ xfs_zone_gc_iter_init( iter->rec_count = 0; iter->rec_idx = 0; iter->victim_rtg = victim_rtg; + atomic_inc(&victim_rtg->rtg_gccount); } /* @@ -272,8 +291,6 @@ xfs_zone_gc_query_cb( return 0; } -#define cmp_int(l, r) ((l > r) - (l < r)) - static int xfs_zone_gc_rmap_rec_cmp( const void *a, @@ -312,10 +329,7 @@ xfs_zone_gc_query( iter->rec_idx = 0; iter->rec_count = 0; - error = xfs_trans_alloc_empty(mp, &tp); - if (error) - return error; - + tp = xfs_trans_alloc_empty(mp); xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); cur = xfs_rtrmapbt_init_cursor(tp, rtg); error = xfs_rmap_query_range(cur, &ri_low, &ri_high, @@ -349,6 +363,7 @@ xfs_zone_gc_query( return 0; done: + atomic_dec(&iter->victim_rtg->rtg_gccount); xfs_rtgroup_rele(iter->victim_rtg); iter->victim_rtg = NULL; return 0; @@ -438,6 +453,20 @@ xfs_zone_gc_pick_victim_from( if (!rtg) continue; + /* + * If the zone is already undergoing GC, don't pick it again. + * + * This prevents us from picking one of the zones for which we + * already submitted GC I/O, but for which the remapping hasn't + * concluded yet. This won't cause data corruption, but + * increases write amplification and slows down GC, so this is + * a bad thing. + */ + if (atomic_read(&rtg->rtg_gccount)) { + xfs_rtgroup_rele(rtg); + continue; + } + /* skip zones that are just waiting for a reset */ if (rtg_rmap(rtg)->i_used_blocks == 0 || rtg_rmap(rtg)->i_used_blocks >= victim_used) { @@ -478,21 +507,6 @@ xfs_zone_gc_select_victim( struct xfs_rtgroup *victim_rtg = NULL; unsigned int bucket; - if (xfs_is_shutdown(mp)) - return false; - - if (iter->victim_rtg) - return true; - - /* - * Don't start new work if we are asked to stop or park. - */ - if (kthread_should_stop() || kthread_should_park()) - return false; - - if (!xfs_zoned_need_gc(mp)) - return false; - spin_lock(&zi->zi_used_buckets_lock); for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) { victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket); @@ -517,8 +531,7 @@ xfs_zone_gc_steal_open( spin_lock(&zi->zi_open_zones_lock); list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) { - if (!found || - oz->oz_write_pointer < found->oz_write_pointer) + if (!found || oz->oz_allocated < found->oz_allocated) found = oz; } @@ -568,7 +581,7 @@ xfs_zone_gc_ensure_target( { struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone; - if (!oz || oz->oz_write_pointer == rtg_blocks(oz->oz_rtg)) + if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg)) return xfs_zone_gc_select_target(mp); return oz; } @@ -589,7 +602,7 @@ xfs_zone_gc_space_available( oz = xfs_zone_gc_ensure_target(data->mp); if (!oz) return false; - return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) && + return oz->oz_allocated < rtg_blocks(oz->oz_rtg) && xfs_zone_gc_scratch_available(data); } @@ -631,7 +644,7 @@ xfs_zone_gc_alloc_blocks( */ spin_lock(&mp->m_sb_lock); *count_fsb = min(*count_fsb, - rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer); + rtg_blocks(oz->oz_rtg) - oz->oz_allocated); *count_fsb = min3(*count_fsb, mp->m_free[XC_FREE_RTEXTENTS].res_avail, mp->m_free[XC_FREE_RTAVAILABLE].res_avail); @@ -645,8 +658,8 @@ xfs_zone_gc_alloc_blocks( *daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0); *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr); if (!*is_seq) - *daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer); - oz->oz_write_pointer += *count_fsb; + *daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated); + oz->oz_allocated += *count_fsb; atomic_inc(&oz->oz_ref); return oz; } @@ -691,6 +704,9 @@ xfs_zone_gc_start_chunk( chunk->scratch = &data->scratch[data->scratch_idx]; chunk->data = data; chunk->oz = oz; + chunk->victim_rtg = iter->victim_rtg; + atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref); + atomic_inc(&chunk->victim_rtg->rtg_gccount); bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock); bio->bi_end_io = xfs_zone_gc_end_io; @@ -713,6 +729,8 @@ static void xfs_zone_gc_free_chunk( struct xfs_gc_bio *chunk) { + atomic_dec(&chunk->victim_rtg->rtg_gccount); + xfs_rtgroup_rele(chunk->victim_rtg); list_del(&chunk->entry); xfs_open_zone_put(chunk->oz); xfs_irele(chunk->ip); @@ -773,6 +791,10 @@ xfs_zone_gc_split_write( split_chunk->oz = chunk->oz; atomic_inc(&chunk->oz->oz_ref); + split_chunk->victim_rtg = chunk->victim_rtg; + atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref); + atomic_inc(&chunk->victim_rtg->rtg_gccount); + chunk->offset += split_len; chunk->len -= split_len; chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len); @@ -789,7 +811,8 @@ xfs_zone_gc_write_chunk( { struct xfs_zone_gc_data *data = chunk->data; struct xfs_mount *mp = chunk->ip->i_mount; - unsigned int folio_offset = chunk->bio.bi_io_vec->bv_offset; + phys_addr_t bvec_paddr = + bvec_phys(bio_first_bvec_all(&chunk->bio)); struct xfs_gc_bio *split_chunk; if (chunk->bio.bi_status) @@ -804,7 +827,7 @@ xfs_zone_gc_write_chunk( bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE); bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len, - folio_offset); + offset_in_folio(chunk->scratch->folio, bvec_paddr)); while ((split_chunk = xfs_zone_gc_split_write(data, chunk))) xfs_zone_gc_submit_write(data, split_chunk); @@ -962,6 +985,27 @@ xfs_zone_gc_reset_zones( } while (next); } +static bool +xfs_zone_gc_should_start_new_work( + struct xfs_zone_gc_data *data) +{ + if (xfs_is_shutdown(data->mp)) + return false; + if (!xfs_zone_gc_space_available(data)) + return false; + + if (!data->iter.victim_rtg) { + if (kthread_should_stop() || kthread_should_park()) + return false; + if (!xfs_zoned_need_gc(data->mp)) + return false; + if (!xfs_zone_gc_select_victim(data)) + return false; + } + + return true; +} + /* * Handle the work to read and write data for GC and to reset the zones, * including handling all completions. @@ -969,7 +1013,7 @@ xfs_zone_gc_reset_zones( * Note that the order of the chunks is preserved so that we don't undo the * optimal order established by xfs_zone_gc_query(). */ -static bool +static void xfs_zone_gc_handle_work( struct xfs_zone_gc_data *data) { @@ -983,30 +1027,22 @@ xfs_zone_gc_handle_work( zi->zi_reset_list = NULL; spin_unlock(&zi->zi_reset_list_lock); - if (!xfs_zone_gc_select_victim(data) || - !xfs_zone_gc_space_available(data)) { - if (list_empty(&data->reading) && - list_empty(&data->writing) && - list_empty(&data->resetting) && - !reset_list) - return false; - } - - __set_current_state(TASK_RUNNING); - try_to_freeze(); - - if (reset_list) + if (reset_list) { + set_current_state(TASK_RUNNING); xfs_zone_gc_reset_zones(data, reset_list); + } list_for_each_entry_safe(chunk, next, &data->resetting, entry) { if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) break; + set_current_state(TASK_RUNNING); xfs_zone_gc_finish_reset(chunk); } list_for_each_entry_safe(chunk, next, &data->writing, entry) { if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) break; + set_current_state(TASK_RUNNING); xfs_zone_gc_finish_chunk(chunk); } @@ -1014,15 +1050,18 @@ xfs_zone_gc_handle_work( list_for_each_entry_safe(chunk, next, &data->reading, entry) { if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) break; + set_current_state(TASK_RUNNING); xfs_zone_gc_write_chunk(chunk); } blk_finish_plug(&plug); - blk_start_plug(&plug); - while (xfs_zone_gc_start_chunk(data)) - ; - blk_finish_plug(&plug); - return true; + if (xfs_zone_gc_should_start_new_work(data)) { + set_current_state(TASK_RUNNING); + blk_start_plug(&plug); + while (xfs_zone_gc_start_chunk(data)) + ; + blk_finish_plug(&plug); + } } /* @@ -1046,8 +1085,18 @@ xfs_zoned_gcd( for (;;) { set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE); xfs_set_zonegc_running(mp); - if (xfs_zone_gc_handle_work(data)) + + xfs_zone_gc_handle_work(data); + + /* + * Only sleep if nothing set the state to running. Else check for + * work again as someone might have queued up more work and woken + * us in the meantime. + */ + if (get_current_state() == TASK_RUNNING) { + try_to_freeze(); continue; + } if (list_empty(&data->reading) && list_empty(&data->writing) && @@ -1133,16 +1182,16 @@ xfs_zone_gc_mount( goto out_put_gc_zone; } - mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data, + zi->zi_gc_thread = kthread_create(xfs_zoned_gcd, data, "xfs-zone-gc/%s", mp->m_super->s_id); - if (IS_ERR(mp->m_zone_info->zi_gc_thread)) { + if (IS_ERR(zi->zi_gc_thread)) { xfs_warn(mp, "unable to create zone gc thread"); - error = PTR_ERR(mp->m_zone_info->zi_gc_thread); + error = PTR_ERR(zi->zi_gc_thread); goto out_free_gc_data; } /* xfs_zone_gc_start will unpark for rw mounts */ - kthread_park(mp->m_zone_info->zi_gc_thread); + kthread_park(zi->zi_gc_thread); return 0; out_free_gc_data: diff --git a/fs/xfs/xfs_zone_info.c b/fs/xfs/xfs_zone_info.c index 733bcc2f8645..07e30c596975 100644 --- a/fs/xfs/xfs_zone_info.c +++ b/fs/xfs/xfs_zone_info.c @@ -32,7 +32,7 @@ xfs_show_open_zone( { seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s\n", rtg_rgno(oz->oz_rtg), - oz->oz_write_pointer, oz->oz_written, + oz->oz_allocated, oz->oz_written, rtg_rmap(oz->oz_rtg)->i_used_blocks, xfs_write_hint_to_str(oz->oz_write_hint)); } diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h index ab696975a993..ce7f0e2f4598 100644 --- a/fs/xfs/xfs_zone_priv.h +++ b/fs/xfs/xfs_zone_priv.h @@ -11,18 +11,18 @@ struct xfs_open_zone { atomic_t oz_ref; /* - * oz_write_pointer is the write pointer at which space is handed out - * for conventional zones, or simple the count of blocks handed out - * so far for sequential write required zones and is protected by - * oz_alloc_lock/ + * oz_allocated is the amount of space already allocated out of the zone + * and is protected by oz_alloc_lock. + * + * For conventional zones it also is the offset of the next write. */ spinlock_t oz_alloc_lock; - xfs_rgblock_t oz_write_pointer; + xfs_rgblock_t oz_allocated; /* - * oz_written is the number of blocks for which we've received a - * write completion. oz_written must always be <= oz_write_pointer - * and is protected by the ILOCK of the rmap inode. + * oz_written is the number of blocks for which we've received a write + * completion. oz_written must always be <= oz_allocated and is + * protected by the ILOCK of the rmap inode. */ xfs_rgblock_t oz_written; @@ -44,6 +44,8 @@ struct xfs_open_zone { * the life time of an open zone. */ struct xfs_rtgroup *oz_rtg; + + struct rcu_head oz_rcu; }; /* @@ -111,6 +113,7 @@ struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, int xfs_zone_gc_reset_sync(struct xfs_rtgroup *rtg); bool xfs_zoned_need_gc(struct xfs_mount *mp); +bool xfs_zoned_have_reclaimable(struct xfs_zone_info *zi); int xfs_zone_gc_mount(struct xfs_mount *mp); void xfs_zone_gc_unmount(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c index 93c9a7721139..fc1a4d1ce10c 100644 --- a/fs/xfs/xfs_zone_space_resv.c +++ b/fs/xfs/xfs_zone_space_resv.c @@ -10,6 +10,7 @@ #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_rtbitmap.h" +#include "xfs_icache.h" #include "xfs_zone_alloc.h" #include "xfs_zone_priv.h" #include "xfs_zones.h" @@ -53,12 +54,10 @@ xfs_zoned_default_resblks( { switch (ctr) { case XC_FREE_RTEXTENTS: - return (uint64_t)XFS_RESERVED_ZONES * - mp->m_groups[XG_TYPE_RTG].blocks + - mp->m_sb.sb_rtreserved; + return xfs_rtgs_to_rfsbs(mp, XFS_RESERVED_ZONES) + + mp->m_sb.sb_rtreserved; case XC_FREE_RTAVAILABLE: - return (uint64_t)XFS_GC_ZONES * - mp->m_groups[XG_TYPE_RTG].blocks; + return xfs_rtgs_to_rfsbs(mp, XFS_GC_ZONES); default: ASSERT(0); return 0; @@ -117,11 +116,10 @@ xfs_zoned_space_wait_error( static int xfs_zoned_reserve_available( - struct xfs_inode *ip, + struct xfs_mount *mp, xfs_filblks_t count_fsb, unsigned int flags) { - struct xfs_mount *mp = ip->i_mount; struct xfs_zone_info *zi = mp->m_zone_info; struct xfs_zone_reservation reservation = { .task = current, @@ -174,7 +172,7 @@ xfs_zoned_reserve_available( * processing a pending GC request give up as we're fully out * of space. */ - if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) && + if (!xfs_zoned_have_reclaimable(mp->m_zone_info) && !xfs_is_zonegc_running(mp)) break; @@ -198,11 +196,10 @@ xfs_zoned_reserve_available( */ static int xfs_zoned_reserve_extents_greedy( - struct xfs_inode *ip, + struct xfs_mount *mp, xfs_filblks_t *count_fsb, unsigned int flags) { - struct xfs_mount *mp = ip->i_mount; struct xfs_zone_info *zi = mp->m_zone_info; s64 len = *count_fsb; int error = -ENOSPC; @@ -220,12 +217,11 @@ xfs_zoned_reserve_extents_greedy( int xfs_zoned_space_reserve( - struct xfs_inode *ip, + struct xfs_mount *mp, xfs_filblks_t count_fsb, unsigned int flags, struct xfs_zone_alloc_ctx *ac) { - struct xfs_mount *mp = ip->i_mount; int error; ASSERT(ac->reserved_blocks == 0); @@ -233,12 +229,17 @@ xfs_zoned_space_reserve( error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb, flags & XFS_ZR_RESERVED); + if (error == -ENOSPC && !(flags & XFS_ZR_NOWAIT)) { + xfs_inodegc_flush(mp); + error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb, + flags & XFS_ZR_RESERVED); + } if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1) - error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags); + error = xfs_zoned_reserve_extents_greedy(mp, &count_fsb, flags); if (error) return error; - error = xfs_zoned_reserve_available(ip, count_fsb, flags); + error = xfs_zoned_reserve_available(mp, count_fsb, flags); if (error) { xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb); return error; @@ -249,12 +250,10 @@ xfs_zoned_space_reserve( void xfs_zoned_space_unreserve( - struct xfs_inode *ip, + struct xfs_mount *mp, struct xfs_zone_alloc_ctx *ac) { if (ac->reserved_blocks > 0) { - struct xfs_mount *mp = ip->i_mount; - xfs_zoned_add_available(mp, ac->reserved_blocks); xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks); } |
