diff options
| author | Carlos Maiolino <cem@kernel.org> | 2025-03-04 11:25:46 +0100 |
|---|---|---|
| committer | Carlos Maiolino <cem@kernel.org> | 2025-03-04 11:25:46 +0100 |
| commit | 4c6283ec9284bb72906dba83bc7a809747e6331e (patch) | |
| tree | 6a2ed104fc86a90bb787ff0dbee020461e59ec14 /fs/xfs/xfs_rtalloc.c | |
| parent | 0a1fd78080c8c9a5582e82100bd91b87ae5ac57c (diff) | |
| parent | 9c477912b2f58da71751f244aceecf5f8cc549ed (diff) | |
Merge tag 'xfs-zoned-allocator-2025-03-03' of git://git.infradead.org/users/hch/xfs into xfs-6.15-zoned_devices
xfs: add support for zoned devices
Add support for the new zoned space allocator and thus for zoned devices:
    https://zonedstorage.io/docs/introduction/zoned-storage
to XFS. This has been developed for and tested on both SMR hard drives,
which are the oldest and most common class of zoned devices:
   https://zonedstorage.io/docs/introduction/smr
and ZNS SSDs:
   https://zonedstorage.io/docs/introduction/zns
It has not been tested with zoned UFS devices, as their current capacity
points and performance characteristics aren't too interesting for XFS
use cases (but never say never).
Sequential write only zones are only supported for data using a new
allocator for the RT device, which maps each zone to a rtgroup which
is written sequentially.  All metadata and (for now) the log require
using randomly writable space. This means a realtime device is required
to support zoned storage, but for the common case of SMR hard drives
that contain random writable zones and sequential write required zones
on the same block device, the concept of an internal RT device is added
which means using XFS on an SMR HDD is as simple as:
$ mkfs.xfs /dev/sda
$ mount /dev/sda /mnt
When using NVMe ZNS SSDs that do not support conventional zones, the
traditional multi-device RT configuration is required.  E.g. for an
SSD with a conventional namespace 1 and a zoned namespace 2:
$ mkfs.xfs /dev/nvme0n1 -r rtdev=/dev/nvme0n2
$ mount -o rtdev=/dev/nvme0n2 /dev/nvme0n1 /mnt
The zoned allocator can also be used on conventional block devices, or
on conventional zones (e.g. when using an SMR HDD as the external RT
device).  For example using zoned XFS on normal SSDs shows very nice
performance advantages and write amplification reduction for intelligent
workloads like RocksDB.
Some work is still in progress or planned, but should not affect the
integration with the rest of XFS or the on-disk format:
 - support for quotas
 - support for reflinks
Note that the I/O path already supports reflink, but garbage collection
isn't refcount aware yet and would unshare shared blocks, thus rendering
the feature useless.
Diffstat (limited to 'fs/xfs/xfs_rtalloc.c')
| -rw-r--r-- | fs/xfs/xfs_rtalloc.c | 237 | 
1 files changed, 142 insertions, 95 deletions
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 57bef567e011..9a99629d7de4 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -33,6 +33,7 @@  #include "xfs_trace.h"  #include "xfs_rtrefcount_btree.h"  #include "xfs_reflink.h" +#include "xfs_zone_alloc.h"  /*   * Return whether there are any free extents in the size range given @@ -663,7 +664,8 @@ xfs_rtunmount_rtg(  	for (i = 0; i < XFS_RTGI_MAX; i++)  		xfs_rtginode_irele(&rtg->rtg_inodes[i]); -	kvfree(rtg->rtg_rsum_cache); +	if (!xfs_has_zoned(rtg_mount(rtg))) +		kvfree(rtg->rtg_rsum_cache);  }  static int @@ -858,6 +860,84 @@ xfs_growfs_rt_init_rtsb(  	return error;  } +static void +xfs_growfs_rt_sb_fields( +	struct xfs_trans	*tp, +	const struct xfs_mount	*nmp) +{ +	struct xfs_mount	*mp = tp->t_mountp; + +	if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize) +		xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE, +			nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize); +	if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks) +		xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS, +			nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks); +	if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks) +		xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS, +			nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks); +	if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents) +		xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS, +			nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents); +	if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog) +		xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG, +			nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog); +	if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount) +		xfs_trans_mod_sb(tp, XFS_TRANS_SB_RGCOUNT, +			nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount); +} + +static int +xfs_growfs_rt_zoned( +	struct xfs_rtgroup	*rtg, +	xfs_rfsblock_t		nrblocks) +{ +	struct xfs_mount	*mp = rtg_mount(rtg); +	struct xfs_mount	*nmp; +	struct xfs_trans	*tp; +	xfs_rtbxlen_t		freed_rtx; +	int			error; + +	/* +	 * Calculate new sb and mount fields for this round.  
Also ensure the +	 * rtg_extents value is uptodate as the rtbitmap code relies on it. +	 */ +	nmp = xfs_growfs_rt_alloc_fake_mount(mp, nrblocks, +			mp->m_sb.sb_rextsize); +	if (!nmp) +		return -ENOMEM; +	freed_rtx = nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents; + +	xfs_rtgroup_calc_geometry(nmp, rtg, rtg_rgno(rtg), +			nmp->m_sb.sb_rgcount, nmp->m_sb.sb_rextents); + +	error = xfs_trans_alloc(mp, &M_RES(nmp)->tr_growrtfree, 0, 0, 0, &tp); +	if (error) +		goto out_free; + +	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); +	xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); + +	xfs_growfs_rt_sb_fields(tp, nmp); +	xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, freed_rtx); + +	error = xfs_trans_commit(tp); +	if (error) +		goto out_free; + +	/* +	 * Ensure the mount RT feature flag is now set, and compute new +	 * maxlevels for rt btrees. +	 */ +	mp->m_features |= XFS_FEAT_REALTIME; +	xfs_rtrmapbt_compute_maxlevels(mp); +	xfs_rtrefcountbt_compute_maxlevels(mp); +	xfs_zoned_add_available(mp, freed_rtx); +out_free: +	kfree(nmp); +	return error; +} +  static int  xfs_growfs_rt_bmblock(  	struct xfs_rtgroup	*rtg, @@ -943,24 +1023,7 @@ xfs_growfs_rt_bmblock(  	/*  	 * Update superblock fields.  	 
*/ -	if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize) -		xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSIZE, -			nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize); -	if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks) -		xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBMBLOCKS, -			nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks); -	if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks) -		xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBLOCKS, -			nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks); -	if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents) -		xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTENTS, -			nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents); -	if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog) -		xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSLOG, -			nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog); -	if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount) -		xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RGCOUNT, -			nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount); +	xfs_growfs_rt_sb_fields(args.tp, nmp);  	/*  	 * Free the new extent. 
@@ -1127,6 +1190,11 @@ xfs_growfs_rtg(  			goto out_rele;  	} +	if (xfs_has_zoned(mp)) { +		error = xfs_growfs_rt_zoned(rtg, nrblocks); +		goto out_rele; +	} +  	error = xfs_growfs_rt_alloc_blocks(rtg, nrblocks, rextsize, &bmblocks);  	if (error)  		goto out_rele; @@ -1146,8 +1214,7 @@ xfs_growfs_rtg(  	if (old_rsum_cache)  		kvfree(old_rsum_cache); -	xfs_rtgroup_rele(rtg); -	return 0; +	goto out_rele;  out_error:  	/* @@ -1195,6 +1262,22 @@ xfs_growfs_check_rtgeom(  	if (min_logfsbs > mp->m_sb.sb_logblocks)  		return -EINVAL; + +	if (xfs_has_zoned(mp)) { +		uint32_t	gblocks = mp->m_groups[XG_TYPE_RTG].blocks; +		uint32_t	rem; + +		if (rextsize != 1) +			return -EINVAL; +		div_u64_rem(mp->m_sb.sb_rblocks, gblocks, &rem); +		if (rem) { +			xfs_warn(mp, +"new RT volume size (%lld) not aligned to RT group size (%d)", +				mp->m_sb.sb_rblocks, gblocks); +			return -EINVAL; +		} +	} +  	return 0;  } @@ -1249,6 +1332,35 @@ xfs_grow_last_rtg(  }  /* + * Read in the last block of the RT device to make sure it is accessible. + */ +static int +xfs_rt_check_size( +	struct xfs_mount	*mp, +	xfs_rfsblock_t		last_block) +{ +	xfs_daddr_t		daddr = XFS_FSB_TO_BB(mp, last_block); +	struct xfs_buf		*bp; +	int			error; + +	if (XFS_BB_TO_FSB(mp, daddr) != last_block) { +		xfs_warn(mp, "RT device size overflow: %llu != %llu", +			XFS_BB_TO_FSB(mp, daddr), last_block); +		return -EFBIG; +	} + +	error = xfs_buf_read_uncached(mp->m_rtdev_targp, +			XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart) + daddr, +			XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); +	if (error) +		xfs_warn(mp, "cannot read last RT device sector (%lld)", +				last_block); +	else +		xfs_buf_relse(bp); +	return error; +} + +/*   * Grow the realtime area of the filesystem.   
*/  int @@ -1259,7 +1371,6 @@ xfs_growfs_rt(  	xfs_rgnumber_t		old_rgcount = mp->m_sb.sb_rgcount;  	xfs_rgnumber_t		new_rgcount = 1;  	xfs_rgnumber_t		rgno; -	struct xfs_buf		*bp;  	xfs_agblock_t		old_rextsize = mp->m_sb.sb_rextsize;  	int			error; @@ -1302,15 +1413,10 @@ xfs_growfs_rt(  	error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks);  	if (error)  		goto out_unlock; -	/* -	 * Read in the last block of the device, make sure it exists. -	 */ -	error = xfs_buf_read_uncached(mp->m_rtdev_targp, -				XFS_FSB_TO_BB(mp, in->newblocks - 1), -				XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + +	error = xfs_rt_check_size(mp, in->newblocks - 1);  	if (error)  		goto out_unlock; -	xfs_buf_relse(bp);  	/*  	 * Calculate new parameters.  These are the final values to be reached. @@ -1376,8 +1482,7 @@ xfs_growfs_rt(  			error = error2;  		/* Reset the rt metadata btree space reservations. */ -		xfs_rt_resv_free(mp); -		error2 = xfs_rt_resv_init(mp); +		error2 = xfs_metafile_resv_init(mp);  		if (error2 && error2 != -ENOSPC)  			error = error2;  	} @@ -1444,10 +1549,6 @@ int				/* error */  xfs_rtmount_init(  	struct xfs_mount	*mp)	/* file system mount structure */  { -	struct xfs_buf		*bp;	/* buffer for last block of subvolume */ -	xfs_daddr_t		d;	/* address of last block of subvolume */ -	int			error; -  	if (mp->m_sb.sb_rblocks == 0)  		return 0;  	if (mp->m_rtdev_targp == NULL) { @@ -1458,25 +1559,7 @@ xfs_rtmount_init(  	mp->m_rsumblocks = xfs_rtsummary_blockcount(mp, &mp->m_rsumlevels); -	/* -	 * Check that the realtime section is an ok size. 
-	 */ -	d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); -	if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) { -		xfs_warn(mp, "realtime mount -- %llu != %llu", -			(unsigned long long) XFS_BB_TO_FSB(mp, d), -			(unsigned long long) mp->m_sb.sb_rblocks); -		return -EFBIG; -	} -	error = xfs_buf_read_uncached(mp->m_rtdev_targp, -					d - XFS_FSB_TO_BB(mp, 1), -					XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); -	if (error) { -		xfs_warn(mp, "realtime device size check failed"); -		return error; -	} -	xfs_buf_relse(bp); -	return 0; +	return xfs_rt_check_size(mp, mp->m_sb.sb_rblocks - 1);  }  static int @@ -1519,50 +1602,10 @@ xfs_rtalloc_reinit_frextents(  	spin_lock(&mp->m_sb_lock);  	mp->m_sb.sb_frextents = val;  	spin_unlock(&mp->m_sb_lock); -	percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); +	xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_sb.sb_frextents);  	return 0;  } -/* Free space reservations for rt metadata inodes. */ -void -xfs_rt_resv_free( -	struct xfs_mount	*mp) -{ -	struct xfs_rtgroup	*rtg = NULL; -	unsigned int		i; - -	while ((rtg = xfs_rtgroup_next(mp, rtg))) { -		for (i = 0; i < XFS_RTGI_MAX; i++) -			xfs_metafile_resv_free(rtg->rtg_inodes[i]); -	} -} - -/* Reserve space for rt metadata inodes' space expansion. */ -int -xfs_rt_resv_init( -	struct xfs_mount	*mp) -{ -	struct xfs_rtgroup	*rtg = NULL; -	xfs_filblks_t		ask; -	int			error = 0; - -	while ((rtg = xfs_rtgroup_next(mp, rtg))) { -		int		err2; - -		ask = xfs_rtrmapbt_calc_reserves(mp); -		err2 = xfs_metafile_resv_init(rtg_rmap(rtg), ask); -		if (err2 && !error) -			error = err2; - -		ask = xfs_rtrefcountbt_calc_reserves(mp); -		err2 = xfs_metafile_resv_init(rtg_refcount(rtg), ask); -		if (err2 && !error) -			error = err2; -	} - -	return error; -} -  /*   * Read in the bmbt of an rt metadata inode so that we never have to load them   * at runtime.  This enables the use of shared ILOCKs for rtbitmap scans.  
Use @@ -1613,6 +1656,8 @@ xfs_rtmount_rtg(  		}  	} +	if (xfs_has_zoned(mp)) +		return 0;  	return xfs_alloc_rsum_cache(rtg, mp->m_sb.sb_rbmblocks);  } @@ -2097,6 +2142,8 @@ xfs_bmap_rtalloc(  		ap->datatype & XFS_ALLOC_INITIAL_USER_DATA;  	int			error; +	ASSERT(!xfs_has_zoned(ap->tp->t_mountp)); +  retry:  	error = xfs_rtallocate_align(ap, &ralen, &raminlen, &prod, &noalign);  	if (error)  | 
