diff options
author | Hans Holmberg <hans.holmberg@wdc.com> | 2025-01-31 09:39:36 +0100 |
---|---|---|
committer | Christoph Hellwig <hch@lst.de> | 2025-03-03 08:17:10 -0700 |
commit | 64d0361114fdeae992da2a6c409041c3c13b63ae (patch) | |
tree | d7b2b55189ae6a6a6ec5fdc139dd653d4044d792 /fs/xfs/xfs_zone_alloc.c | |
parent | 7452a6daf9f9c343d483cff60a0016379c1edb6c (diff) |
xfs: support write life time based data placement
Add a file write life time data placement allocation scheme that aims to
minimize fragmentation and thereby to do two things:
a) separate file data to different zones when possible.
b) colocate file data of similar life times when feasible.
To get best results, average file sizes should align with the zone
capacity that is reported through the XFS_IOC_FSGEOMETRY ioctl.
This improvement in data placement efficiency reduces the number of
blocks requiring relocation by GC, and thus decreases overall write
amplification. The impact on performance varies depending on how full
the file system is.
For RocksDB using leveled compaction, the lifetime hints can improve
throughput for overwrite workloads at 80% file system utilization by
~10%, but for lower file system utilization there won't be as much
benefit in application performance as there is less need for garbage
collection to start with.
Lifetime hints can be disabled using the nolifetime mount option.
Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
Diffstat (limited to 'fs/xfs/xfs_zone_alloc.c')
-rw-r--r-- | fs/xfs/xfs_zone_alloc.c | 130 |
1 files changed, 114 insertions, 16 deletions
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c index b7b2820ec0ef..fd4c60a050e6 100644 --- a/fs/xfs/xfs_zone_alloc.c +++ b/fs/xfs/xfs_zone_alloc.c @@ -424,6 +424,7 @@ static struct xfs_open_zone * xfs_init_open_zone( struct xfs_rtgroup *rtg, xfs_rgblock_t write_pointer, + enum rw_hint write_hint, bool is_gc) { struct xfs_open_zone *oz; @@ -434,6 +435,7 @@ xfs_init_open_zone( oz->oz_rtg = rtg; oz->oz_write_pointer = write_pointer; oz->oz_written = write_pointer; + oz->oz_write_hint = write_hint; oz->oz_is_gc = is_gc; /* @@ -453,6 +455,7 @@ xfs_init_open_zone( struct xfs_open_zone * xfs_open_zone( struct xfs_mount *mp, + enum rw_hint write_hint, bool is_gc) { struct xfs_zone_info *zi = mp->m_zone_info; @@ -465,12 +468,13 @@ xfs_open_zone( return NULL; set_current_state(TASK_RUNNING); - return xfs_init_open_zone(to_rtg(xg), 0, is_gc); + return xfs_init_open_zone(to_rtg(xg), 0, write_hint, is_gc); } static struct xfs_open_zone * xfs_try_open_zone( - struct xfs_mount *mp) + struct xfs_mount *mp, + enum rw_hint write_hint) { struct xfs_zone_info *zi = mp->m_zone_info; struct xfs_open_zone *oz; @@ -487,7 +491,7 @@ xfs_try_open_zone( */ zi->zi_nr_open_zones++; spin_unlock(&zi->zi_open_zones_lock); - oz = xfs_open_zone(mp, false); + oz = xfs_open_zone(mp, write_hint, false); spin_lock(&zi->zi_open_zones_lock); if (!oz) { zi->zi_nr_open_zones--; @@ -510,17 +514,79 @@ xfs_try_open_zone( return oz; } +/* + * For data with short or medium lifetime, try to colocated it into an + * already open zone with a matching temperature. + */ +static bool +xfs_colocate_eagerly( + enum rw_hint file_hint) +{ + switch (file_hint) { + case WRITE_LIFE_MEDIUM: + case WRITE_LIFE_SHORT: + case WRITE_LIFE_NONE: + return true; + default: + return false; + } +} + +static bool +xfs_good_hint_match( + struct xfs_open_zone *oz, + enum rw_hint file_hint) +{ + switch (oz->oz_write_hint) { + case WRITE_LIFE_LONG: + case WRITE_LIFE_EXTREME: + /* colocate long and extreme */ + if (file_hint == WRITE_LIFE_LONG || + file_hint == WRITE_LIFE_EXTREME) + return true; + break; + case WRITE_LIFE_MEDIUM: + /* colocate medium with medium */ + if (file_hint == WRITE_LIFE_MEDIUM) + return true; + break; + case WRITE_LIFE_SHORT: + case WRITE_LIFE_NONE: + case WRITE_LIFE_NOT_SET: + /* colocate short and none */ + if (file_hint <= WRITE_LIFE_SHORT) + return true; + break; + } + return false; +} + static bool xfs_try_use_zone( struct xfs_zone_info *zi, - struct xfs_open_zone *oz) + enum rw_hint file_hint, + struct xfs_open_zone *oz, + bool lowspace) { if (oz->oz_write_pointer == rtg_blocks(oz->oz_rtg)) return false; + if (!lowspace && !xfs_good_hint_match(oz, file_hint)) + return false; if (!atomic_inc_not_zero(&oz->oz_ref)) return false; /* + * If we have a hint set for the data, use that for the zone even if + * some data was written already without any hint set, but don't change + * the temperature after that as that would make little sense without + * tracking per-temperature class written block counts, which is + * probably overkill anyway. + */ + if (file_hint != WRITE_LIFE_NOT_SET && + oz->oz_write_hint == WRITE_LIFE_NOT_SET) + oz->oz_write_hint = file_hint; + + /* * If we couldn't match by inode or life time we just pick the first * zone with enough space above. For that we want the least busy zone * for some definition of "least" busy. For now this simple LRU @@ -534,14 +600,16 @@ xfs_try_use_zone( static struct xfs_open_zone * xfs_select_open_zone_lru( - struct xfs_zone_info *zi) + struct xfs_zone_info *zi, + enum rw_hint file_hint, + bool lowspace) { struct xfs_open_zone *oz; lockdep_assert_held(&zi->zi_open_zones_lock); list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) - if (xfs_try_use_zone(zi, oz)) + if (xfs_try_use_zone(zi, file_hint, oz, lowspace)) return oz; cond_resched_lock(&zi->zi_open_zones_lock); @@ -550,20 +618,28 @@ xfs_select_open_zone_lru( static struct xfs_open_zone * xfs_select_open_zone_mru( - struct xfs_zone_info *zi) + struct xfs_zone_info *zi, + enum rw_hint file_hint) { struct xfs_open_zone *oz; lockdep_assert_held(&zi->zi_open_zones_lock); list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry) - if (xfs_try_use_zone(zi, oz)) + if (xfs_try_use_zone(zi, file_hint, oz, false)) return oz; cond_resched_lock(&zi->zi_open_zones_lock); return NULL; } +static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip) +{ + if (xfs_has_nolifetime(ip->i_mount)) + return WRITE_LIFE_NOT_SET; + return VFS_I(ip)->i_write_hint; +} + /* * Try to pack inodes that are written back after they were closed tight instead * of trying to open new zones for them or spread them to the least recently @@ -587,6 +663,7 @@ static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip) static struct xfs_open_zone * xfs_select_zone_nowait( struct xfs_mount *mp, + enum rw_hint write_hint, bool pack_tight) { struct xfs_zone_info *zi = mp->m_zone_info; @@ -595,20 +672,38 @@ xfs_select_zone_nowait( if (xfs_is_shutdown(mp)) return NULL; + /* + * Try to fill up open zones with matching temperature if available. It + * is better to try to co-locate data when this is favorable, so we can + * activate empty zones when it is statistically better to separate + * data. + */ spin_lock(&zi->zi_open_zones_lock); - if (pack_tight) - oz = xfs_select_open_zone_mru(zi); + if (xfs_colocate_eagerly(write_hint)) + oz = xfs_select_open_zone_lru(zi, write_hint, false); + else if (pack_tight) + oz = xfs_select_open_zone_mru(zi, write_hint); if (oz) goto out_unlock; /* * See if we can open a new zone and use that. */ - oz = xfs_try_open_zone(mp); + oz = xfs_try_open_zone(mp, write_hint); if (oz) goto out_unlock; - oz = xfs_select_open_zone_lru(zi); + /* + * Try to colocate cold data with other cold data if we failed to open a + * new zone for it. + */ + if (write_hint != WRITE_LIFE_NOT_SET && + !xfs_colocate_eagerly(write_hint)) + oz = xfs_select_open_zone_lru(zi, write_hint, false); + if (!oz) + oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false); + if (!oz) + oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true); out_unlock: spin_unlock(&zi->zi_open_zones_lock); return oz; @@ -617,19 +712,20 @@ out_unlock: static struct xfs_open_zone * xfs_select_zone( struct xfs_mount *mp, + enum rw_hint write_hint, bool pack_tight) { struct xfs_zone_info *zi = mp->m_zone_info; DEFINE_WAIT (wait); struct xfs_open_zone *oz; - oz = xfs_select_zone_nowait(mp, pack_tight); + oz = xfs_select_zone_nowait(mp, write_hint, pack_tight); if (oz) return oz; for (;;) { prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE); - oz = xfs_select_zone_nowait(mp, pack_tight); + oz = xfs_select_zone_nowait(mp, write_hint, pack_tight); if (oz) break; schedule(); @@ -707,6 +803,7 @@ xfs_zone_alloc_and_submit( { struct xfs_inode *ip = XFS_I(ioend->io_inode); struct xfs_mount *mp = ip->i_mount; + enum rw_hint write_hint = xfs_inode_write_hint(ip); bool pack_tight = xfs_zoned_pack_tight(ip); unsigned int alloc_len; struct iomap_ioend *split; @@ -724,7 +821,7 @@ xfs_zone_alloc_and_submit( *oz = xfs_last_used_zone(ioend); if (!*oz) { select_zone: - *oz = xfs_select_zone(mp, pack_tight); + *oz = xfs_select_zone(mp, write_hint, pack_tight); if (!*oz) goto out_error; } @@ -862,7 +959,8 @@ xfs_init_zone( struct xfs_open_zone *oz; atomic_inc(&rtg_group(rtg)->xg_active_ref); - oz = xfs_init_open_zone(rtg, write_pointer, false); + oz = xfs_init_open_zone(rtg, write_pointer, WRITE_LIFE_NOT_SET, + false); list_add_tail(&oz->oz_entry, &zi->zi_open_zones); zi->zi_nr_open_zones++; |