| author | Christoph Hellwig <hch@lst.de> | 2025-10-17 05:55:41 +0200 |
|---|---|---|
| committer | Carlos Maiolino <cem@kernel.org> | 2025-10-21 11:32:50 +0200 |
| commit | ca3d643a970139f5456f90dd555a0955752d70cb | |
| tree | afff7d0607e0a40f3a340de0c2d972afcf04e27c /fs/xfs/xfs_zone_alloc.c | |
| parent | a8c861f401b4b2f8feda282abff929fa91c1f73a | |
xfs: cache open zone in inode->i_private
The MRU cache for open zones is unfortunately still not ideal, as it can
time out pretty easily when heavy I/O to hard disks uses up most or all
open zones. One option would be to just increase the timeout, but while
looking into that I realized we're better off caching the open zone
indefinitely, as there is no real downside to that once we don't hold a
reference to the cached open zone.
So switch the open zone to RCU freeing, and then stash the last used
open zone in inode->i_private. This helps significantly reduce
fragmentation by keeping I/O localized to zones for workloads that
write to HDDs through many open files.
Fixes: 4e4d52075577 ("xfs: add the zoned space allocator")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hans Holmberg <hans.holmberg@wdc.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Tested-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
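
The pattern the patch adopts (an RCU-protected pointer, a reference count that is only taken while still non-zero, and an xchg() to swap in a replacement) is self-contained enough to sketch outside the kernel. The following is a minimal userspace analog using C11 atomics. The names struct open_zone, cached_zone_get() and cached_zone_set() are illustrative only, and the kernel's rcu_read_lock()/call_rcu() lifetime guarantees are noted in comments rather than implemented, so treat this as a sketch of the idea, not the kernel code.

```c
/*
 * Minimal userspace analog of the i_private open-zone cache.
 * In the kernel, rcu_read_lock() keeps the object read from the cache
 * slot allocated until rcu_read_unlock(), and call_rcu() defers the
 * actual kfree(); those guarantees are only described in comments here.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct open_zone {
	atomic_int	ref;		/* modeled after oz->oz_ref */
	int		zone_id;
};

/* One cache slot, modeled after inode->i_private. */
static _Atomic(struct open_zone *) cached_zone;

/* Take a reference only if the object is still live (atomic_inc_not_zero). */
static struct open_zone *cached_zone_get(void)
{
	/* kernel: rcu_read_lock() would begin here */
	struct open_zone *oz = atomic_load(&cached_zone);

	if (oz) {
		int ref = atomic_load(&oz->ref);

		/* increment unless the count already hit zero */
		while (ref > 0 &&
		       !atomic_compare_exchange_weak(&oz->ref, &ref, ref + 1))
			;
		if (ref <= 0)
			oz = NULL;
	}
	/* kernel: rcu_read_unlock() would end the critical section */
	return oz;
}

static void zone_put(struct open_zone *oz)
{
	/* kernel: the final put would call_rcu() instead of freeing directly */
	if (atomic_fetch_sub(&oz->ref, 1) == 1)
		free(oz);
}

/* Publish a new cached zone and drop the reference held via the old one. */
static void cached_zone_set(struct open_zone *oz)
{
	struct open_zone *old;

	atomic_fetch_add(&oz->ref, 1);		/* the cache slot owns a ref */
	old = atomic_exchange(&cached_zone, oz);	/* like xchg() */
	if (old)
		zone_put(old);
}

int main(void)
{
	struct open_zone *oz = calloc(1, sizeof(*oz));
	struct open_zone *got;

	oz->zone_id = 42;
	atomic_store(&oz->ref, 1);	/* creator's reference */

	cached_zone_set(oz);
	got = cached_zone_get();
	if (got) {
		printf("got cached zone %d\n", got->zone_id);
		zone_put(got);
	}
	zone_put(oz);			/* drop the creator's reference */

	/* drop the cache slot's reference on "unmount" */
	got = atomic_exchange(&cached_zone, NULL);
	if (got)
		zone_put(got);
	return 0;
}
```

The key property mirrored here is that a reader may race with the final put: atomic_inc_not_zero() (modeled by the compare-exchange loop) fails once the count has reached zero, while in the kernel RCU additionally guarantees the memory itself stays valid for the duration of the read-side critical section.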
Diffstat (limited to 'fs/xfs/xfs_zone_alloc.c')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | fs/xfs/xfs_zone_alloc.c | 129 |

1 file changed, 45 insertions, 84 deletions
```diff
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
index 1b462cd5d8fa..23cdab4515bb 100644
--- a/fs/xfs/xfs_zone_alloc.c
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -26,14 +26,22 @@
 #include "xfs_trace.h"
 #include "xfs_mru_cache.h"
 
+static void
+xfs_open_zone_free_rcu(
+	struct callback_head	*cb)
+{
+	struct xfs_open_zone	*oz = container_of(cb, typeof(*oz), oz_rcu);
+
+	xfs_rtgroup_rele(oz->oz_rtg);
+	kfree(oz);
+}
+
 void
 xfs_open_zone_put(
 	struct xfs_open_zone	*oz)
 {
-	if (atomic_dec_and_test(&oz->oz_ref)) {
-		xfs_rtgroup_rele(oz->oz_rtg);
-		kfree(oz);
-	}
+	if (atomic_dec_and_test(&oz->oz_ref))
+		call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu);
 }
 
 static inline uint32_t
@@ -757,97 +765,54 @@ xfs_mark_rtg_boundary(
 }
 
 /*
- * Cache the last zone written to for an inode so that it is considered first
- * for subsequent writes.
- */
-struct xfs_zone_cache_item {
-	struct xfs_mru_cache_elem	mru;
-	struct xfs_open_zone		*oz;
-};
-
-static inline struct xfs_zone_cache_item *
-xfs_zone_cache_item(struct xfs_mru_cache_elem *mru)
-{
-	return container_of(mru, struct xfs_zone_cache_item, mru);
-}
-
-static void
-xfs_zone_cache_free_func(
-	void			*data,
-	struct xfs_mru_cache_elem *mru)
-{
-	struct xfs_zone_cache_item *item = xfs_zone_cache_item(mru);
-
-	xfs_open_zone_put(item->oz);
-	kfree(item);
-}
-
-/*
  * Check if we have a cached last open zone available for the inode and
  * if yes return a reference to it.
  */
 static struct xfs_open_zone *
-xfs_cached_zone(
-	struct xfs_mount	*mp,
-	struct xfs_inode	*ip)
+xfs_get_cached_zone(
+	struct xfs_inode	*ip)
 {
-	struct xfs_mru_cache_elem *mru;
-	struct xfs_open_zone	*oz;
+	struct xfs_open_zone	*oz;
 
-	mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
-	if (!mru)
-		return NULL;
-	oz = xfs_zone_cache_item(mru)->oz;
+	rcu_read_lock();
+	oz = VFS_I(ip)->i_private;
 	if (oz) {
 		/*
 		 * GC only steals open zones at mount time, so no GC zones
 		 * should end up in the cache.
 		 */
 		ASSERT(!oz->oz_is_gc);
-		ASSERT(atomic_read(&oz->oz_ref) > 0);
-		atomic_inc(&oz->oz_ref);
+		if (!atomic_inc_not_zero(&oz->oz_ref))
+			oz = NULL;
 	}
-	xfs_mru_cache_done(mp->m_zone_cache);
+	rcu_read_unlock();
+
 	return oz;
 }
 
 /*
- * Update the last used zone cache for a given inode.
+ * Stash our zone in the inode so that it is reused for future allocations.
  *
- * The caller must have a reference on the open zone.
+ * The open_zone structure will be pinned until either the inode is freed or
+ * until the cached open zone is replaced with a different one because the
+ * current one was full when we tried to use it.  This means we keep any
+ * open zone around forever as long as any inode that used it for the last
+ * write is cached, which slightly increases the memory use of cached inodes
+ * that were ever written to, but significantly simplifies the cached zone
+ * lookup.  Because the open_zone is clearly marked as full when all data
+ * in the underlying RTG was written, the caching is always safe.
  */
 static void
-xfs_zone_cache_create_association(
-	struct xfs_inode	*ip,
-	struct xfs_open_zone	*oz)
+xfs_set_cached_zone(
+	struct xfs_inode	*ip,
+	struct xfs_open_zone	*oz)
 {
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_zone_cache_item *item = NULL;
-	struct xfs_mru_cache_elem *mru;
+	struct xfs_open_zone	*old_oz;
 
-	ASSERT(atomic_read(&oz->oz_ref) > 0);
 	atomic_inc(&oz->oz_ref);
-
-	mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
-	if (mru) {
-		/*
-		 * If we have an association already, update it to point to the
-		 * new zone.
-		 */
-		item = xfs_zone_cache_item(mru);
-		xfs_open_zone_put(item->oz);
-		item->oz = oz;
-		xfs_mru_cache_done(mp->m_zone_cache);
-		return;
-	}
-
-	item = kmalloc(sizeof(*item), GFP_KERNEL);
-	if (!item) {
-		xfs_open_zone_put(oz);
-		return;
-	}
-	item->oz = oz;
-	xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru);
+	old_oz = xchg(&VFS_I(ip)->i_private, oz);
+	if (old_oz)
+		xfs_open_zone_put(old_oz);
 }
 
 static void
@@ -891,15 +856,14 @@ xfs_zone_alloc_and_submit(
 	 * the inode is still associated with a zone and use that if so.
 	 */
 	if (!*oz)
-		*oz = xfs_cached_zone(mp, ip);
+		*oz = xfs_get_cached_zone(ip);
 
 	if (!*oz) {
 select_zone:
 		*oz = xfs_select_zone(mp, write_hint, pack_tight);
 		if (!*oz)
 			goto out_error;
-
-		xfs_zone_cache_create_association(ip, *oz);
+		xfs_set_cached_zone(ip, *oz);
 	}
 
 	alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
@@ -977,6 +941,12 @@ xfs_free_open_zones(
 		xfs_open_zone_put(oz);
 	}
 	spin_unlock(&zi->zi_open_zones_lock);
+
+	/*
+	 * Wait for all open zones to be freed so that they drop the group
+	 * references:
+	 */
+	rcu_barrier();
 }
 
 struct xfs_init_zones {
@@ -1290,14 +1260,6 @@ xfs_mount_zones(
 	error = xfs_zone_gc_mount(mp);
 	if (error)
 		goto out_free_zone_info;
-
-	/*
-	 * Set up a mru cache to track inode to open zone for data placement
-	 * purposes.  The magic values for group count and life time is the
-	 * same as the defaults for file streams, which seems sane enough.
-	 */
-	xfs_mru_cache_create(&mp->m_zone_cache, mp,
-			5000, 10, xfs_zone_cache_free_func);
 	return 0;
 
 out_free_zone_info:
@@ -1311,5 +1273,4 @@ xfs_unmount_zones(
 {
 	xfs_zone_gc_unmount(mp);
 	xfs_free_zone_info(mp->m_zone_info);
-	xfs_mru_cache_destroy(mp->m_zone_cache);
 }
```
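
One subtlety worth calling out is the rcu_barrier() added to xfs_free_open_zones(): the final xfs_open_zone_put() no longer frees the zone synchronously but only queues xfs_open_zone_free_rcu(), and it is that deferred callback which drops the rtgroup reference, so teardown must wait for all outstanding callbacks before the groups themselves go away. Below is a rough userspace model of that ordering, with a worker thread standing in for the RCU callback machinery; all names here (defer_free, drain, barrier) are hypothetical and chosen only for the sketch.

```c
/*
 * Userspace sketch of the unmount ordering the patch enforces with
 * rcu_barrier(): frees are deferred (standing in for call_rcu()), so
 * teardown must drain the deferred work before releasing the resources
 * those callbacks still reference.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* A deferred-free queue standing in for the call_rcu() callback list. */
struct deferred {
	struct deferred	*next;
	void		*obj;
};

static struct deferred	*queue;
static pthread_mutex_t	lock = PTHREAD_MUTEX_INITIALIZER;

static void defer_free(void *obj)	/* models call_rcu() */
{
	struct deferred *d = malloc(sizeof(*d));

	d->obj = obj;
	pthread_mutex_lock(&lock);
	d->next = queue;
	queue = d;
	pthread_mutex_unlock(&lock);
}

static void *drain(void *arg)		/* models the RCU callback run */
{
	pthread_mutex_lock(&lock);
	while (queue) {
		struct deferred *d = queue;

		queue = d->next;
		printf("releasing group reference for %p\n", d->obj);
		free(d->obj);		/* models xfs_rtgroup_rele + kfree */
		free(d);
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

static void barrier(void)		/* models rcu_barrier() */
{
	pthread_t t;

	pthread_create(&t, NULL, drain, NULL);
	pthread_join(t, NULL);		/* block until all callbacks ran */
}

int main(void)
{
	defer_free(malloc(16));		/* last put on each open zone */
	defer_free(malloc(16));

	barrier();			/* xfs_free_open_zones(): rcu_barrier() */

	/* only now is it safe to tear down the groups themselves */
	printf("tearing down rtgroups\n");
	return 0;
}
```

The point is only the ordering: teardown blocks until every deferred release has executed, just as rcu_barrier() blocks until every pending call_rcu() callback has run.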
