author | Christoph Hellwig <hch@lst.de> | 2025-01-16 07:01:42 +0100
committer | Carlos Maiolino <cem@kernel.org> | 2025-01-16 10:19:59 +0100
commit | ee10f6fcdb961e810d7b16be1285319c15c78ef6 (patch)
tree | 922781843c3071785b84fdb05fc6171c5010b0f9 /fs/xfs/xfs_buf.c
parent | 07eae0fa67ca4bbb199ad85645e0f9dfaef931cd (diff)
xfs: fix buffer lookup vs release race
Since commit 298f34224506 ("xfs: lockless buffer lookup") the buffer
lookup fastpath is done without a hash-wide lock (then pag_buf_lock, now
bc_lock) and only under RCU protection. But this means that nothing
serializes lookups against the temporary 0 reference count for buffers
that are added to the LRU after dropping the last regular reference,
and a concurrent lookup would fail to find them.
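For illustration only, a minimal userspace model of that window, written with C11 atomics; the struct and function names here are hypothetical stand-ins for the XFS structures, not the kernel code itself:

```c
#include <stdatomic.h>
#include <stdbool.h>

struct buf {
	atomic_int	b_hold;		/* reference count, standing in for xfs_buf.b_hold */
};

/* lockless lookup fastpath: succeed only if the count is currently non-zero */
static bool lookup_try_hold(struct buf *bp)
{
	int old = atomic_load(&bp->b_hold);

	/* userspace equivalent of atomic_inc_not_zero() */
	while (old != 0) {
		if (atomic_compare_exchange_weak(&bp->b_hold, &old, old + 1))
			return true;
	}
	return false;	/* lookup reports "not found" even though the buffer is cached */
}

/* release path before the fix: the count transiently hits zero */
static void release_to_lru(struct buf *bp)
{
	if (atomic_fetch_sub(&bp->b_hold, 1) == 1) {
		/*
		 * Window: b_hold is 0 here although the buffer stays in the
		 * cache; a concurrent lookup_try_hold() fails spuriously.
		 */
		/* ... add the buffer to the LRU ... */
		atomic_fetch_add(&bp->b_hold, 1);	/* re-take a hold for the LRU */
	}
}
```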
Fix this by doing all b_hold modifications under b_lock. We're already
doing this for release, so this "only" roughly doubles the b_lock round trips.
We'll later look into the lockref infrastructure to optimize the number
of lock round trips again.
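As a sketch of the locked try-hold pattern the patch introduces (compare xfs_buf_try_hold in the diff below), modeled here with a pthread mutex in place of b_lock; the names and the keep_cached parameter are illustrative assumptions, not the XFS API:

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

struct buf {
	pthread_mutex_t	b_lock;		/* stands in for the xfs_buf b_lock spinlock */
	unsigned int	b_hold;		/* plain integer, only modified under b_lock */
};

/* lookup side: fails only when the buffer truly has no references left */
static bool buf_try_hold(struct buf *bp)
{
	bool held = false;

	pthread_mutex_lock(&bp->b_lock);
	if (bp->b_hold != 0) {
		bp->b_hold++;
		held = true;
	}
	pthread_mutex_unlock(&bp->b_lock);
	return held;
}

/*
 * Release side: the decrement and the decision to keep a cached (LRU)
 * reference happen under the same lock, so b_hold is never observed as a
 * transient 0 by buf_try_hold().
 */
static void buf_release(struct buf *bp, bool keep_cached)
{
	pthread_mutex_lock(&bp->b_lock);
	if (bp->b_hold == 1 && keep_cached) {
		/* keep the last hold for the cache; no zero window */
		pthread_mutex_unlock(&bp->b_lock);
		return;
	}
	if (--bp->b_hold != 0) {
		pthread_mutex_unlock(&bp->b_lock);
		return;
	}
	pthread_mutex_unlock(&bp->b_lock);
	free(bp);	/* caller must have unhashed the buffer before the final drop */
}
```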
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
Diffstat (limited to 'fs/xfs/xfs_buf.c')
-rw-r--r-- | fs/xfs/xfs_buf.c | 93
1 file changed, 48 insertions, 45 deletions
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 183428fbc607..7fbdd4b30676 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -127,15 +127,6 @@ __xfs_buf_ioacct_dec(
 	}
 }
 
-static inline void
-xfs_buf_ioacct_dec(
-	struct xfs_buf	*bp)
-{
-	spin_lock(&bp->b_lock);
-	__xfs_buf_ioacct_dec(bp);
-	spin_unlock(&bp->b_lock);
-}
-
 /*
  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
  * b_lru_ref count so that the buffer is freed immediately when the buffer
@@ -171,9 +162,9 @@ xfs_buf_stale(
 	atomic_set(&bp->b_lru_ref, 0);
 	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
 	    (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
-		atomic_dec(&bp->b_hold);
+		bp->b_hold--;
 
-	ASSERT(atomic_read(&bp->b_hold) >= 1);
+	ASSERT(bp->b_hold >= 1);
 	spin_unlock(&bp->b_lock);
 }
 
@@ -229,14 +220,14 @@ _xfs_buf_alloc(
 	 */
 	flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
 
-	atomic_set(&bp->b_hold, 1);
+	spin_lock_init(&bp->b_lock);
+	bp->b_hold = 1;
 	atomic_set(&bp->b_lru_ref, 1);
 	init_completion(&bp->b_iowait);
 	INIT_LIST_HEAD(&bp->b_lru);
 	INIT_LIST_HEAD(&bp->b_list);
 	INIT_LIST_HEAD(&bp->b_li_list);
 	sema_init(&bp->b_sema, 0); /* held, no waiters */
-	spin_lock_init(&bp->b_lock);
 	bp->b_target = target;
 	bp->b_mount = target->bt_mount;
 	bp->b_flags = flags;
@@ -580,6 +571,20 @@ xfs_buf_find_lock(
 	return 0;
 }
 
+static bool
+xfs_buf_try_hold(
+	struct xfs_buf	*bp)
+{
+	spin_lock(&bp->b_lock);
+	if (bp->b_hold == 0) {
+		spin_unlock(&bp->b_lock);
+		return false;
+	}
+	bp->b_hold++;
+	spin_unlock(&bp->b_lock);
+	return true;
+}
+
 static inline int
 xfs_buf_lookup(
 	struct xfs_buf_cache	*bch,
@@ -592,7 +597,7 @@ xfs_buf_lookup(
 
 	rcu_read_lock();
 	bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
-	if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
+	if (!bp || !xfs_buf_try_hold(bp)) {
 		rcu_read_unlock();
 		return -ENOENT;
 	}
@@ -655,7 +660,7 @@ xfs_buf_find_insert(
 		spin_unlock(&bch->bc_lock);
 		goto out_free_buf;
 	}
-	if (bp && atomic_inc_not_zero(&bp->b_hold)) {
+	if (bp && xfs_buf_try_hold(bp)) {
 		/* found an existing buffer */
 		spin_unlock(&bch->bc_lock);
 		error = xfs_buf_find_lock(bp, flags);
@@ -1037,7 +1042,10 @@ xfs_buf_hold(
 	struct xfs_buf		*bp)
 {
 	trace_xfs_buf_hold(bp, _RET_IP_);
-	atomic_inc(&bp->b_hold);
+
+	spin_lock(&bp->b_lock);
+	bp->b_hold++;
+	spin_unlock(&bp->b_lock);
 }
 
 static void
@@ -1045,10 +1053,15 @@ xfs_buf_rele_uncached(
 	struct xfs_buf		*bp)
 {
 	ASSERT(list_empty(&bp->b_lru));
-	if (atomic_dec_and_test(&bp->b_hold)) {
-		xfs_buf_ioacct_dec(bp);
-		xfs_buf_free(bp);
+
+	spin_lock(&bp->b_lock);
+	if (--bp->b_hold) {
+		spin_unlock(&bp->b_lock);
+		return;
 	}
+	__xfs_buf_ioacct_dec(bp);
+	spin_unlock(&bp->b_lock);
+	xfs_buf_free(bp);
 }
 
 static void
@@ -1058,51 +1071,40 @@ xfs_buf_rele_cached(
 	struct xfs_buftarg	*btp = bp->b_target;
 	struct xfs_perag	*pag = bp->b_pag;
 	struct xfs_buf_cache	*bch = xfs_buftarg_buf_cache(btp, pag);
-	bool			release;
 	bool			freebuf = false;
 
 	trace_xfs_buf_rele(bp, _RET_IP_);
 
-	ASSERT(atomic_read(&bp->b_hold) > 0);
-
-	/*
-	 * We grab the b_lock here first to serialise racing xfs_buf_rele()
-	 * calls. The pag_buf_lock being taken on the last reference only
-	 * serialises against racing lookups in xfs_buf_find(). IOWs, the second
-	 * to last reference we drop here is not serialised against the last
-	 * reference until we take bp->b_lock. Hence if we don't grab b_lock
-	 * first, the last "release" reference can win the race to the lock and
-	 * free the buffer before the second-to-last reference is processed,
-	 * leading to a use-after-free scenario.
-	 */
 	spin_lock(&bp->b_lock);
-	release = atomic_dec_and_lock(&bp->b_hold, &bch->bc_lock);
-	if (!release) {
+	ASSERT(bp->b_hold >= 1);
+	if (bp->b_hold > 1) {
 		/*
 		 * Drop the in-flight state if the buffer is already on the LRU
 		 * and it holds the only reference. This is racy because we
 		 * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
 		 * ensures the decrement occurs only once per-buf.
 		 */
-		if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
+		if (--bp->b_hold == 1 && !list_empty(&bp->b_lru))
 			__xfs_buf_ioacct_dec(bp);
 		goto out_unlock;
 	}
 
-	/* the last reference has been dropped ... */
+	/* we are asked to drop the last reference */
+	spin_lock(&bch->bc_lock);
 	__xfs_buf_ioacct_dec(bp);
 	if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
 		/*
-		 * If the buffer is added to the LRU take a new reference to the
+		 * If the buffer is added to the LRU, keep the reference to the
 		 * buffer for the LRU and clear the (now stale) dispose list
-		 * state flag
+		 * state flag, else drop the reference.
 		 */
-		if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) {
+		if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru))
 			bp->b_state &= ~XFS_BSTATE_DISPOSE;
-			atomic_inc(&bp->b_hold);
-		}
+		else
+			bp->b_hold--;
 		spin_unlock(&bch->bc_lock);
 	} else {
+		bp->b_hold--;
 		/*
 		 * most of the time buffers will already be removed from the
 		 * LRU, so optimise that case by checking for the
@@ -1748,13 +1750,14 @@ xfs_buftarg_drain_rele(
 	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
 	struct list_head	*dispose = arg;
 
-	if (atomic_read(&bp->b_hold) > 1) {
+	if (!spin_trylock(&bp->b_lock))
+		return LRU_SKIP;
+	if (bp->b_hold > 1) {
 		/* need to wait, so skip it this pass */
+		spin_unlock(&bp->b_lock);
 		trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
 		return LRU_SKIP;
 	}
-	if (!spin_trylock(&bp->b_lock))
-		return LRU_SKIP;
 
 	/*
 	 * clear the LRU reference count so the buffer doesn't get
@@ -2093,7 +2096,7 @@ xfs_buf_delwri_queue(
 	 */
 	bp->b_flags |= _XBF_DELWRI_Q;
 	if (list_empty(&bp->b_list)) {
-		atomic_inc(&bp->b_hold);
+		xfs_buf_hold(bp);
 		list_add_tail(&bp->b_list, list);
 	}