author     Christoph Hellwig <hch@lst.de>     2025-01-16 07:01:42 +0100
committer  Carlos Maiolino <cem@kernel.org>   2025-01-16 10:19:59 +0100
commit     ee10f6fcdb961e810d7b16be1285319c15c78ef6 (patch)
tree       922781843c3071785b84fdb05fc6171c5010b0f9 /fs/xfs/xfs_buf.c
parent     07eae0fa67ca4bbb199ad85645e0f9dfaef931cd (diff)
xfs: fix buffer lookup vs release race
Since commit 298f34224506 ("xfs: lockless buffer lookup") the buffer lookup fastpath is done without a hash-wide lock (then pag_buf_lock, now bc_lock) and only under RCU protection. But this means that nothing serializes lookups against the temporary 0 reference count for buffers that are added to the LRU after dropping the last regular reference, and a concurrent lookup would fail to find them.

Fix this by doing all b_hold modifications under b_lock. We're already doing this for release, so this "only" roughly doubles the b_lock round trips. We'll later look into the lockref infrastructure to optimize the number of lock round trips again.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
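To make the race concrete, here is a minimal userspace C model of the pre-patch behaviour (not part of the commit, and not kernel code): the release path drops the hold count to zero before the LRU reference is re-added, and a lockless lookup that lands in that window, i.e. the atomic_inc_not_zero() fastpath, misses a buffer that is still cached. The struct and helper names below are illustrative stand-ins for the xfs_buf fields and helpers, and the two CPUs' steps are serialized here only to expose the window deterministically.

/*
 * Userspace model of the pre-patch lookup vs release race.
 * "buf", "b_hold", "on_lru" and "lookup_try_hold" are stand-ins,
 * not kernel APIs.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct buf {
	atomic_int b_hold;	/* reference count, as before the patch */
	bool	   on_lru;	/* stands in for list_lru membership    */
};

/* Lockless lookup fastpath: the moral equivalent of atomic_inc_not_zero(). */
static bool lookup_try_hold(struct buf *bp)
{
	int old = atomic_load(&bp->b_hold);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&bp->b_hold, &old, old + 1))
			return true;	/* got a reference */
	}
	return false;			/* count is 0: lookup misses the buffer */
}

int main(void)
{
	struct buf bp = { .b_hold = 1, .on_lru = false };

	/* Release path, first half: the last regular reference is dropped. */
	atomic_fetch_sub(&bp.b_hold, 1);		/* b_hold is now 0 */

	/* A concurrent lookup landing in this window spuriously fails. */
	printf("lookup in the window: %s\n",
	       lookup_try_hold(&bp) ? "found" : "missed (the bug)");

	/* Release path, second half: buffer parked on the LRU, ref re-added. */
	bp.on_lru = true;
	atomic_fetch_add(&bp.b_hold, 1);

	return 0;
}

With the patch applied, xfs_buf_try_hold() and the release path both update b_hold under bp->b_lock, and the LRU case keeps its reference instead of dropping and re-taking it, so a lookup can no longer observe a transient zero for a buffer that is still cached.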
Diffstat (limited to 'fs/xfs/xfs_buf.c')
-rw-r--r--  fs/xfs/xfs_buf.c | 93
1 file changed, 48 insertions(+), 45 deletions(-)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 183428fbc607..7fbdd4b30676 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -127,15 +127,6 @@ __xfs_buf_ioacct_dec(
}
}
-static inline void
-xfs_buf_ioacct_dec(
- struct xfs_buf *bp)
-{
- spin_lock(&bp->b_lock);
- __xfs_buf_ioacct_dec(bp);
- spin_unlock(&bp->b_lock);
-}
-
/*
* When we mark a buffer stale, we remove the buffer from the LRU and clear the
* b_lru_ref count so that the buffer is freed immediately when the buffer
@@ -171,9 +162,9 @@ xfs_buf_stale(
atomic_set(&bp->b_lru_ref, 0);
if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
(list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
- atomic_dec(&bp->b_hold);
+ bp->b_hold--;
- ASSERT(atomic_read(&bp->b_hold) >= 1);
+ ASSERT(bp->b_hold >= 1);
spin_unlock(&bp->b_lock);
}
@@ -229,14 +220,14 @@ _xfs_buf_alloc(
*/
flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
- atomic_set(&bp->b_hold, 1);
+ spin_lock_init(&bp->b_lock);
+ bp->b_hold = 1;
atomic_set(&bp->b_lru_ref, 1);
init_completion(&bp->b_iowait);
INIT_LIST_HEAD(&bp->b_lru);
INIT_LIST_HEAD(&bp->b_list);
INIT_LIST_HEAD(&bp->b_li_list);
sema_init(&bp->b_sema, 0); /* held, no waiters */
- spin_lock_init(&bp->b_lock);
bp->b_target = target;
bp->b_mount = target->bt_mount;
bp->b_flags = flags;
@@ -580,6 +571,20 @@ xfs_buf_find_lock(
return 0;
}
+static bool
+xfs_buf_try_hold(
+ struct xfs_buf *bp)
+{
+ spin_lock(&bp->b_lock);
+ if (bp->b_hold == 0) {
+ spin_unlock(&bp->b_lock);
+ return false;
+ }
+ bp->b_hold++;
+ spin_unlock(&bp->b_lock);
+ return true;
+}
+
static inline int
xfs_buf_lookup(
struct xfs_buf_cache *bch,
@@ -592,7 +597,7 @@ xfs_buf_lookup(
rcu_read_lock();
bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
- if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
+ if (!bp || !xfs_buf_try_hold(bp)) {
rcu_read_unlock();
return -ENOENT;
}
@@ -655,7 +660,7 @@ xfs_buf_find_insert(
spin_unlock(&bch->bc_lock);
goto out_free_buf;
}
- if (bp && atomic_inc_not_zero(&bp->b_hold)) {
+ if (bp && xfs_buf_try_hold(bp)) {
/* found an existing buffer */
spin_unlock(&bch->bc_lock);
error = xfs_buf_find_lock(bp, flags);
@@ -1037,7 +1042,10 @@ xfs_buf_hold(
struct xfs_buf *bp)
{
trace_xfs_buf_hold(bp, _RET_IP_);
- atomic_inc(&bp->b_hold);
+
+ spin_lock(&bp->b_lock);
+ bp->b_hold++;
+ spin_unlock(&bp->b_lock);
}
static void
@@ -1045,10 +1053,15 @@ xfs_buf_rele_uncached(
struct xfs_buf *bp)
{
ASSERT(list_empty(&bp->b_lru));
- if (atomic_dec_and_test(&bp->b_hold)) {
- xfs_buf_ioacct_dec(bp);
- xfs_buf_free(bp);
+
+ spin_lock(&bp->b_lock);
+ if (--bp->b_hold) {
+ spin_unlock(&bp->b_lock);
+ return;
}
+ __xfs_buf_ioacct_dec(bp);
+ spin_unlock(&bp->b_lock);
+ xfs_buf_free(bp);
}
static void
@@ -1058,51 +1071,40 @@ xfs_buf_rele_cached(
struct xfs_buftarg *btp = bp->b_target;
struct xfs_perag *pag = bp->b_pag;
struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag);
- bool release;
bool freebuf = false;
trace_xfs_buf_rele(bp, _RET_IP_);
- ASSERT(atomic_read(&bp->b_hold) > 0);
-
- /*
- * We grab the b_lock here first to serialise racing xfs_buf_rele()
- * calls. The pag_buf_lock being taken on the last reference only
- * serialises against racing lookups in xfs_buf_find(). IOWs, the second
- * to last reference we drop here is not serialised against the last
- * reference until we take bp->b_lock. Hence if we don't grab b_lock
- * first, the last "release" reference can win the race to the lock and
- * free the buffer before the second-to-last reference is processed,
- * leading to a use-after-free scenario.
- */
spin_lock(&bp->b_lock);
- release = atomic_dec_and_lock(&bp->b_hold, &bch->bc_lock);
- if (!release) {
+ ASSERT(bp->b_hold >= 1);
+ if (bp->b_hold > 1) {
/*
* Drop the in-flight state if the buffer is already on the LRU
* and it holds the only reference. This is racy because we
* haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
* ensures the decrement occurs only once per-buf.
*/
- if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
+ if (--bp->b_hold == 1 && !list_empty(&bp->b_lru))
__xfs_buf_ioacct_dec(bp);
goto out_unlock;
}
- /* the last reference has been dropped ... */
+ /* we are asked to drop the last reference */
+ spin_lock(&bch->bc_lock);
__xfs_buf_ioacct_dec(bp);
if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
/*
- * If the buffer is added to the LRU take a new reference to the
+ * If the buffer is added to the LRU, keep the reference to the
* buffer for the LRU and clear the (now stale) dispose list
- * state flag
+ * state flag, else drop the reference.
*/
- if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) {
+ if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru))
bp->b_state &= ~XFS_BSTATE_DISPOSE;
- atomic_inc(&bp->b_hold);
- }
+ else
+ bp->b_hold--;
spin_unlock(&bch->bc_lock);
} else {
+ bp->b_hold--;
/*
* most of the time buffers will already be removed from the
* LRU, so optimise that case by checking for the
@@ -1748,13 +1750,14 @@ xfs_buftarg_drain_rele(
struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
struct list_head *dispose = arg;
- if (atomic_read(&bp->b_hold) > 1) {
+ if (!spin_trylock(&bp->b_lock))
+ return LRU_SKIP;
+ if (bp->b_hold > 1) {
/* need to wait, so skip it this pass */
+ spin_unlock(&bp->b_lock);
trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
return LRU_SKIP;
}
- if (!spin_trylock(&bp->b_lock))
- return LRU_SKIP;
/*
* clear the LRU reference count so the buffer doesn't get
@@ -2093,7 +2096,7 @@ xfs_buf_delwri_queue(
*/
bp->b_flags |= _XBF_DELWRI_Q;
if (list_empty(&bp->b_list)) {
- atomic_inc(&bp->b_hold);
+ xfs_buf_hold(bp);
list_add_tail(&bp->b_list, list);
}