Merge patch series "fs: introduce file_ref_t"

Christian Brauner <brauner@kernel.org> says: As atomic_inc_not_zero() is implemented with a try_cmpxchg() loop it has O(N^2) behaviour under contention with N concurrent operations and it is in a hot path in __fget_files_rcu(). The rcuref infrastructures remedies this problem by using an unconditional increment relying on safe- and dead zones to make this work and requiring rcu protection for the data structure in question. This not just scales better it also introduces overflow protection. However, in contrast to generic rcuref, files require a memory barrier and thus cannot rely on *_relaxed() atomic operations and also require to be built on atomic_long_t as having massive amounts of reference isn't unheard of even if it is just an attack. As suggested by Linus, add a file specific variant instead of making this a generic library. I've been testing this with will-it-scale using a multi-threaded fstat() on the same file descriptor on a machine that Jens gave me access (thank you very much!): processor : 511 vendor_id : AuthenticAMD cpu family : 25 model : 160 model name : AMD EPYC 9754 128-Core Processor and I consistently get a 3-5% improvement on workloads with 256+ and more threads comparing v6.12-rc1 as base with and without these patches applied. * patches from https://lore.kernel.org/r/20241007-brauner-file-rcuref-v2-0-387e24dc9163@kernel.org: fs: port files to file_ref fs: add file_ref fs: protect backing files with rcu Link: https://lore.kernel.org/r/20241007-brauner-file-rcuref-v2-0-387e24dc9163@kernel.org Signed-off-by: Christian Brauner <brauner@kernel.org>
author: Christian Brauner <brauner@kernel.org> 2024-10-08 13:05:18 +0200
committer: Christian Brauner <brauner@kernel.org> 2024-10-30 09:58:02 +0100
commit: 62eec753cae265002043872ba419d0887fe33ec6 (patch)
tree: a0b6604acca7446936589359cc5c70828b9b50a4 /fs/file.c
parent: 8cf0b93919e13d1e8d4466eb4080a4c4d9d66d7b (diff)
parent: 90ee6ed776c06435a3fe79c7f5344761f52e1760 (diff)
1 files changed, 70 insertions, 7 deletions
diff --git a/fs/file.c b/fs/file.c
index eb093e736972..9598c577f713 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -20,10 +20,73 @@
 #include <linux/spinlock.h>
 #include <linux/rcupdate.h>
 #include <linux/close_range.h>
+#include <linux/file_ref.h>
 #include <net/sock.h>
 
 #include "internal.h"
 
+/**
+ * __file_ref_put - Slowpath of file_ref_put()
+ * @ref:	Pointer to the reference count
+ * @cnt:	Current reference count
+ *
+ * Invoked when the reference count is outside of the valid zone.
+ *
+ * Return:
+ *	True if this was the last reference with no future references
+ *	possible. This signals the caller that it can safely schedule the
+ *	object, which is protected by the reference counter, for
+ *	deconstruction.
+ *
+ *	False if there are still active references or the put() raced
+ *	with a concurrent get()/put() pair. Caller is not allowed to
+ *	deconstruct the protected object.
+ */
+bool __file_ref_put(file_ref_t *ref, unsigned long cnt)
+{
+	/* Did this drop the last reference? */
+	if (likely(cnt == FILE_REF_NOREF)) {
+		/*
+		 * Carefully try to set the reference count to FILE_REF_DEAD.
+		 *
+		 * This can fail if a concurrent get() operation has
+		 * elevated it again or the corresponding put() even marked
+		 * it dead already. Both are valid situations and do not
+		 * require a retry. If this fails the caller is not
+		 * allowed to deconstruct the object.
+		 */
+		if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD))
+			return false;
+
+		/*
+		 * The caller can safely schedule the object for
+		 * deconstruction. Provide acquire ordering.
+		 */
+		smp_acquire__after_ctrl_dep();
+		return true;
+	}
+
+	/*
+	 * If the reference count was already in the dead zone, then this
+	 * put() operation is imbalanced. Warn, put the reference count back to
+	 * DEAD and tell the caller to not deconstruct the object.
+	 */
+	if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) {
+		atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
+		return false;
+	}
+
+	/*
+	 * This is a put() operation on a saturated refcount. Restore the
+	 * mean saturation value and tell the caller to not deconstruct the
+	 * object.
+	 */
+	if (cnt > FILE_REF_MAXREF)
+		atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);
+	return false;
+}
+EXPORT_SYMBOL_GPL(__file_ref_put);
+
 unsigned int sysctl_nr_open __read_mostly = 1024*1024;
 unsigned int sysctl_nr_open_min = BITS_PER_LONG;
 /* our min() is unusable in constant expressions ;-/ */
@@ -839,7 +902,7 @@ static struct file *__get_file_rcu(struct file __rcu **f)
 	if (!file)
 		return NULL;
 
-	if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+	if (unlikely(!file_ref_get(&file->f_ref)))
 		return ERR_PTR(-EAGAIN);
 
 	file_reloaded = rcu_dereference_raw(*f);
@@ -853,8 +916,8 @@ static struct file *__get_file_rcu(struct file __rcu **f)
 	OPTIMIZER_HIDE_VAR(file_reloaded_cmp);
 
 	/*
-	 * atomic_long_inc_not_zero() above provided a full memory
-	 * barrier when we acquired a reference.
+	 * file_ref_get() above provided a full memory barrier when we
+	 * acquired a reference.
 	 *
 	 * This is paired with the write barrier from assigning to the
 	 * __rcu protected file pointer so that if that pointer still
@@ -952,11 +1015,11 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
 		 * We need to confirm it by incrementing the refcount
 		 * and then check the lookup again.
 		 *
-		 * atomic_long_inc_not_zero() gives us a full memory
-		 * barrier. We only really need an 'acquire' one to
-		 * protect the loads below, but we don't have that.
+		 * file_ref_get() gives us a full memory barrier. We
+		 * only really need an 'acquire' one to protect the
+		 * loads below, but we don't have that.
 		 */
-		if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+		if (unlikely(!file_ref_get(&file->f_ref)))
 			continue;
 
 		/*
author	Christian Brauner <brauner@kernel.org>	2024-10-08 13:05:18 +0200
committer	Christian Brauner <brauner@kernel.org>	2024-10-30 09:58:02 +0100
commit	62eec753cae265002043872ba419d0887fe33ec6 (patch)
tree	a0b6604acca7446936589359cc5c70828b9b50a4 /fs/file.c
parent	8cf0b93919e13d1e8d4466eb4080a4c4d9d66d7b (diff)
parent	90ee6ed776c06435a3fe79c7f5344761f52e1760 (diff)