summaryrefslogtreecommitdiff
path: root/fs/namespace.c
diff options
context:
space:
mode:
authorChristian Brauner <brauner@kernel.org>2024-12-13 00:03:42 +0100
committerChristian Brauner <brauner@kernel.org>2025-01-09 16:58:52 +0100
commit5dcbd85d35515532c93b337a3f8be54937aeea8a (patch)
tree2406e976afa4078deee0f5c2b92ca5660927c4b7 /fs/namespace.c
parent144acef3334eb664fae4a2e1e35fdd693fc07d4e (diff)
fs: lockless mntns rbtree lookup
Currently we use a read-write lock but for the simple search case we can make this lockless. Creating a new mount namespace is a rather rare event compared with querying mounts in a foreign mount namespace. Once this is picked up by e.g., systemd to list mounts in another mount in it's isolated services or in containers this will be used a lot so this seems worthwhile doing. Link: https://lore.kernel.org/r/20241213-work-mount-rbtree-lockless-v3-3-6e3cdaf9b280@kernel.org Reviewed-by: Jeff Layton <jlayton@kernel.org> Signed-off-by: Christian Brauner <brauner@kernel.org>
Diffstat (limited to 'fs/namespace.c')
-rw-r--r--fs/namespace.c116
1 files changed, 70 insertions, 46 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index 0ca2cda6bc16..d2dbbbc91cc0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -79,6 +79,8 @@ static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
static DEFINE_RWLOCK(mnt_ns_tree_lock);
+static seqcount_rwlock_t mnt_ns_tree_seqcount = SEQCNT_RWLOCK_ZERO(mnt_ns_tree_seqcount, &mnt_ns_tree_lock);
+
static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
struct mount_kattr {
@@ -105,17 +107,6 @@ EXPORT_SYMBOL_GPL(fs_kobj);
*/
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
-static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
-{
- u64 seq_b = ns->seq;
-
- if (seq < seq_b)
- return -1;
- if (seq > seq_b)
- return 1;
- return 0;
-}
-
static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
{
if (!node)
@@ -123,19 +114,41 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
}
-static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
+static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
{
struct mnt_namespace *ns_a = node_to_mnt_ns(a);
struct mnt_namespace *ns_b = node_to_mnt_ns(b);
u64 seq_a = ns_a->seq;
+ u64 seq_b = ns_b->seq;
- return mnt_ns_cmp(seq_a, ns_b) < 0;
+ if (seq_a < seq_b)
+ return -1;
+ if (seq_a > seq_b)
+ return 1;
+ return 0;
+}
+
+static inline void mnt_ns_tree_write_lock(void)
+{
+ write_lock(&mnt_ns_tree_lock);
+ write_seqcount_begin(&mnt_ns_tree_seqcount);
+}
+
+static inline void mnt_ns_tree_write_unlock(void)
+{
+ write_seqcount_end(&mnt_ns_tree_seqcount);
+ write_unlock(&mnt_ns_tree_lock);
}
static void mnt_ns_tree_add(struct mnt_namespace *ns)
{
- guard(write_lock)(&mnt_ns_tree_lock);
- rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
+ struct rb_node *node;
+
+ mnt_ns_tree_write_lock();
+ node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
+ mnt_ns_tree_write_unlock();
+
+ WARN_ON_ONCE(node);
}
static void mnt_ns_release(struct mnt_namespace *ns)
@@ -150,41 +163,33 @@ static void mnt_ns_release(struct mnt_namespace *ns)
}
DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
+static void mnt_ns_release_rcu(struct rcu_head *rcu)
+{
+ mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
+}
+
static void mnt_ns_tree_remove(struct mnt_namespace *ns)
{
/* remove from global mount namespace list */
if (!is_anon_ns(ns)) {
- guard(write_lock)(&mnt_ns_tree_lock);
+ mnt_ns_tree_write_lock();
rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
+ mnt_ns_tree_write_unlock();
}
- mnt_ns_release(ns);
+ call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
}
-/*
- * Returns the mount namespace which either has the specified id, or has the
- * next smallest id afer the specified one.
- */
-static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
+static int mnt_ns_find(const void *key, const struct rb_node *node)
{
- struct rb_node *node = mnt_ns_tree.rb_node;
- struct mnt_namespace *ret = NULL;
+ const u64 mnt_ns_id = *(u64 *)key;
+ const struct mnt_namespace *ns = node_to_mnt_ns(node);
- lockdep_assert_held(&mnt_ns_tree_lock);
-
- while (node) {
- struct mnt_namespace *n = node_to_mnt_ns(node);
-
- if (mnt_ns_id <= n->seq) {
- ret = node_to_mnt_ns(node);
- if (mnt_ns_id == n->seq)
- break;
- node = node->rb_left;
- } else {
- node = node->rb_right;
- }
- }
- return ret;
+ if (mnt_ns_id < ns->seq)
+ return -1;
+ if (mnt_ns_id > ns->seq)
+ return 1;
+ return 0;
}
/*
@@ -194,18 +199,37 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
* namespace the @namespace_sem must first be acquired. If the namespace has
* already shut down before acquiring @namespace_sem, {list,stat}mount() will
* see that the mount rbtree of the namespace is empty.
+ *
+ * Note the lookup is lockless protected by a sequence counter. We only
+ * need to guard against false negatives as false positives aren't
+ * possible. So if we didn't find a mount namespace and the sequence
+ * counter has changed we need to retry. If the sequence counter is
+ * still the same we know the search actually failed.
*/
static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
{
- struct mnt_namespace *ns;
+ struct mnt_namespace *ns;
+ struct rb_node *node;
+ unsigned int seq;
+
+ guard(rcu)();
+ do {
+ seq = read_seqcount_begin(&mnt_ns_tree_seqcount);
+ node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
+ if (node)
+ break;
+ } while (read_seqcount_retry(&mnt_ns_tree_seqcount, seq));
- guard(read_lock)(&mnt_ns_tree_lock);
- ns = mnt_ns_find_id_at(mnt_ns_id);
- if (!ns || ns->seq != mnt_ns_id)
- return NULL;
+ if (!node)
+ return NULL;
- refcount_inc(&ns->passive);
- return ns;
+ /*
+ * The last reference count is put with RCU delay so we can
+ * unconditonally acquire a reference here.
+ */
+ ns = node_to_mnt_ns(node);
+ refcount_inc(&ns->passive);
+ return ns;
}
static inline void lock_mount_hash(void)