author    Suren Baghdasaryan <surenb@google.com>    2025-02-13 14:46:49 -0800
committer Andrew Morton <akpm@linux-foundation.org>    2025-03-16 22:06:20 -0700
commit    f35ab95ca0af7a27feab57b9d7e906405bddb093 (patch)
tree      edd5498aae7dadfe230f2488704c4be05f173a08 /kernel/fork.c
parent    4e0dbe105d5088c77eb09de6f049aaf44711a2ec (diff)
mm: replace vm_lock and detached flag with a reference count
rw_semaphore is a sizable structure of 40 bytes and consumes considerable space for each vm_area_struct. However vma_lock has two important specifics which can be used to replace rw_semaphore with a simpler structure:

1. Readers never wait. They try to take the vma_lock and fall back to mmap_lock if that fails.
2. Only one writer at a time will ever try to write-lock a vma_lock because writers first take mmap_lock in write mode.

Because of these requirements, full rw_semaphore functionality is not needed and we can replace rw_semaphore and the vma->detached flag with a refcount (vm_refcnt).

When vma is in detached state, vm_refcnt is 0 and only a call to vma_mark_attached() can take it out of this state. Note that unlike before, now we enforce both vma_mark_attached() and vma_mark_detached() to be done only after vma has been write-locked. vma_mark_attached() changes vm_refcnt to 1 to indicate that it has been attached to the vma tree.

When a reader takes read lock, it increments vm_refcnt, unless the top usable bit of vm_refcnt (0x40000000) is set, indicating presence of a writer. When writer takes write lock, it sets the top usable bit to indicate its presence. If there are readers, writer will wait using newly introduced mm->vma_writer_wait. Since all writers take mmap_lock in write mode first, there can be only one writer at a time. The last reader to release the lock will signal the writer to wake up. refcount might overflow if there are many competing readers, in which case read-locking will fail. Readers are expected to handle such failures.

In summary:

1. All readers increment the vm_refcnt;
2. Writer sets top usable (writer) bit of vm_refcnt;
3. Readers cannot increment the vm_refcnt if the writer bit is set;
4. In the presence of readers, writer must wait for the vm_refcnt to drop to 1 (plus the VMA_LOCK_OFFSET writer bit), indicating an attached vma with no readers;
5. vm_refcnt overflow is handled by the readers.

While this vm_lock replacement does not yet result in a smaller vm_area_struct (it stays at 256 bytes due to cacheline alignment), it allows for further size optimization by structure member regrouping to bring the size of vm_area_struct below 192 bytes.

[surenb@google.com: fix a crash due to vma_end_read() that should have been removed]
Link: https://lkml.kernel.org/r/20250220200208.323769-1-surenb@google.com
Link: https://lkml.kernel.org/r/20250213224655.1680278-13-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Suggested-by: Matthew Wilcox <willy@infradead.org>
Tested-by: Shivank Garg <shivankg@amd.com>
Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Christian Brauner <brauner@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Klara Modin <klarasmodin@gmail.com>
Cc: Liam R. Howlett <Liam.Howlett@Oracle.com>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Minchan Kim <minchan@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Sourav Panda <souravpanda@google.com>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Will Deacon <will@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'kernel/fork.c')
-rw-r--r--  kernel/fork.c  13
1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index f1af413e5aa4..48a0038f606f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -463,12 +463,8 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
* will be reinitialized.
*/
data_race(memcpy(new, orig, sizeof(*new)));
- vma_lock_init(new);
+ vma_lock_init(new, true);
INIT_LIST_HEAD(&new->anon_vma_chain);
-#ifdef CONFIG_PER_VMA_LOCK
- /* vma is not locked, can't use vma_mark_detached() */
- new->detached = true;
-#endif
vma_numab_state_init(new);
dup_anon_vma_name(orig, new);
@@ -477,6 +473,8 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
void __vm_area_free(struct vm_area_struct *vma)
{
+ /* The vma should be detached while being destroyed. */
+ vma_assert_detached(vma);
vma_numab_state_free(vma);
free_anon_vma_name(vma);
kmem_cache_free(vm_area_cachep, vma);
@@ -488,8 +486,6 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
vm_rcu);
- /* The vma should not be locked while being destroyed. */
- VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock.lock), vma);
__vm_area_free(vma);
}
#endif
@@ -1234,6 +1230,9 @@ static void mmap_init_lock(struct mm_struct *mm)
{
init_rwsem(&mm->mmap_lock);
mm_lock_seqcount_init(mm);
+#ifdef CONFIG_PER_VMA_LOCK
+ rcuwait_init(&mm->vma_writer_wait);
+#endif
}
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
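The rcuwait_init() added to mmap_init_lock() above gives the single writer something to sleep on while readers drain. Continuing the same illustrative model (again not the kernel code), the write-lock side might look like the following; vma_model_write_lock() is an invented name, and the busy-wait loop is only a stand-in for sleeping on mm->vma_writer_wait with rcuwait_wait_event().

/*
 * Only one writer can get here at a time, because writers take
 * mmap_lock in write mode first (point 2 of the commit message).
 */
static void vma_model_write_lock(struct vma_model *m)
{
	/* Announce the writer by setting the top usable bit; readers
	 * arriving after this will see it and fail their trylock. */
	atomic_fetch_add_explicit(&m->vm_refcnt, VMA_LOCK_OFFSET,
				  memory_order_acquire);

	/* Wait for vm_refcnt to drop to VMA_LOCK_OFFSET + 1: writer
	 * bit set, vma attached, no readers (summary point 4).  The
	 * kernel sleeps on mm->vma_writer_wait instead of spinning. */
	while (atomic_load_explicit(&m->vm_refcnt, memory_order_acquire)
	       != VMA_LOCK_OFFSET + 1)
		;
}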