| From: Suren Baghdasaryan <surenb@google.com> |
| Subject: mm: replace vm_lock and detached flag with a reference count |
| Date: Thu, 13 Feb 2025 14:46:49 -0800 |
| |
| rw_semaphore is a sizable structure of 40 bytes and consumes considerable
| space for each vm_area_struct. However, vma_lock has two important
| specifics which allow replacing the rw_semaphore with a simpler
| structure:
| |
| 1. Readers never wait. They try to take the vma_lock and fall back to |
| mmap_lock if that fails. |
| |
| 2. Only one writer at a time will ever try to write-lock a vma_lock
| because writers first take mmap_lock in write mode.
| 
| Because of these requirements, full rw_semaphore functionality is not
| needed and we can replace the rw_semaphore and the vma->detached flag
| with a refcount (vm_refcnt).
| |
| When a vma is in the detached state, vm_refcnt is 0 and only a call to
| vma_mark_attached() can take it out of this state. Note that, unlike
| before, both vma_mark_attached() and vma_mark_detached() are now enforced
| to be done only after the vma has been write-locked. vma_mark_attached()
| changes vm_refcnt to 1 to indicate that the vma has been attached to the
| vma tree. When a reader takes the read lock, it increments vm_refcnt,
| unless the top usable bit of vm_refcnt (0x40000000) is set, indicating the
| presence of a writer. When a writer takes the write lock, it sets the top
| usable bit to indicate its presence. If there are readers, the writer
| waits on the newly introduced mm->vma_writer_wait. Since all writers take
| mmap_lock in write mode first, there can be only one writer at a time. The
| last reader to release the lock will signal the writer to wake up. The
| refcount might overflow if there are many competing readers, in which case
| read-locking will fail. Readers are expected to handle such failures.
| |
| In summary: |
| 1. all readers increment the vm_refcnt; |
| 2. writer sets top usable (writer) bit of vm_refcnt; |
| 3. readers cannot increment the vm_refcnt if the writer bit is set; |
| 4. in the presence of readers, writer must wait for the vm_refcnt to drop |
| to 1 (plus the VMA_LOCK_OFFSET writer bit), indicating an attached vma |
| with no readers; |
| 5. vm_refcnt overflow is handled by the readers. |
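| 
| For illustration, here is a minimal sketch of the scheme summarized above:
| the possible vm_refcnt values and the reader fast path. This is simplified
| pseudocode, not the helpers added by this patch, which use refcount_t with
| __refcount_inc_not_zero_limited_acquire() plus lockdep annotations and
| also distinguish a detached vma (-EAGAIN) from a write-locked one:
| 
|   #include <linux/atomic.h>
| 
|   #define VMA_LOCK_OFFSET	0x40000000		/* writer bit */
|   #define VMA_REF_LIMIT	(VMA_LOCK_OFFSET - 1)	/* reader limit */
| 
|   /*
|    * vm_refcnt encoding:
|    *   0                         vma detached
|    *   1                         attached, no readers
|    *   1 + N                     attached, N readers
|    *   VMA_LOCK_OFFSET + 1       attached, write-locked, no readers
|    *   VMA_LOCK_OFFSET + 1 + N   write-locked, N readers still draining
|    */
|   static bool vma_try_read_lock(atomic_t *vm_refcnt)
|   {
|   	int old = atomic_read(vm_refcnt);
| 
|   	do {
|   		/* Fail if detached, write-locked or about to overflow. */
|   		if (!old || old >= VMA_REF_LIMIT)
|   			return false;
|   	} while (!atomic_try_cmpxchg_acquire(vm_refcnt, &old, old + 1));
| 
|   	return true;
|   }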
| |
| While this vm_lock replacement does not yet result in a smaller |
| vm_area_struct (it stays at 256 bytes due to cacheline alignment), it |
| allows for further size optimization by structure member regrouping to |
| bring the size of vm_area_struct below 192 bytes. |
| |
| [surenb@google.com: fix a crash due to vma_end_read() that should have been removed] |
| Link: https://lkml.kernel.org/r/20250220200208.323769-1-surenb@google.com |
| Link: https://lkml.kernel.org/r/20250213224655.1680278-13-surenb@google.com |
| Signed-off-by: Suren Baghdasaryan <surenb@google.com> |
| Suggested-by: Peter Zijlstra <peterz@infradead.org> |
| Suggested-by: Matthew Wilcox <willy@infradead.org> |
| Tested-by: Shivank Garg <shivankg@amd.com> |
| Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com |
| Reviewed-by: Vlastimil Babka <vbabka@suse.cz> |
| Cc: Christian Brauner <brauner@kernel.org> |
| Cc: David Hildenbrand <david@redhat.com> |
| Cc: David Howells <dhowells@redhat.com> |
| Cc: Davidlohr Bueso <dave@stgolabs.net> |
| Cc: Hugh Dickins <hughd@google.com> |
| Cc: Jann Horn <jannh@google.com> |
| Cc: Johannes Weiner <hannes@cmpxchg.org> |
| Cc: Jonathan Corbet <corbet@lwn.net> |
| Cc: Klara Modin <klarasmodin@gmail.com> |
| Cc: Liam R. Howlett <Liam.Howlett@Oracle.com> |
| Cc: Lokesh Gidra <lokeshgidra@google.com> |
| Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> |
| Cc: Mateusz Guzik <mjguzik@gmail.com> |
| Cc: Mel Gorman <mgorman@techsingularity.net> |
| Cc: Michal Hocko <mhocko@suse.com> |
| Cc: Minchan Kim <minchan@google.com> |
| Cc: Oleg Nesterov <oleg@redhat.com> |
| Cc: Pasha Tatashin <pasha.tatashin@soleen.com> |
| Cc: "Paul E. McKenney" <paulmck@kernel.org>
| Cc: Peter Xu <peterx@redhat.com> |
| Cc: Shakeel Butt <shakeel.butt@linux.dev> |
| Cc: Sourav Panda <souravpanda@google.com> |
| Cc: Wei Yang <richard.weiyang@gmail.com> |
| Cc: Will Deacon <will@kernel.org> |
| Cc: Heiko Carstens <hca@linux.ibm.com> |
| Cc: Stephen Rothwell <sfr@canb.auug.org.au> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| include/linux/mm.h | 128 +++++++++++++++++++---------- |
| include/linux/mm_types.h | 22 ++-- |
| kernel/fork.c | 13 +- |
| mm/init-mm.c | 1 |
| mm/memory.c | 90 ++++++++++++++++++-- |
| tools/testing/vma/linux/atomic.h | 5 + |
| tools/testing/vma/vma_internal.h | 63 ++++++-------- |
| 7 files changed, 217 insertions(+), 105 deletions(-) |
| |
| --- a/include/linux/mm.h~mm-replace-vm_lock-and-detached-flag-with-a-reference-count |
| +++ a/include/linux/mm.h |
| @@ -32,6 +32,7 @@ |
| #include <linux/memremap.h> |
| #include <linux/slab.h> |
| #include <linux/cacheinfo.h> |
| +#include <linux/rcuwait.h> |
| |
| struct mempolicy; |
| struct anon_vma; |
| @@ -697,19 +698,54 @@ static inline void vma_numab_state_free( |
| #endif /* CONFIG_NUMA_BALANCING */ |
| |
| #ifdef CONFIG_PER_VMA_LOCK |
| -static inline void vma_lock_init(struct vm_area_struct *vma) |
| +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) |
| { |
| - init_rwsem(&vma->vm_lock.lock); |
| +#ifdef CONFIG_DEBUG_LOCK_ALLOC |
| + static struct lock_class_key lockdep_key; |
| + |
| + lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0); |
| +#endif |
| + if (reset_refcnt) |
| + refcount_set(&vma->vm_refcnt, 0); |
| vma->vm_lock_seq = UINT_MAX; |
| } |
| |
| +static inline bool is_vma_writer_only(int refcnt) |
| +{ |
| + /* |
| + * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma |
| + * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on |
| + * a detached vma happens only in vma_mark_detached() and is a rare |
| + * case, therefore most of the time there will be no unnecessary wakeup. |
| + */ |
| + return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1; |
| +} |
| + |
| +static inline void vma_refcount_put(struct vm_area_struct *vma) |
| +{ |
| + /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ |
| + struct mm_struct *mm = vma->vm_mm; |
| + int oldcnt; |
| + |
| + rwsem_release(&vma->vmlock_dep_map, _RET_IP_); |
| + if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) { |
| + |
| + if (is_vma_writer_only(oldcnt - 1)) |
| + rcuwait_wake_up(&mm->vma_writer_wait); |
| + } |
| +} |
| + |
| /* |
| * Try to read-lock a vma. The function is allowed to occasionally yield false |
| * locked result to avoid performance overhead, in which case we fall back to |
| * using mmap_lock. The function should never yield false unlocked result. |
| + * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got |
| + * detached. |
| */ |
| -static inline bool vma_start_read(struct vm_area_struct *vma) |
| +static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) |
| { |
| + int oldcnt; |
| + |
| /* |
| * Check before locking. A race might cause false locked result. |
| * We can use READ_ONCE() for the mm_lock_seq here, and don't need |
| @@ -718,15 +754,25 @@ static inline bool vma_start_read(struct |
| * need ordering is below. |
| */ |
| if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) |
| - return false; |
| + return NULL; |
| |
| - if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0)) |
| - return false; |
| + /* |
| + * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() |
| + * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. |
| + * Acquire fence is required here to avoid reordering against later |
| + * vm_lock_seq check and checks inside lock_vma_under_rcu(). |
| + */ |
| + if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, |
| + VMA_REF_LIMIT))) { |
| + /* return EAGAIN if vma got detached from under us */ |
| + return oldcnt ? NULL : ERR_PTR(-EAGAIN); |
| + } |
| |
| + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); |
| /* |
| - * Overflow might produce false locked result. |
| + * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. |
| * False unlocked result is impossible because we modify and check |
| - * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq |
| + * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq |
| * modification invalidates all existing locks. |
| * |
| * We must use ACQUIRE semantics for the mm_lock_seq so that if we are |
| @@ -735,10 +781,11 @@ static inline bool vma_start_read(struct |
| * This pairs with RELEASE semantics in vma_end_write_all(). |
| */ |
| if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { |
| - up_read(&vma->vm_lock.lock); |
| - return false; |
| + vma_refcount_put(vma); |
| + return NULL; |
| } |
| - return true; |
| + |
| + return vma; |
| } |
| |
| /* |
| @@ -749,8 +796,14 @@ static inline bool vma_start_read(struct |
| */ |
| static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) |
| { |
| + int oldcnt; |
| + |
| mmap_assert_locked(vma->vm_mm); |
| - down_read_nested(&vma->vm_lock.lock, subclass); |
| + if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, |
| + VMA_REF_LIMIT))) |
| + return false; |
| + |
| + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); |
| return true; |
| } |
| |
| @@ -762,16 +815,12 @@ static inline bool vma_start_read_locked |
| */ |
| static inline bool vma_start_read_locked(struct vm_area_struct *vma) |
| { |
| - mmap_assert_locked(vma->vm_mm); |
| - down_read(&vma->vm_lock.lock); |
| - return true; |
| + return vma_start_read_locked_nested(vma, 0); |
| } |
| |
| static inline void vma_end_read(struct vm_area_struct *vma) |
| { |
| - rcu_read_lock(); /* keeps vma alive till the end of up_read */ |
| - up_read(&vma->vm_lock.lock); |
| - rcu_read_unlock(); |
| + vma_refcount_put(vma); |
| } |
| |
| /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ |
| @@ -813,38 +862,35 @@ static inline void vma_assert_write_lock |
| |
| static inline void vma_assert_locked(struct vm_area_struct *vma) |
| { |
| - if (!rwsem_is_locked(&vma->vm_lock.lock)) |
| - vma_assert_write_locked(vma); |
| + unsigned int mm_lock_seq; |
| + |
| + VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 && |
| + !__is_vma_write_locked(vma, &mm_lock_seq), vma); |
| } |
| |
| +/* |
| + * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these |
| + * assertions should be made either under mmap_write_lock or when the object |
| + * has been isolated under mmap_write_lock, ensuring no competing writers. |
| + */ |
| static inline void vma_assert_attached(struct vm_area_struct *vma) |
| { |
| - WARN_ON_ONCE(vma->detached); |
| + WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt)); |
| } |
| |
| static inline void vma_assert_detached(struct vm_area_struct *vma) |
| { |
| - WARN_ON_ONCE(!vma->detached); |
| + WARN_ON_ONCE(refcount_read(&vma->vm_refcnt)); |
| } |
| |
| static inline void vma_mark_attached(struct vm_area_struct *vma) |
| { |
| - vma_assert_detached(vma); |
| - vma->detached = false; |
| -} |
| - |
| -static inline void vma_mark_detached(struct vm_area_struct *vma) |
| -{ |
| - /* When detaching vma should be write-locked */ |
| vma_assert_write_locked(vma); |
| - vma_assert_attached(vma); |
| - vma->detached = true; |
| + vma_assert_detached(vma); |
| + refcount_set(&vma->vm_refcnt, 1); |
| } |
| |
| -static inline bool is_vma_detached(struct vm_area_struct *vma) |
| -{ |
| - return vma->detached; |
| -} |
| +void vma_mark_detached(struct vm_area_struct *vma); |
| |
| static inline void release_fault_lock(struct vm_fault *vmf) |
| { |
| @@ -867,9 +913,9 @@ struct vm_area_struct *lock_vma_under_rc |
| |
| #else /* CONFIG_PER_VMA_LOCK */ |
| |
| -static inline void vma_lock_init(struct vm_area_struct *vma) {} |
| -static inline bool vma_start_read(struct vm_area_struct *vma) |
| - { return false; } |
| +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} |
| +static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) |
| + { return NULL; } |
| static inline void vma_end_read(struct vm_area_struct *vma) {} |
| static inline void vma_start_write(struct vm_area_struct *vma) {} |
| static inline void vma_assert_write_locked(struct vm_area_struct *vma) |
| @@ -910,12 +956,8 @@ static inline void vma_init(struct vm_ar |
| vma->vm_mm = mm; |
| vma->vm_ops = &vma_dummy_vm_ops; |
| INIT_LIST_HEAD(&vma->anon_vma_chain); |
| -#ifdef CONFIG_PER_VMA_LOCK |
| - /* vma is not locked, can't use vma_mark_detached() */ |
| - vma->detached = true; |
| -#endif |
| vma_numab_state_init(vma); |
| - vma_lock_init(vma); |
| + vma_lock_init(vma, false); |
| } |
| |
| /* Use when VMA is not part of the VMA tree and needs no locking */ |
| --- a/include/linux/mm_types.h~mm-replace-vm_lock-and-detached-flag-with-a-reference-count |
| +++ a/include/linux/mm_types.h |
| @@ -19,6 +19,7 @@ |
| #include <linux/workqueue.h> |
| #include <linux/seqlock.h> |
| #include <linux/percpu_counter.h> |
| +#include <linux/types.h> |
| |
| #include <asm/mmu.h> |
| |
| @@ -629,9 +630,8 @@ static inline struct anon_vma_name *anon |
| } |
| #endif |
| |
| -struct vma_lock { |
| - struct rw_semaphore lock; |
| -}; |
| +#define VMA_LOCK_OFFSET 0x40000000 |
| +#define VMA_REF_LIMIT (VMA_LOCK_OFFSET - 1) |
| |
| struct vma_numab_state { |
| /* |
| @@ -710,18 +710,12 @@ struct vm_area_struct { |
| |
| #ifdef CONFIG_PER_VMA_LOCK |
| /* |
| - * Flag to indicate areas detached from the mm->mm_mt tree. |
| - * Unstable RCU readers are allowed to read this. |
| - */ |
| - bool detached; |
| - |
| - /* |
| * Can only be written (using WRITE_ONCE()) while holding both: |
| * - mmap_lock (in write mode) |
| - * - vm_lock->lock (in write mode) |
| + * - vm_refcnt bit at VMA_LOCK_OFFSET is set |
| * Can be read reliably while holding one of: |
| * - mmap_lock (in read or write mode) |
| - * - vm_lock->lock (in read or write mode) |
| + * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1 |
| * Can be read unreliably (using READ_ONCE()) for pessimistic bailout |
| * while holding nothing (except RCU to keep the VMA struct allocated). |
| * |
| @@ -784,7 +778,10 @@ struct vm_area_struct { |
| struct vm_userfaultfd_ctx vm_userfaultfd_ctx; |
| #ifdef CONFIG_PER_VMA_LOCK |
| /* Unstable RCU readers are allowed to read this. */ |
| - struct vma_lock vm_lock ____cacheline_aligned_in_smp; |
| + refcount_t vm_refcnt ____cacheline_aligned_in_smp; |
| +#ifdef CONFIG_DEBUG_LOCK_ALLOC |
| + struct lockdep_map vmlock_dep_map; |
| +#endif |
| #endif |
| } __randomize_layout; |
| |
| @@ -920,6 +917,7 @@ struct mm_struct { |
| * by mmlist_lock |
| */ |
| #ifdef CONFIG_PER_VMA_LOCK |
| + struct rcuwait vma_writer_wait; |
| /* |
| * This field has lock-like semantics, meaning it is sometimes |
| * accessed with ACQUIRE/RELEASE semantics. |
| --- a/kernel/fork.c~mm-replace-vm_lock-and-detached-flag-with-a-reference-count |
| +++ a/kernel/fork.c |
| @@ -463,12 +463,8 @@ struct vm_area_struct *vm_area_dup(struc |
| * will be reinitialized. |
| */ |
| data_race(memcpy(new, orig, sizeof(*new))); |
| - vma_lock_init(new); |
| + vma_lock_init(new, true); |
| INIT_LIST_HEAD(&new->anon_vma_chain); |
| -#ifdef CONFIG_PER_VMA_LOCK |
| - /* vma is not locked, can't use vma_mark_detached() */ |
| - new->detached = true; |
| -#endif |
| vma_numab_state_init(new); |
| dup_anon_vma_name(orig, new); |
| |
| @@ -477,6 +473,8 @@ struct vm_area_struct *vm_area_dup(struc |
| |
| void __vm_area_free(struct vm_area_struct *vma) |
| { |
| + /* The vma should be detached while being destroyed. */ |
| + vma_assert_detached(vma); |
| vma_numab_state_free(vma); |
| free_anon_vma_name(vma); |
| kmem_cache_free(vm_area_cachep, vma); |
| @@ -488,8 +486,6 @@ static void vm_area_free_rcu_cb(struct r |
| struct vm_area_struct *vma = container_of(head, struct vm_area_struct, |
| vm_rcu); |
| |
| - /* The vma should not be locked while being destroyed. */ |
| - VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock.lock), vma); |
| __vm_area_free(vma); |
| } |
| #endif |
| @@ -1234,6 +1230,9 @@ static void mmap_init_lock(struct mm_str |
| { |
| init_rwsem(&mm->mmap_lock); |
| mm_lock_seqcount_init(mm); |
| +#ifdef CONFIG_PER_VMA_LOCK |
| + rcuwait_init(&mm->vma_writer_wait); |
| +#endif |
| } |
| |
| static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, |
| --- a/mm/init-mm.c~mm-replace-vm_lock-and-detached-flag-with-a-reference-count |
| +++ a/mm/init-mm.c |
| @@ -40,6 +40,7 @@ struct mm_struct init_mm = { |
| .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), |
| .mmlist = LIST_HEAD_INIT(init_mm.mmlist), |
| #ifdef CONFIG_PER_VMA_LOCK |
| + .vma_writer_wait = __RCUWAIT_INITIALIZER(init_mm.vma_writer_wait), |
| .mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq), |
| #endif |
| .user_ns = &init_user_ns, |
| --- a/mm/memory.c~mm-replace-vm_lock-and-detached-flag-with-a-reference-count |
| +++ a/mm/memory.c |
| @@ -6353,9 +6353,47 @@ fail: |
| #endif |
| |
| #ifdef CONFIG_PER_VMA_LOCK |
| +static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching) |
| +{ |
| + unsigned int tgt_refcnt = VMA_LOCK_OFFSET; |
| + |
| + /* Additional refcnt if the vma is attached. */ |
| + if (!detaching) |
| + tgt_refcnt++; |
| + |
| + /* |
| + * If vma is detached then only vma_mark_attached() can raise the |
| + * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). |
| + */ |
| + if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt)) |
| + return false; |
| + |
| + rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); |
| + rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, |
| + refcount_read(&vma->vm_refcnt) == tgt_refcnt, |
| + TASK_UNINTERRUPTIBLE); |
| + lock_acquired(&vma->vmlock_dep_map, _RET_IP_); |
| + |
| + return true; |
| +} |
| + |
| +static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) |
| +{ |
| + *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt); |
| + rwsem_release(&vma->vmlock_dep_map, _RET_IP_); |
| +} |
| + |
| void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) |
| { |
| - down_write(&vma->vm_lock.lock); |
| + bool locked; |
| + |
| + /* |
| + * __vma_enter_locked() returns false immediately if the vma is not |
| + * attached, otherwise it waits until refcnt is indicating that vma |
| + * is attached with no readers. |
| + */ |
| + locked = __vma_enter_locked(vma, false); |
| + |
| /* |
| * We should use WRITE_ONCE() here because we can have concurrent reads |
| * from the early lockless pessimistic check in vma_start_read(). |
| @@ -6363,10 +6401,40 @@ void __vma_start_write(struct vm_area_st |
| * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. |
| */ |
| WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); |
| - up_write(&vma->vm_lock.lock); |
| + |
| + if (locked) { |
| + bool detached; |
| + |
| + __vma_exit_locked(vma, &detached); |
| + WARN_ON_ONCE(detached); /* vma should remain attached */ |
| + } |
| } |
| EXPORT_SYMBOL_GPL(__vma_start_write); |
| |
| +void vma_mark_detached(struct vm_area_struct *vma) |
| +{ |
| + vma_assert_write_locked(vma); |
| + vma_assert_attached(vma); |
| + |
| + /* |
| + * We are the only writer, so no need to use vma_refcount_put(). |
| + * The condition below is unlikely because the vma has been already |
| + * write-locked and readers can increment vm_refcnt only temporarily |
| + * before they check vm_lock_seq, realize the vma is locked and drop |
| + * back the vm_refcnt. That is a narrow window for observing a raised |
| + * vm_refcnt. |
| + */ |
| + if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { |
| + /* Wait until vma is detached with no readers. */ |
| + if (__vma_enter_locked(vma, true)) { |
| + bool detached; |
| + |
| + __vma_exit_locked(vma, &detached); |
| + WARN_ON_ONCE(!detached); |
| + } |
| + } |
| +} |
| + |
| /* |
| * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be |
| * stable and not isolated. If the VMA is not found or is being modified the |
| @@ -6384,15 +6452,17 @@ retry: |
| if (!vma) |
| goto inval; |
| |
| - if (!vma_start_read(vma)) |
| - goto inval; |
| + vma = vma_start_read(vma); |
| + if (IS_ERR_OR_NULL(vma)) { |
| + /* Check if the VMA got isolated after we found it */ |
| + if (PTR_ERR(vma) == -EAGAIN) { |
| + count_vm_vma_lock_event(VMA_LOCK_MISS); |
| + /* The area was replaced with another one */ |
| + goto retry; |
| + } |
| |
| - /* Check if the VMA got isolated after we found it */ |
| - if (is_vma_detached(vma)) { |
| - vma_end_read(vma); |
| - count_vm_vma_lock_event(VMA_LOCK_MISS); |
| - /* The area was replaced with another one */ |
| - goto retry; |
| + /* Failed to lock the VMA */ |
| + goto inval; |
| } |
| /* |
| * At this point, we have a stable reference to a VMA: The VMA is |
| --- a/tools/testing/vma/linux/atomic.h~mm-replace-vm_lock-and-detached-flag-with-a-reference-count |
| +++ a/tools/testing/vma/linux/atomic.h |
| @@ -9,4 +9,9 @@ |
| #define atomic_set(x, y) uatomic_set(x, y) |
| #define U8_MAX UCHAR_MAX |
| |
| +#ifndef atomic_cmpxchg_relaxed |
| +#define atomic_cmpxchg_relaxed uatomic_cmpxchg |
| +#define atomic_cmpxchg_release uatomic_cmpxchg |
| +#endif /* atomic_cmpxchg_relaxed */ |
| + |
| #endif /* _LINUX_ATOMIC_H */ |
| --- a/tools/testing/vma/vma_internal.h~mm-replace-vm_lock-and-detached-flag-with-a-reference-count |
| +++ a/tools/testing/vma/vma_internal.h |
| @@ -25,7 +25,7 @@ |
| #include <linux/maple_tree.h> |
| #include <linux/mm.h> |
| #include <linux/rbtree.h> |
| -#include <linux/rwsem.h> |
| +#include <linux/refcount.h> |
| |
| extern unsigned long stack_guard_gap; |
| #ifdef CONFIG_MMU |
| @@ -135,10 +135,6 @@ typedef __bitwise unsigned int vm_fault_ |
| */ |
| #define pr_warn_once pr_err |
| |
| -typedef struct refcount_struct { |
| - atomic_t refs; |
| -} refcount_t; |
| - |
| struct kref { |
| refcount_t refcount; |
| }; |
| @@ -233,15 +229,12 @@ struct mm_struct { |
| unsigned long flags; /* Must use atomic bitops to access */ |
| }; |
| |
| -struct vma_lock { |
| - struct rw_semaphore lock; |
| -}; |
| - |
| - |
| struct file { |
| struct address_space *f_mapping; |
| }; |
| |
| +#define VMA_LOCK_OFFSET 0x40000000 |
| + |
| struct vm_area_struct { |
| /* The first cache line has the info for VMA tree walking. */ |
| |
| @@ -269,16 +262,13 @@ struct vm_area_struct { |
| }; |
| |
| #ifdef CONFIG_PER_VMA_LOCK |
| - /* Flag to indicate areas detached from the mm->mm_mt tree */ |
| - bool detached; |
| - |
| /* |
| * Can only be written (using WRITE_ONCE()) while holding both: |
| * - mmap_lock (in write mode) |
| - * - vm_lock.lock (in write mode) |
| + * - vm_refcnt bit at VMA_LOCK_OFFSET is set |
| * Can be read reliably while holding one of: |
| * - mmap_lock (in read or write mode) |
| - * - vm_lock.lock (in read or write mode) |
| + * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1 |
| * Can be read unreliably (using READ_ONCE()) for pessimistic bailout |
| * while holding nothing (except RCU to keep the VMA struct allocated). |
| * |
| @@ -287,7 +277,6 @@ struct vm_area_struct { |
| * slowpath. |
| */ |
| unsigned int vm_lock_seq; |
| - struct vma_lock vm_lock; |
| #endif |
| |
| /* |
| @@ -340,6 +329,10 @@ struct vm_area_struct { |
| struct vma_numab_state *numab_state; /* NUMA Balancing state */ |
| #endif |
| struct vm_userfaultfd_ctx vm_userfaultfd_ctx; |
| +#ifdef CONFIG_PER_VMA_LOCK |
| + /* Unstable RCU readers are allowed to read this. */ |
| + refcount_t vm_refcnt; |
| +#endif |
| } __randomize_layout; |
| |
| struct vm_fault {}; |
| @@ -464,33 +457,40 @@ static inline struct vm_area_struct *vma |
| return mas_find(&vmi->mas, ULONG_MAX); |
| } |
| |
| -static inline void vma_lock_init(struct vm_area_struct *vma) |
| -{ |
| - init_rwsem(&vma->vm_lock.lock); |
| - vma->vm_lock_seq = UINT_MAX; |
| -} |
| - |
| +/* |
| + * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these |
| + * assertions should be made either under mmap_write_lock or when the object |
| + * has been isolated under mmap_write_lock, ensuring no competing writers. |
| + */ |
| static inline void vma_assert_attached(struct vm_area_struct *vma) |
| { |
| - WARN_ON_ONCE(vma->detached); |
| + WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt)); |
| } |
| |
| static inline void vma_assert_detached(struct vm_area_struct *vma) |
| { |
| - WARN_ON_ONCE(!vma->detached); |
| + WARN_ON_ONCE(refcount_read(&vma->vm_refcnt)); |
| } |
| |
| static inline void vma_assert_write_locked(struct vm_area_struct *); |
| static inline void vma_mark_attached(struct vm_area_struct *vma) |
| { |
| - vma->detached = false; |
| + vma_assert_write_locked(vma); |
| + vma_assert_detached(vma); |
| + refcount_set(&vma->vm_refcnt, 1); |
| } |
| |
| static inline void vma_mark_detached(struct vm_area_struct *vma) |
| { |
| - /* When detaching vma should be write-locked */ |
| vma_assert_write_locked(vma); |
| - vma->detached = true; |
| + vma_assert_attached(vma); |
| + /* We are the only writer, so no need to use vma_refcount_put(). */ |
| + if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { |
| + /* |
| + * Reader must have temporarily raised vm_refcnt but it will |
| + * drop it without using the vma since vma is write-locked. |
| + */ |
| + } |
| } |
| |
| extern const struct vm_operations_struct vma_dummy_vm_ops; |
| @@ -503,9 +503,7 @@ static inline void vma_init(struct vm_ar |
| vma->vm_mm = mm; |
| vma->vm_ops = &vma_dummy_vm_ops; |
| INIT_LIST_HEAD(&vma->anon_vma_chain); |
| - /* vma is not locked, can't use vma_mark_detached() */ |
| - vma->detached = true; |
| - vma_lock_init(vma); |
| + vma->vm_lock_seq = UINT_MAX; |
| } |
| |
| static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) |
| @@ -528,10 +526,9 @@ static inline struct vm_area_struct *vm_ |
| return NULL; |
| |
| memcpy(new, orig, sizeof(*new)); |
| - vma_lock_init(new); |
| + refcount_set(&new->vm_refcnt, 0); |
| + new->vm_lock_seq = UINT_MAX; |
| INIT_LIST_HEAD(&new->anon_vma_chain); |
| - /* vma is not locked, can't use vma_mark_detached() */ |
| - new->detached = true; |
| |
| return new; |
| } |
| _ |