| From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> |
| Subject: mm/hugetlbfs: update hugetlbfs to use mmap_prepare |
| Date: Mon, 20 Oct 2025 13:11:29 +0100 |
| |
| Since mmap_prepare now allows actions to be performed after the VMA is |
| established, use desc->action.success_hook to set up the hugetlb VMA lock |
| once the VMA is in place. |
| |
| We also make changes throughout hugetlbfs to make this possible. |
| |
| Note that we must hide newly established hugetlb VMAs from the rmap until |
| the operation is entirely complete, as we establish a hugetlb lock during |
| VMA setup that rmap users could otherwise race against. |
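| |
| For illustration only (not part of this patch), a minimal sketch of the |
| pattern used here follows. The "foofs" names are hypothetical; the |
| desc->action fields and the success hook signature are the ones used in |
| the hugetlbfs conversion below: |
| |
|	/* Runs once the VMA has been inserted into the VMA tree. */ |
|	static int foofs_mmap_success(const struct vm_area_struct *vma) |
|	{ |
|		/* Per-VMA setup that must only happen on an established VMA. */ |
|		return 0; |
|	} |
| |
|	static int foofs_file_mmap_prepare(struct vm_area_desc *desc) |
|	{ |
|		/* Validate and adjust the mapping via the descriptor. */ |
|		desc->vm_flags |= VM_DONTEXPAND; |
|		desc->vm_ops = &foofs_vm_ops;	/* hypothetical vm_operations_struct */ |
| |
|		/* Defer the rest until the VMA is established... */ |
|		desc->action.success_hook = foofs_mmap_success; |
|		/* ...and keep rmap away until that hook has completed. */ |
|		desc->action.hide_from_rmap_until_complete = true; |
|		return 0; |
|	} |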
| |
| Link: https://lkml.kernel.org/r/b1afa16d3cfa585a03df9ae215ae9f905b3f0ed7.1760959442.git.lorenzo.stoakes@oracle.com |
| Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> |
| Reviewed-by: Jason Gunthorpe <jgg@nvidia.com> |
| Tested-by: Sumanth Korikkar <sumanthk@linux.ibm.com> |
| Cc: Alexander Gordeev <agordeev@linux.ibm.com> |
| Cc: Al Viro <viro@zeniv.linux.org.uk> |
| Cc: Andreas Larsson <andreas@gaisler.com> |
| Cc: Andrey Konovalov <andreyknvl@gmail.com> |
| Cc: Arnd Bergmann <arnd@arndb.de> |
| Cc: Baolin Wang <baolin.wang@linux.alibaba.com> |
| Cc: Baoquan He <bhe@redhat.com> |
| Cc: Chatre, Reinette <reinette.chatre@intel.com> |
| Cc: Christian Borntraeger <borntraeger@linux.ibm.com> |
| Cc: Christian Brauner <brauner@kernel.org> |
| Cc: Dan Williams <dan.j.williams@intel.com> |
| Cc: Dave Jiang <dave.jiang@intel.com> |
| Cc: Dave Martin <dave.martin@arm.com> |
| Cc: Dave Young <dyoung@redhat.com> |
| Cc: David Hildenbrand <david@redhat.com> |
| Cc: David S. Miller <davem@davemloft.net> |
| Cc: Dmitriy Vyukov <dvyukov@google.com> |
| Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| Cc: Guo Ren <guoren@kernel.org> |
| Cc: Heiko Carstens <hca@linux.ibm.com> |
| Cc: Hugh Dickins <hughd@google.com> |
| Cc: James Morse <james.morse@arm.com> |
| Cc: Jan Kara <jack@suse.cz> |
| Cc: Jann Horn <jannh@google.com> |
| Cc: Jonathan Corbet <corbet@lwn.net> |
| Cc: Kevin Tian <kevin.tian@intel.com> |
| Cc: Konstantin Komarov <almaz.alexandrovich@paragon-software.com> |
| Cc: Liam Howlett <liam.howlett@oracle.com> |
| Cc: "Luck, Tony" <tony.luck@intel.com> |
| Cc: Matthew Wilcox (Oracle) <willy@infradead.org> |
| Cc: Michal Hocko <mhocko@suse.com> |
| Cc: Mike Rapoport <rppt@kernel.org> |
| Cc: Muchun Song <muchun.song@linux.dev> |
| Cc: Nicolas Pitre <nico@fluxnic.net> |
| Cc: Oscar Salvador <osalvador@suse.de> |
| Cc: Pedro Falcato <pfalcato@suse.de> |
| Cc: Robin Murphy <robin.murphy@arm.com> |
| Cc: Suren Baghdasaryan <surenb@google.com> |
| Cc: Sven Schnelle <svens@linux.ibm.com> |
| Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de> |
| Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com> |
| Cc: Vasily Gorbik <gor@linux.ibm.com> |
| Cc: Vishal Verma <vishal.l.verma@intel.com> |
| Cc: Vivek Goyal <vgoyal@redhat.com> |
| Cc: Vlastimil Babka <vbabka@suse.cz> |
| Cc: Will Deacon <will@kernel.org> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| fs/hugetlbfs/inode.c | 46 +++++++++++++----- |
| include/linux/hugetlb.h | 9 ++- |
| include/linux/hugetlb_inline.h | 15 ++++-- |
| mm/hugetlb.c | 77 ++++++++++++++++++------------- |
| 4 files changed, 95 insertions(+), 52 deletions(-) |
| |
| --- a/fs/hugetlbfs/inode.c~mm-hugetlbfs-update-hugetlbfs-to-use-mmap_prepare |
| +++ a/fs/hugetlbfs/inode.c |
| @@ -96,8 +96,15 @@ static const struct fs_parameter_spec hu |
| #define PGOFF_LOFFT_MAX \ |
| (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1))) |
| |
| -static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) |
| +static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma) |
| { |
| + /* Unfortunately we have to reassign vma->vm_private_data. */ |
| + return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma); |
| +} |
| + |
| +static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) |
| +{ |
| + struct file *file = desc->file; |
| struct inode *inode = file_inode(file); |
| loff_t len, vma_len; |
| int ret; |
| @@ -112,8 +119,8 @@ static int hugetlbfs_file_mmap(struct fi |
| * way when do_mmap unwinds (may be important on powerpc |
| * and ia64). |
| */ |
| - vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); |
| - vma->vm_ops = &hugetlb_vm_ops; |
| + desc->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; |
| + desc->vm_ops = &hugetlb_vm_ops; |
| |
| /* |
| * page based offset in vm_pgoff could be sufficiently large to |
| @@ -122,16 +129,16 @@ static int hugetlbfs_file_mmap(struct fi |
| * sizeof(unsigned long). So, only check in those instances. |
| */ |
| if (sizeof(unsigned long) == sizeof(loff_t)) { |
| - if (vma->vm_pgoff & PGOFF_LOFFT_MAX) |
| + if (desc->pgoff & PGOFF_LOFFT_MAX) |
| return -EINVAL; |
| } |
| |
| /* must be huge page aligned */ |
| - if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) |
| + if (desc->pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) |
| return -EINVAL; |
| |
| - vma_len = (loff_t)(vma->vm_end - vma->vm_start); |
| - len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); |
| + vma_len = (loff_t)vma_desc_size(desc); |
| + len = vma_len + ((loff_t)desc->pgoff << PAGE_SHIFT); |
| /* check for overflow */ |
| if (len < vma_len) |
| return -EINVAL; |
| @@ -141,7 +148,7 @@ static int hugetlbfs_file_mmap(struct fi |
| |
| ret = -ENOMEM; |
| |
| - vm_flags = vma->vm_flags; |
| + vm_flags = desc->vm_flags; |
| /* |
| * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip |
| * reserving here. Note: only for SHM hugetlbfs file, the inode |
| @@ -151,17 +158,30 @@ static int hugetlbfs_file_mmap(struct fi |
| vm_flags |= VM_NORESERVE; |
| |
| if (hugetlb_reserve_pages(inode, |
| - vma->vm_pgoff >> huge_page_order(h), |
| - len >> huge_page_shift(h), vma, |
| - vm_flags) < 0) |
| + desc->pgoff >> huge_page_order(h), |
| + len >> huge_page_shift(h), desc, |
| + vm_flags) < 0) |
| goto out; |
| |
| ret = 0; |
| - if (vma->vm_flags & VM_WRITE && inode->i_size < len) |
| + if ((desc->vm_flags & VM_WRITE) && inode->i_size < len) |
| i_size_write(inode, len); |
| out: |
| inode_unlock(inode); |
| |
| + if (!ret) { |
| + /* Allocate the VMA lock after we set it up. */ |
| + desc->action.success_hook = hugetlb_file_mmap_prepare_success; |
| + /* |
| + * We cannot permit rmap to find this VMA in the window between |
| + * the VMA being inserted into the VMA tree and the |
| + * completion/success hook being invoked. |
| + * |
| + * This is because we establish a per-VMA hugetlb lock which can |
| + * be raced by rmap. |
| + */ |
| + desc->action.hide_from_rmap_until_complete = true; |
| + } |
| return ret; |
| } |
| |
| @@ -1220,7 +1240,7 @@ static void init_once(void *foo) |
| |
| static const struct file_operations hugetlbfs_file_operations = { |
| .read_iter = hugetlbfs_read_iter, |
| - .mmap = hugetlbfs_file_mmap, |
| + .mmap_prepare = hugetlbfs_file_mmap_prepare, |
| .fsync = noop_fsync, |
| .get_unmapped_area = hugetlb_get_unmapped_area, |
| .llseek = default_llseek, |
| --- a/include/linux/hugetlb.h~mm-hugetlbfs-update-hugetlbfs-to-use-mmap_prepare |
| +++ a/include/linux/hugetlb.h |
| @@ -150,8 +150,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_ |
| struct folio **foliop); |
| #endif /* CONFIG_USERFAULTFD */ |
| long hugetlb_reserve_pages(struct inode *inode, long from, long to, |
| - struct vm_area_struct *vma, |
| - vm_flags_t vm_flags); |
| + struct vm_area_desc *desc, vm_flags_t vm_flags); |
| long hugetlb_unreserve_pages(struct inode *inode, long start, long end, |
| long freed); |
| bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); |
| @@ -280,6 +279,7 @@ bool is_hugetlb_entry_hwpoisoned(pte_t p |
| void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); |
| void fixup_hugetlb_reservations(struct vm_area_struct *vma); |
| void hugetlb_split(struct vm_area_struct *vma, unsigned long addr); |
| +int hugetlb_vma_lock_alloc(struct vm_area_struct *vma); |
| |
| #else /* !CONFIG_HUGETLB_PAGE */ |
| |
| @@ -466,6 +466,11 @@ static inline void fixup_hugetlb_reserva |
| |
| static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {} |
| |
| +static inline int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) |
| +{ |
| + return 0; |
| +} |
| + |
| #endif /* !CONFIG_HUGETLB_PAGE */ |
| |
| #ifndef pgd_write |
| --- a/include/linux/hugetlb_inline.h~mm-hugetlbfs-update-hugetlbfs-to-use-mmap_prepare |
| +++ a/include/linux/hugetlb_inline.h |
| @@ -2,22 +2,27 @@ |
| #ifndef _LINUX_HUGETLB_INLINE_H |
| #define _LINUX_HUGETLB_INLINE_H |
| |
| -#ifdef CONFIG_HUGETLB_PAGE |
| - |
| #include <linux/mm.h> |
| |
| -static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) |
| +#ifdef CONFIG_HUGETLB_PAGE |
| + |
| +static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) |
| { |
| - return !!(vma->vm_flags & VM_HUGETLB); |
| + return !!(vm_flags & VM_HUGETLB); |
| } |
| |
| #else |
| |
| -static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) |
| +static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) |
| { |
| return false; |
| } |
| |
| #endif |
| |
| +static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) |
| +{ |
| + return is_vm_hugetlb_flags(vma->vm_flags); |
| +} |
| + |
| #endif |
| --- a/mm/hugetlb.c~mm-hugetlbfs-update-hugetlbfs-to-use-mmap_prepare |
| +++ a/mm/hugetlb.c |
| @@ -119,7 +119,6 @@ struct mutex *hugetlb_fault_mutex_table |
| /* Forward declaration */ |
| static int hugetlb_acct_memory(struct hstate *h, long delta); |
| static void hugetlb_vma_lock_free(struct vm_area_struct *vma); |
| -static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); |
| static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); |
| static void hugetlb_unshare_pmds(struct vm_area_struct *vma, |
| unsigned long start, unsigned long end, bool take_locks); |
| @@ -438,17 +437,21 @@ static void hugetlb_vma_lock_free(struct |
| } |
| } |
| |
| -static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) |
| +/* |
| + * vma specific semaphore used for pmd sharing and fault/truncation |
| + * synchronization |
| + */ |
| +int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) |
| { |
| struct hugetlb_vma_lock *vma_lock; |
| |
| /* Only establish in (flags) sharable vmas */ |
| if (!vma || !(vma->vm_flags & VM_MAYSHARE)) |
| - return; |
| + return 0; |
| |
| /* Should never get here with non-NULL vm_private_data */ |
| if (vma->vm_private_data) |
| - return; |
| + return -EINVAL; |
| |
| vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); |
| if (!vma_lock) { |
| @@ -463,13 +466,15 @@ static void hugetlb_vma_lock_alloc(struc |
| * allocation failure. |
| */ |
| pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); |
| - return; |
| + return -EINVAL; |
| } |
| |
| kref_init(&vma_lock->refs); |
| init_rwsem(&vma_lock->rw_sema); |
| vma_lock->vma = vma; |
| vma->vm_private_data = vma_lock; |
| + |
| + return 0; |
| } |
| |
| /* Helper that removes a struct file_region from the resv_map cache and returns |
| @@ -1201,20 +1206,28 @@ static struct resv_map *vma_resv_map(str |
| } |
| } |
| |
| -static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) |
| +static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) |
| { |
| - VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); |
| - VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); |
| + VM_WARN_ON_ONCE_VMA(!is_vm_hugetlb_page(vma), vma); |
| + VM_WARN_ON_ONCE_VMA(vma->vm_flags & VM_MAYSHARE, vma); |
| |
| - set_vma_private_data(vma, (unsigned long)map); |
| + set_vma_private_data(vma, get_vma_private_data(vma) | flags); |
| } |
| |
| -static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) |
| +static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map) |
| { |
| - VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); |
| - VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); |
| + VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); |
| + VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE); |
| |
| - set_vma_private_data(vma, get_vma_private_data(vma) | flags); |
| + desc->private_data = map; |
| +} |
| + |
| +static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags) |
| +{ |
| + VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); |
| + VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE); |
| + |
| + desc->private_data = (void *)((unsigned long)desc->private_data | flags); |
| } |
| |
| static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) |
| @@ -1224,6 +1237,13 @@ static int is_vma_resv_set(struct vm_are |
| return (get_vma_private_data(vma) & flag) != 0; |
| } |
| |
| +static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag) |
| +{ |
| + VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags)); |
| + |
| + return ((unsigned long)desc->private_data) & flag; |
| +} |
| + |
| bool __vma_private_lock(struct vm_area_struct *vma) |
| { |
| return !(vma->vm_flags & VM_MAYSHARE) && |
| @@ -7270,9 +7290,9 @@ long hugetlb_change_protection(struct vm |
| */ |
| |
| long hugetlb_reserve_pages(struct inode *inode, |
| - long from, long to, |
| - struct vm_area_struct *vma, |
| - vm_flags_t vm_flags) |
| + long from, long to, |
| + struct vm_area_desc *desc, |
| + vm_flags_t vm_flags) |
| { |
| long chg = -1, add = -1, spool_resv, gbl_resv; |
| struct hstate *h = hstate_inode(inode); |
| @@ -7288,12 +7308,6 @@ long hugetlb_reserve_pages(struct inode |
| } |
| |
| /* |
| - * vma specific semaphore used for pmd sharing and fault/truncation |
| - * synchronization |
| - */ |
| - hugetlb_vma_lock_alloc(vma); |
| - |
| - /* |
| * Only apply hugepage reservation if asked. At fault time, an |
| * attempt will be made for VM_NORESERVE to allocate a page |
| * without using reserves |
| @@ -7305,9 +7319,9 @@ long hugetlb_reserve_pages(struct inode |
| * Shared mappings base their reservation on the number of pages that |
| * are already allocated on behalf of the file. Private mappings need |
| * to reserve the full area even if read-only as mprotect() may be |
| - * called to make the mapping read-write. Assume !vma is a shm mapping |
| + * called to make the mapping read-write. Assume !desc is a shm mapping |
| */ |
| - if (!vma || vma->vm_flags & VM_MAYSHARE) { |
| + if (!desc || desc->vm_flags & VM_MAYSHARE) { |
| /* |
| * resv_map can not be NULL as hugetlb_reserve_pages is only |
| * called for inodes for which resv_maps were created (see |
| @@ -7324,8 +7338,8 @@ long hugetlb_reserve_pages(struct inode |
| |
| chg = to - from; |
| |
| - set_vma_resv_map(vma, resv_map); |
| - set_vma_resv_flags(vma, HPAGE_RESV_OWNER); |
| + set_vma_desc_resv_map(desc, resv_map); |
| + set_vma_desc_resv_flags(desc, HPAGE_RESV_OWNER); |
| } |
| |
| if (chg < 0) |
| @@ -7335,7 +7349,7 @@ long hugetlb_reserve_pages(struct inode |
| chg * pages_per_huge_page(h), &h_cg) < 0) |
| goto out_err; |
| |
| - if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) { |
| + if (desc && !(desc->vm_flags & VM_MAYSHARE) && h_cg) { |
| /* For private mappings, the hugetlb_cgroup uncharge info hangs |
| * of the resv_map. |
| */ |
| @@ -7369,7 +7383,7 @@ long hugetlb_reserve_pages(struct inode |
| * consumed reservations are stored in the map. Hence, nothing |
| * else has to be done for private mappings here |
| */ |
| - if (!vma || vma->vm_flags & VM_MAYSHARE) { |
| + if (!desc || desc->vm_flags & VM_MAYSHARE) { |
| add = region_add(resv_map, from, to, regions_needed, h, h_cg); |
| |
| if (unlikely(add < 0)) { |
| @@ -7423,16 +7437,15 @@ out_uncharge_cgroup: |
| hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), |
| chg * pages_per_huge_page(h), h_cg); |
| out_err: |
| - hugetlb_vma_lock_free(vma); |
| - if (!vma || vma->vm_flags & VM_MAYSHARE) |
| + if (!desc || desc->vm_flags & VM_MAYSHARE) |
| /* Only call region_abort if the region_chg succeeded but the |
| * region_add failed or didn't run. |
| */ |
| if (chg >= 0 && add < 0) |
| region_abort(resv_map, from, to, regions_needed); |
| - if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { |
| + if (desc && is_vma_desc_resv_set(desc, HPAGE_RESV_OWNER)) { |
| kref_put(&resv_map->refs, resv_map_release); |
| - set_vma_resv_map(vma, NULL); |
| + set_vma_desc_resv_map(desc, NULL); |
| } |
| return chg < 0 ? chg : add < 0 ? add : -EINVAL; |
| } |
| _ |