| From: Balbir Singh <balbirs@nvidia.com> |
| Subject: mm/memory/fault: add THP fault handling for zone device private pages |
| Date: Wed, 1 Oct 2025 16:56:59 +1000 |
| |
| Implement CPU fault handling for zone device THP entries through |
| do_huge_pmd_device_private(), enabling transparent migration of |
| device-private large pages back to system memory on CPU access. |
| |
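| When the CPU accesses a zone device THP entry, the fault handler calls the |
| device driver's migrate_to_ram() callback to migrate the entire large page |
| back to system memory. |
| |
| Because a swap PMD seen by the fault path in mm/memory.c can now be a |
| device-private entry rather than a migration entry, the VM_BUG_ON() that |
| asserted otherwise is removed. |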
| |
| Link: https://lkml.kernel.org/r/20251001065707.920170-9-balbirs@nvidia.com |
| Signed-off-by: Balbir Singh <balbirs@nvidia.com> |
| Cc: David Hildenbrand <david@redhat.com> |
| Cc: Zi Yan <ziy@nvidia.com> |
| Cc: Joshua Hahn <joshua.hahnjy@gmail.com> |
| Cc: Rakie Kim <rakie.kim@sk.com> |
| Cc: Byungchul Park <byungchul@sk.com> |
| Cc: Gregory Price <gourry@gourry.net> |
| Cc: Ying Huang <ying.huang@linux.alibaba.com> |
| Cc: Alistair Popple <apopple@nvidia.com> |
| Cc: Oscar Salvador <osalvador@suse.de> |
| Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> |
| Cc: Baolin Wang <baolin.wang@linux.alibaba.com> |
| Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com> |
| Cc: Nico Pache <npache@redhat.com> |
| Cc: Ryan Roberts <ryan.roberts@arm.com> |
| Cc: Dev Jain <dev.jain@arm.com> |
| Cc: Barry Song <baohua@kernel.org> |
| Cc: Lyude Paul <lyude@redhat.com> |
| Cc: Danilo Krummrich <dakr@kernel.org> |
| Cc: David Airlie <airlied@gmail.com> |
| Cc: Simona Vetter <simona@ffwll.ch> |
| Cc: Ralph Campbell <rcampbell@nvidia.com> |
| Cc: Mika Penttilä <mpenttil@redhat.com> |
| Cc: Matthew Brost <matthew.brost@intel.com> |
| Cc: Francois Dugast <francois.dugast@intel.com> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| include/linux/huge_mm.h | 7 +++++++ |
| mm/huge_memory.c | 38 ++++++++++++++++++++++++++++++++++++++ |
| mm/memory.c | 5 +++-- |
| 3 files changed, 48 insertions(+), 2 deletions(-) |
| |
| --- a/include/linux/huge_mm.h~mm-memory-fault-add-thp-fault-handling-for-zone-device-private-pages |
| +++ a/include/linux/huge_mm.h |
| @@ -481,6 +481,8 @@ static inline bool folio_test_pmd_mappab |
| |
| vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf); |
| |
| +vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf); |
| + |
| extern struct folio *huge_zero_folio; |
| extern unsigned long huge_zero_pfn; |
| |
| @@ -661,6 +663,11 @@ static inline vm_fault_t do_huge_pmd_num |
| { |
| return 0; |
| } |
| + |
| +static inline vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf) |
| +{ |
| + return 0; /* No-op stub. NOTE(review): presumably the !THP variant, as with the neighbouring do_huge_pmd_numa_page() stub - confirm. */ |
| +} |
| |
| static inline bool is_huge_zero_folio(const struct folio *folio) |
| { |
| --- a/mm/huge_memory.c~mm-memory-fault-add-thp-fault-handling-for-zone-device-private-pages |
| +++ a/mm/huge_memory.c |
| @@ -1287,6 +1287,44 @@ release: |
| |
| } |
| |
| +vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf) |
| +{ /* CPU fault on a device-private PMD entry: hand it to the owning driver's migrate_to_ram() callback. */ |
| + struct vm_area_struct *vma = vmf->vma; |
| + vm_fault_t ret = 0; /* stays 0 unless migrate_to_ram() is actually called */ |
| + spinlock_t *ptl; |
| + swp_entry_t swp_entry; |
| + struct page *page; |
| + struct folio *folio; |
| + |
| + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { /* NOTE(review): presumably the callback must not run under the per-VMA lock - confirm */ |
| + vma_end_read(vma); |
| + return VM_FAULT_RETRY; /* VMA read lock released above; caller is asked to retry */ |
| + } |
| + |
| + ptl = pmd_lock(vma->vm_mm, vmf->pmd); |
| + if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd))) { /* PMD changed since orig_pmd was sampled: do nothing */ |
| + spin_unlock(ptl); |
| + return 0; |
| + } |
| + |
| + swp_entry = pmd_to_swp_entry(vmf->orig_pmd); /* decode the (re-validated) PMD into its swap entry */ |
| + page = pfn_swap_entry_to_page(swp_entry); |
| + folio = page_folio(page); |
| + vmf->page = page; /* set before vmf is passed to migrate_to_ram() below */ |
| + vmf->pte = NULL; /* NOTE(review): presumably because the fault is handled at PMD level - confirm */ |
| + if (folio_trylock(folio)) { /* trylock only: on contention fall through to the else branch and return 0 */ |
| + folio_get(folio); /* reference held across the callback, dropped by folio_put() below */ |
| + spin_unlock(ptl); /* PMD lock is released before calling into the driver */ |
| + ret = page_pgmap(page)->ops->migrate_to_ram(vmf); |
| + folio_unlock(folio); |
| + folio_put(folio); |
| + } else { |
| + spin_unlock(ptl); |
| + } |
| + |
| + return ret; |
| +} |
| + |
| /* |
| * always: directly stall for all thp allocations |
| * defer: wake kswapd and fail if not immediately available |
| --- a/mm/memory.c~mm-memory-fault-add-thp-fault-handling-for-zone-device-private-pages |
| +++ a/mm/memory.c |
| @@ -6293,8 +6293,9 @@ retry_pud: |
| vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); |
| |
| if (unlikely(is_swap_pmd(vmf.orig_pmd))) { |
| - VM_BUG_ON(thp_migration_supported() && |
| - !is_pmd_migration_entry(vmf.orig_pmd)); |
| + if (is_pmd_device_private_entry(vmf.orig_pmd)) |
| + return do_huge_pmd_device_private(&vmf); |
| + |
| if (is_pmd_migration_entry(vmf.orig_pmd)) |
| pmd_migration_entry_wait(mm, vmf.pmd); |
| return 0; |
| _ |