| From: Alistair Popple <apopple@nvidia.com> |
| Subject: fs/dax: properly refcount fs dax pages |
| Date: Fri, 28 Feb 2025 14:31:14 +1100 |
| |
| Currently FS DAX pages are considered free when their refcount drops to |
| one, and their refcounts are not increased when they are mapped via PTEs |
| or decreased when they are unmapped. This requires special logic in mm |
| paths to detect that these pages are not refcounted normally, and to |
| treat a refcount of one, rather than zero, as the point at which a page |
| becomes free. |
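| |
| As a concrete illustration of the old convention, a hedged sketch is |
| given below; the _old suffix is hypothetical and the snippet simply |
| mirrors the dax_page_is_idle() hunk in include/linux/dax.h further down: |
| |
| /* |
|  * Old convention: an FS DAX page with no users other than its |
|  * pagecache entry still reported a refcount of one, so "idle" had |
|  * to be detected as a refcount of one rather than zero. |
|  */ |
| static inline bool dax_page_is_idle_old(struct page *page) |
| { |
|         return page && page_ref_count(page) == 1; |
| } |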
| |
| On the other hand, get_user_pages() and friends do refcount FS DAX pages |
| properly, taking a reference when a page is pinned and dropping it when |
| the page is unpinned. |
| |
| Tracking this special behaviour requires extra PTE bits (e.g. pte_devmap) |
| and introduces rules that are potentially confusing and specific to FS |
| DAX pages. To fix this, and to possibly allow removal of the special PTE |
| bits in future, convert FS DAX page refcounts to be zero-based and |
| instead take a reference on the page each time it is mapped, as is |
| already the case for normal pages. |
| |
| This may also allow a future clean-up to remove the pgmap refcounting that |
| is currently done in mm/gup.c. |
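| |
| With this change the check becomes zero-based and each mapping holds a |
| reference; again only a sketch with a hypothetical _new suffix, the real |
| code being in the include/linux/dax.h and fs/dax.c hunks below: |
| |
| /* |
|  * New convention: refcounts are zero-based, as for normal pages, so |
|  * an idle FS DAX page has a refcount of zero. |
|  */ |
| static inline bool dax_page_is_idle_new(struct page *page) |
| { |
|         return page && page_ref_count(page) == 0; |
| } |
| |
| The reference for each new mapping is taken when the page is inserted at |
| fault time, via vmf_insert_page_mkwrite() or vmf_insert_folio_pmd() as |
| used in dax_fault_iter(). |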
| |
| Link: https://lkml.kernel.org/r/c7d886ad7468a20452ef6e0ddab6cfe220874e7c.1740713401.git-series.apopple@nvidia.com |
| Signed-off-by: Alistair Popple <apopple@nvidia.com> |
| Reviewed-by: Dan Williams <dan.j.williams@intel.com> |
| Tested-by: Alison Schofield <alison.schofield@intel.com> |
| Acked-by: David Hildenbrand <david@redhat.com> |
| Cc: Alexander Gordeev <agordeev@linux.ibm.com> |
| Cc: Asahi Lina <lina@asahilina.net> |
| Cc: Balbir Singh <balbirs@nvidia.com> |
| Cc: Bjorn Helgaas <bhelgaas@google.com> |
| Cc: Catalin Marinas <catalin.marinas@arm.com> |
| Cc: Christian Borntraeger <borntraeger@linux.ibm.com> |
| Cc: Christoph Hellwig <hch@lst.de> |
| Cc: Chunyan Zhang <zhang.lyra@gmail.com> |
| Cc: "Darrick J. Wong" <djwong@kernel.org> |
| Cc: Dave Chinner <david@fromorbit.com> |
| Cc: Dave Hansen <dave.hansen@linux.intel.com> |
| Cc: Dave Jiang <dave.jiang@intel.com> |
| Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com> |
| Cc: Heiko Carstens <hca@linux.ibm.com> |
| Cc: Huacai Chen <chenhuacai@kernel.org> |
| Cc: Ira Weiny <ira.weiny@intel.com> |
| Cc: Jan Kara <jack@suse.cz> |
| Cc: Jason Gunthorpe <jgg@nvidia.com> |
| Cc: Jason Gunthorpe <jgg@ziepe.ca> |
| Cc: John Hubbard <jhubbard@nvidia.com> |
| Cc: linmiaohe <linmiaohe@huawei.com> |
| Cc: Logan Gunthorpe <logang@deltatee.com> |
| Cc: Matthew Wilcox (Oracle) <willy@infradead.org> |
| Cc: Michael "Camp Drill Sergeant" Ellerman <mpe@ellerman.id.au> |
| Cc: Nicholas Piggin <npiggin@gmail.com> |
| Cc: Peter Xu <peterx@redhat.com> |
| Cc: Sven Schnelle <svens@linux.ibm.com> |
| Cc: Ted Ts'o <tytso@mit.edu> |
| Cc: Vasily Gorbik <gor@linux.ibm.com> |
| Cc: Vishal Verma <vishal.l.verma@intel.com> |
| Cc: Vivek Goyal <vgoyal@redhat.com> |
| Cc: WANG Xuerui <kernel@xen0n.name> |
| Cc: Will Deacon <will@kernel.org> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| drivers/nvdimm/pmem.c | 4 |
| fs/dax.c | 188 ++++++++++++++++++++++--------------- |
| fs/fuse/virtio_fs.c | 3 |
| include/linux/dax.h | 2 |
| include/linux/mm.h | 27 ----- |
| include/linux/mm_types.h | 7 + |
| mm/gup.c | 9 - |
| mm/huge_memory.c | 6 - |
| mm/internal.h | 2 |
| mm/memory-failure.c | 6 - |
| mm/memory.c | 6 - |
| mm/memremap.c | 47 ++++----- |
| mm/mm_init.c | 9 - |
| mm/swap.c | 2 |
| 14 files changed, 166 insertions(+), 152 deletions(-) |
| |
| --- a/drivers/nvdimm/pmem.c~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/drivers/nvdimm/pmem.c |
| @@ -513,7 +513,7 @@ static int pmem_attach_disk(struct devic |
| |
| pmem->disk = disk; |
| pmem->pgmap.owner = pmem; |
| - pmem->pfn_flags = PFN_DEV; |
| + pmem->pfn_flags = 0; |
| if (is_nd_pfn(dev)) { |
| pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; |
| pmem->pgmap.ops = &fsdax_pagemap_ops; |
| @@ -522,7 +522,6 @@ static int pmem_attach_disk(struct devic |
| pmem->data_offset = le64_to_cpu(pfn_sb->dataoff); |
| pmem->pfn_pad = resource_size(res) - |
| range_len(&pmem->pgmap.range); |
| - pmem->pfn_flags |= PFN_MAP; |
| bb_range = pmem->pgmap.range; |
| bb_range.start += pmem->data_offset; |
| } else if (pmem_should_map_pages(dev)) { |
| @@ -532,7 +531,6 @@ static int pmem_attach_disk(struct devic |
| pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; |
| pmem->pgmap.ops = &fsdax_pagemap_ops; |
| addr = devm_memremap_pages(dev, &pmem->pgmap); |
| - pmem->pfn_flags |= PFN_MAP; |
| bb_range = pmem->pgmap.range; |
| } else { |
| addr = devm_memremap(dev, pmem->phys_addr, |
| --- a/fs/dax.c~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/fs/dax.c |
| @@ -71,6 +71,11 @@ static unsigned long dax_to_pfn(void *en |
| return xa_to_value(entry) >> DAX_SHIFT; |
| } |
| |
| +static struct folio *dax_to_folio(void *entry) |
| +{ |
| + return page_folio(pfn_to_page(dax_to_pfn(entry))); |
| +} |
| + |
| static void *dax_make_entry(pfn_t pfn, unsigned long flags) |
| { |
| return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT)); |
| @@ -338,19 +343,6 @@ static unsigned long dax_entry_size(void |
| return PAGE_SIZE; |
| } |
| |
| -static unsigned long dax_end_pfn(void *entry) |
| -{ |
| - return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; |
| -} |
| - |
| -/* |
| - * Iterate through all mapped pfns represented by an entry, i.e. skip |
| - * 'empty' and 'zero' entries. |
| - */ |
| -#define for_each_mapped_pfn(entry, pfn) \ |
| - for (pfn = dax_to_pfn(entry); \ |
| - pfn < dax_end_pfn(entry); pfn++) |
| - |
| /* |
| * A DAX folio is considered shared if it has no mapping set and ->share (which |
| * shares the ->index field) is non-zero. Note this may return false even if the |
| @@ -359,7 +351,7 @@ static unsigned long dax_end_pfn(void *e |
| */ |
| static inline bool dax_folio_is_shared(struct folio *folio) |
| { |
| - return !folio->mapping && folio->page.share; |
| + return !folio->mapping && folio->share; |
| } |
| |
| /* |
| @@ -384,75 +376,117 @@ static void dax_folio_make_shared(struct |
| * folio has previously been mapped into one address space so set the |
| * share count. |
| */ |
| - folio->page.share = 1; |
| + folio->share = 1; |
| } |
| |
| -static inline unsigned long dax_folio_share_put(struct folio *folio) |
| +static inline unsigned long dax_folio_put(struct folio *folio) |
| { |
| - return --folio->page.share; |
| + unsigned long ref; |
| + int order, i; |
| + |
| + if (!dax_folio_is_shared(folio)) |
| + ref = 0; |
| + else |
| + ref = --folio->share; |
| + |
| + if (ref) |
| + return ref; |
| + |
| + folio->mapping = NULL; |
| + order = folio_order(folio); |
| + if (!order) |
| + return 0; |
| + |
| + for (i = 0; i < (1UL << order); i++) { |
| + struct dev_pagemap *pgmap = page_pgmap(&folio->page); |
| + struct page *page = folio_page(folio, i); |
| + struct folio *new_folio = (struct folio *)page; |
| + |
| + ClearPageHead(page); |
| + clear_compound_head(page); |
| + |
| + new_folio->mapping = NULL; |
| + /* |
| + * Reset pgmap, which was overwritten by |
| + * prep_compound_page(). |
| + */ |
| + new_folio->pgmap = pgmap; |
| + new_folio->share = 0; |
| + WARN_ON_ONCE(folio_ref_count(new_folio)); |
| + } |
| + |
| + return ref; |
| +} |
| + |
| +static void dax_folio_init(void *entry) |
| +{ |
| + struct folio *folio = dax_to_folio(entry); |
| + int order = dax_entry_order(entry); |
| + |
| + /* |
| + * The folio should have been split back to order-0 pages in |
| + * dax_folio_put() when it was removed from its final |
| + * mapping. |
| + */ |
| + WARN_ON_ONCE(folio_order(folio)); |
| + |
| + if (order > 0) { |
| + prep_compound_page(&folio->page, order); |
| + if (order > 1) |
| + INIT_LIST_HEAD(&folio->_deferred_list); |
| + WARN_ON_ONCE(folio_ref_count(folio)); |
| + } |
| } |
| |
| static void dax_associate_entry(void *entry, struct address_space *mapping, |
| - struct vm_area_struct *vma, unsigned long address, bool shared) |
| + struct vm_area_struct *vma, |
| + unsigned long address, bool shared) |
| { |
| - unsigned long size = dax_entry_size(entry), pfn, index; |
| - int i = 0; |
| + unsigned long size = dax_entry_size(entry), index; |
| + struct folio *folio = dax_to_folio(entry); |
| |
| if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) |
| return; |
| |
| index = linear_page_index(vma, address & ~(size - 1)); |
| - for_each_mapped_pfn(entry, pfn) { |
| - struct folio *folio = pfn_folio(pfn); |
| - |
| - if (shared && (folio->mapping || folio->page.share)) { |
| - if (folio->mapping) |
| - dax_folio_make_shared(folio); |
| - |
| - WARN_ON_ONCE(!folio->page.share); |
| - folio->page.share++; |
| - } else { |
| - WARN_ON_ONCE(folio->mapping); |
| - folio->mapping = mapping; |
| - folio->index = index + i++; |
| - } |
| + if (shared && (folio->mapping || dax_folio_is_shared(folio))) { |
| + if (folio->mapping) |
| + dax_folio_make_shared(folio); |
| + |
| + WARN_ON_ONCE(!folio->share); |
| + WARN_ON_ONCE(dax_entry_order(entry) != folio_order(folio)); |
| + folio->share++; |
| + } else { |
| + WARN_ON_ONCE(folio->mapping); |
| + dax_folio_init(entry); |
| + folio = dax_to_folio(entry); |
| + folio->mapping = mapping; |
| + folio->index = index; |
| } |
| } |
| |
| static void dax_disassociate_entry(void *entry, struct address_space *mapping, |
| - bool trunc) |
| + bool trunc) |
| { |
| - unsigned long pfn; |
| + struct folio *folio = dax_to_folio(entry); |
| |
| if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) |
| return; |
| |
| - for_each_mapped_pfn(entry, pfn) { |
| - struct folio *folio = pfn_folio(pfn); |
| - |
| - WARN_ON_ONCE(trunc && folio_ref_count(folio) > 1); |
| - if (dax_folio_is_shared(folio)) { |
| - /* keep the shared flag if this page is still shared */ |
| - if (dax_folio_share_put(folio) > 0) |
| - continue; |
| - } else |
| - WARN_ON_ONCE(folio->mapping && folio->mapping != mapping); |
| - folio->mapping = NULL; |
| - folio->index = 0; |
| - } |
| + dax_folio_put(folio); |
| } |
| |
| static struct page *dax_busy_page(void *entry) |
| { |
| - unsigned long pfn; |
| + struct folio *folio = dax_to_folio(entry); |
| |
| - for_each_mapped_pfn(entry, pfn) { |
| - struct page *page = pfn_to_page(pfn); |
| + if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) |
| + return NULL; |
| |
| - if (page_ref_count(page) > 1) |
| - return page; |
| - } |
| - return NULL; |
| + if (folio_ref_count(folio) - folio_mapcount(folio)) |
| + return &folio->page; |
| + else |
| + return NULL; |
| } |
| |
| /** |
| @@ -785,7 +819,7 @@ struct page *dax_layout_busy_page(struct |
| EXPORT_SYMBOL_GPL(dax_layout_busy_page); |
| |
| static int __dax_invalidate_entry(struct address_space *mapping, |
| - pgoff_t index, bool trunc) |
| + pgoff_t index, bool trunc) |
| { |
| XA_STATE(xas, &mapping->i_pages, index); |
| int ret = 0; |
| @@ -953,7 +987,8 @@ void dax_break_layout_final(struct inode |
| wait_page_idle_uninterruptible(page, inode); |
| } while (true); |
| |
| - dax_delete_mapping_range(inode->i_mapping, 0, LLONG_MAX); |
| + if (!page) |
| + dax_delete_mapping_range(inode->i_mapping, 0, LLONG_MAX); |
| } |
| EXPORT_SYMBOL_GPL(dax_break_layout_final); |
| |
| @@ -1039,8 +1074,10 @@ static void *dax_insert_entry(struct xa_ |
| void *old; |
| |
| dax_disassociate_entry(entry, mapping, false); |
| - dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address, |
| - shared); |
| + if (!(flags & DAX_ZERO_PAGE)) |
| + dax_associate_entry(new_entry, mapping, vmf->vma, |
| + vmf->address, shared); |
| + |
| /* |
| * Only swap our new entry into the page cache if the current |
| * entry is a zero page or an empty entry. If a normal PTE or |
| @@ -1228,9 +1265,7 @@ static int dax_iomap_direct_access(const |
| goto out; |
| if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1)) |
| goto out; |
| - /* For larger pages we need devmap */ |
| - if (length > 1 && !pfn_t_devmap(*pfnp)) |
| - goto out; |
| + |
| rc = 0; |
| |
| out_check_addr: |
| @@ -1337,7 +1372,7 @@ static vm_fault_t dax_load_hole(struct x |
| |
| *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE); |
| |
| - ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); |
| + ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), false); |
| trace_dax_load_hole(inode, vmf, ret); |
| return ret; |
| } |
| @@ -1808,7 +1843,8 @@ static vm_fault_t dax_fault_iter(struct |
| loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT; |
| bool write = iter->flags & IOMAP_WRITE; |
| unsigned long entry_flags = pmd ? DAX_PMD : 0; |
| - int err = 0; |
| + struct folio *folio; |
| + int ret, err = 0; |
| pfn_t pfn; |
| void *kaddr; |
| |
| @@ -1840,17 +1876,19 @@ static vm_fault_t dax_fault_iter(struct |
| return dax_fault_return(err); |
| } |
| |
| + folio = dax_to_folio(*entry); |
| if (dax_fault_is_synchronous(iter, vmf->vma)) |
| return dax_fault_synchronous_pfnp(pfnp, pfn); |
| |
| - /* insert PMD pfn */ |
| + folio_ref_inc(folio); |
| if (pmd) |
| - return vmf_insert_pfn_pmd(vmf, pfn, write); |
| + ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn_t_to_pfn(pfn)), |
| + write); |
| + else |
| + ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), write); |
| + folio_put(folio); |
| |
| - /* insert PTE pfn */ |
| - if (write) |
| - return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); |
| - return vmf_insert_mixed(vmf->vma, vmf->address, pfn); |
| + return ret; |
| } |
| |
| static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, |
| @@ -2089,6 +2127,7 @@ dax_insert_pfn_mkwrite(struct vm_fault * |
| { |
| struct address_space *mapping = vmf->vma->vm_file->f_mapping; |
| XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order); |
| + struct folio *folio; |
| void *entry; |
| vm_fault_t ret; |
| |
| @@ -2106,14 +2145,17 @@ dax_insert_pfn_mkwrite(struct vm_fault * |
| xas_set_mark(&xas, PAGECACHE_TAG_DIRTY); |
| dax_lock_entry(&xas, entry); |
| xas_unlock_irq(&xas); |
| + folio = pfn_folio(pfn_t_to_pfn(pfn)); |
| + folio_ref_inc(folio); |
| if (order == 0) |
| - ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); |
| + ret = vmf_insert_page_mkwrite(vmf, &folio->page, true); |
| #ifdef CONFIG_FS_DAX_PMD |
| else if (order == PMD_ORDER) |
| - ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE); |
| + ret = vmf_insert_folio_pmd(vmf, folio, FAULT_FLAG_WRITE); |
| #endif |
| else |
| ret = VM_FAULT_FALLBACK; |
| + folio_put(folio); |
| dax_unlock_entry(&xas, entry); |
| trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret); |
| return ret; |
| --- a/fs/fuse/virtio_fs.c~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/fs/fuse/virtio_fs.c |
| @@ -1017,8 +1017,7 @@ static long virtio_fs_direct_access(stru |
| if (kaddr) |
| *kaddr = fs->window_kaddr + offset; |
| if (pfn) |
| - *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, |
| - PFN_DEV | PFN_MAP); |
| + *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, 0); |
| return nr_pages > max_nr_pages ? max_nr_pages : nr_pages; |
| } |
| |
| --- a/include/linux/dax.h~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/include/linux/dax.h |
| @@ -209,7 +209,7 @@ int dax_truncate_page(struct inode *inod |
| |
| static inline bool dax_page_is_idle(struct page *page) |
| { |
| - return page && page_ref_count(page) == 1; |
| + return page && page_ref_count(page) == 0; |
| } |
| |
| #if IS_ENABLED(CONFIG_DAX) |
| --- a/include/linux/mm.h~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/include/linux/mm.h |
| @@ -1192,6 +1192,8 @@ int vma_is_stack_for_current(struct vm_a |
| struct mmu_gather; |
| struct inode; |
| |
| +extern void prep_compound_page(struct page *page, unsigned int order); |
| + |
| /* |
| * compound_order() can be called without holding a reference, which means |
| * that niceties like page_folio() don't work. These callers should be |
| @@ -1513,25 +1515,6 @@ vm_fault_t finish_fault(struct vm_fault |
| * back into memory. |
| */ |
| |
| -#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX) |
| -DECLARE_STATIC_KEY_FALSE(devmap_managed_key); |
| - |
| -bool __put_devmap_managed_folio_refs(struct folio *folio, int refs); |
| -static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs) |
| -{ |
| - if (!static_branch_unlikely(&devmap_managed_key)) |
| - return false; |
| - if (!folio_is_zone_device(folio)) |
| - return false; |
| - return __put_devmap_managed_folio_refs(folio, refs); |
| -} |
| -#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ |
| -static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs) |
| -{ |
| - return false; |
| -} |
| -#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ |
| - |
| /* 127: arbitrary random number, small enough to assemble well */ |
| #define folio_ref_zero_or_close_to_overflow(folio) \ |
| ((unsigned int) folio_ref_count(folio) + 127u <= 127u) |
| @@ -1652,12 +1635,6 @@ static inline void put_page(struct page |
| if (folio_test_slab(folio)) |
| return; |
| |
| - /* |
| - * For some devmap managed pages we need to catch refcount transition |
| - * from 2 to 1: |
| - */ |
| - if (put_devmap_managed_folio_refs(folio, 1)) |
| - return; |
| folio_put(folio); |
| } |
| |
| --- a/include/linux/mm_types.h~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/include/linux/mm_types.h |
| @@ -296,6 +296,8 @@ typedef struct { |
| * anonymous memory. |
| * @index: Offset within the file, in units of pages. For anonymous memory, |
| * this is the index from the beginning of the mmap. |
| + * @share: number of DAX mappings that reference this folio. See |
| + * dax_associate_entry. |
| * @private: Filesystem per-folio data (see folio_attach_private()). |
| * @swap: Used for swp_entry_t if folio_test_swapcache(). |
| * @_mapcount: Do not access this member directly. Use folio_mapcount() to |
| @@ -345,7 +347,10 @@ struct folio { |
| struct dev_pagemap *pgmap; |
| }; |
| struct address_space *mapping; |
| - pgoff_t index; |
| + union { |
| + pgoff_t index; |
| + unsigned long share; |
| + }; |
| union { |
| void *private; |
| swp_entry_t swap; |
| --- a/mm/gup.c~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/mm/gup.c |
| @@ -96,8 +96,7 @@ retry: |
| * belongs to this folio. |
| */ |
| if (unlikely(page_folio(page) != folio)) { |
| - if (!put_devmap_managed_folio_refs(folio, refs)) |
| - folio_put_refs(folio, refs); |
| + folio_put_refs(folio, refs); |
| goto retry; |
| } |
| |
| @@ -116,8 +115,7 @@ static void gup_put_folio(struct folio * |
| refs *= GUP_PIN_COUNTING_BIAS; |
| } |
| |
| - if (!put_devmap_managed_folio_refs(folio, refs)) |
| - folio_put_refs(folio, refs); |
| + folio_put_refs(folio, refs); |
| } |
| |
| /** |
| @@ -565,8 +563,7 @@ static struct folio *try_grab_folio_fast |
| */ |
| if (unlikely((flags & FOLL_LONGTERM) && |
| !folio_is_longterm_pinnable(folio))) { |
| - if (!put_devmap_managed_folio_refs(folio, refs)) |
| - folio_put_refs(folio, refs); |
| + folio_put_refs(folio, refs); |
| return NULL; |
| } |
| |
| --- a/mm/huge_memory.c~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/mm/huge_memory.c |
| @@ -2225,7 +2225,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, |
| tlb->fullmm); |
| arch_check_zapped_pmd(vma, orig_pmd); |
| tlb_remove_pmd_tlb_entry(tlb, pmd, addr); |
| - if (vma_is_special_huge(vma)) { |
| + if (!vma_is_dax(vma) && vma_is_special_huge(vma)) { |
| if (arch_needs_pgtable_deposit()) |
| zap_deposited_table(tlb->mm, pmd); |
| spin_unlock(ptl); |
| @@ -2882,13 +2882,15 @@ static void __split_huge_pmd_locked(stru |
| */ |
| if (arch_needs_pgtable_deposit()) |
| zap_deposited_table(mm, pmd); |
| - if (vma_is_special_huge(vma)) |
| + if (!vma_is_dax(vma) && vma_is_special_huge(vma)) |
| return; |
| if (unlikely(is_pmd_migration_entry(old_pmd))) { |
| swp_entry_t entry; |
| |
| entry = pmd_to_swp_entry(old_pmd); |
| folio = pfn_swap_entry_folio(entry); |
| + } else if (is_huge_zero_pmd(old_pmd)) { |
| + return; |
| } else { |
| page = pmd_page(old_pmd); |
| folio = page_folio(page); |
| --- a/mm/internal.h~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/mm/internal.h |
| @@ -737,8 +737,6 @@ static inline void prep_compound_tail(st |
| set_page_private(p, 0); |
| } |
| |
| -extern void prep_compound_page(struct page *page, unsigned int order); |
| - |
| void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); |
| extern bool free_pages_prepare(struct page *page, unsigned int order); |
| |
| --- a/mm/memory.c~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/mm/memory.c |
| @@ -3848,13 +3848,15 @@ static vm_fault_t do_wp_page(struct vm_f |
| if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { |
| /* |
| * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a |
| - * VM_PFNMAP VMA. |
| + * VM_PFNMAP VMA. FS DAX also wants ops->pfn_mkwrite called. |
| * |
| * We should not cow pages in a shared writeable mapping. |
| * Just mark the pages writable and/or call ops->pfn_mkwrite. |
| */ |
| - if (!vmf->page) |
| + if (!vmf->page || is_fsdax_page(vmf->page)) { |
| + vmf->page = NULL; |
| return wp_pfn_shared(vmf); |
| + } |
| return wp_page_shared(vmf, folio); |
| } |
| |
| --- a/mm/memory-failure.c~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/mm/memory-failure.c |
| @@ -419,18 +419,18 @@ static unsigned long dev_pagemap_mapping |
| pud = pud_offset(p4d, address); |
| if (!pud_present(*pud)) |
| return 0; |
| - if (pud_devmap(*pud)) |
| + if (pud_trans_huge(*pud)) |
| return PUD_SHIFT; |
| pmd = pmd_offset(pud, address); |
| if (!pmd_present(*pmd)) |
| return 0; |
| - if (pmd_devmap(*pmd)) |
| + if (pmd_trans_huge(*pmd)) |
| return PMD_SHIFT; |
| pte = pte_offset_map(pmd, address); |
| if (!pte) |
| return 0; |
| ptent = ptep_get(pte); |
| - if (pte_present(ptent) && pte_devmap(ptent)) |
| + if (pte_present(ptent)) |
| ret = PAGE_SHIFT; |
| pte_unmap(pte); |
| return ret; |
| --- a/mm/memremap.c~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/mm/memremap.c |
| @@ -458,8 +458,13 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap); |
| |
| void free_zone_device_folio(struct folio *folio) |
| { |
| - if (WARN_ON_ONCE(!folio->pgmap->ops || |
| - !folio->pgmap->ops->page_free)) |
| + struct dev_pagemap *pgmap = folio->pgmap; |
| + |
| + if (WARN_ON_ONCE(!pgmap->ops)) |
| + return; |
| + |
| + if (WARN_ON_ONCE(pgmap->type != MEMORY_DEVICE_FS_DAX && |
| + !pgmap->ops->page_free)) |
| return; |
| |
| mem_cgroup_uncharge(folio); |
| @@ -484,26 +489,36 @@ void free_zone_device_folio(struct folio |
| * For other types of ZONE_DEVICE pages, migration is either |
| * handled differently or not done at all, so there is no need |
| * to clear folio->mapping. |
| + * |
| + * FS DAX pages clear the mapping when the folio->share count hits |
| + * zero, which indicates the page has been removed from the file |
| + * system mapping. |
| */ |
| - folio->mapping = NULL; |
| - folio->pgmap->ops->page_free(folio_page(folio, 0)); |
| + if (pgmap->type != MEMORY_DEVICE_FS_DAX) |
| + folio->mapping = NULL; |
| |
| - switch (folio->pgmap->type) { |
| + switch (pgmap->type) { |
| case MEMORY_DEVICE_PRIVATE: |
| case MEMORY_DEVICE_COHERENT: |
| - put_dev_pagemap(folio->pgmap); |
| + pgmap->ops->page_free(folio_page(folio, 0)); |
| + put_dev_pagemap(pgmap); |
| break; |
| |
| - case MEMORY_DEVICE_FS_DAX: |
| case MEMORY_DEVICE_GENERIC: |
| /* |
| * Reset the refcount to 1 to prepare for handing out the page |
| * again. |
| */ |
| + pgmap->ops->page_free(folio_page(folio, 0)); |
| folio_set_count(folio, 1); |
| break; |
| |
| + case MEMORY_DEVICE_FS_DAX: |
| + wake_up_var(&folio->page); |
| + break; |
| + |
| case MEMORY_DEVICE_PCI_P2PDMA: |
| + pgmap->ops->page_free(folio_page(folio, 0)); |
| break; |
| } |
| } |
| @@ -519,21 +534,3 @@ void zone_device_page_init(struct page * |
| lock_page(page); |
| } |
| EXPORT_SYMBOL_GPL(zone_device_page_init); |
| - |
| -#ifdef CONFIG_FS_DAX |
| -bool __put_devmap_managed_folio_refs(struct folio *folio, int refs) |
| -{ |
| - if (folio->pgmap->type != MEMORY_DEVICE_FS_DAX) |
| - return false; |
| - |
| - /* |
| - * fsdax page refcounts are 1-based, rather than 0-based: if |
| - * refcount is 1, then the page is free and the refcount is |
| - * stable because nobody holds a reference on the page. |
| - */ |
| - if (folio_ref_sub_return(folio, refs) == 1) |
| - wake_up_var(&folio->_refcount); |
| - return true; |
| -} |
| -EXPORT_SYMBOL(__put_devmap_managed_folio_refs); |
| -#endif /* CONFIG_FS_DAX */ |
| --- a/mm/mm_init.c~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/mm/mm_init.c |
| @@ -1026,23 +1026,22 @@ static void __ref __init_zone_device_pag |
| } |
| |
| /* |
| - * ZONE_DEVICE pages other than MEMORY_TYPE_GENERIC and |
| - * MEMORY_TYPE_FS_DAX pages are released directly to the driver page |
| - * allocator which will set the page count to 1 when allocating the |
| - * page. |
| + * ZONE_DEVICE pages other than MEMORY_TYPE_GENERIC are released |
| + * directly to the driver page allocator which will set the page count |
| + * to 1 when allocating the page. |
| * |
| * MEMORY_TYPE_GENERIC and MEMORY_TYPE_FS_DAX pages automatically have |
| * their refcount reset to one whenever they are freed (ie. after |
| * their refcount drops to 0). |
| */ |
| switch (pgmap->type) { |
| + case MEMORY_DEVICE_FS_DAX: |
| case MEMORY_DEVICE_PRIVATE: |
| case MEMORY_DEVICE_COHERENT: |
| case MEMORY_DEVICE_PCI_P2PDMA: |
| set_page_count(page, 0); |
| break; |
| |
| - case MEMORY_DEVICE_FS_DAX: |
| case MEMORY_DEVICE_GENERIC: |
| break; |
| } |
| --- a/mm/swap.c~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/mm/swap.c |
| @@ -956,8 +956,6 @@ void folios_put_refs(struct folio_batch |
| unlock_page_lruvec_irqrestore(lruvec, flags); |
| lruvec = NULL; |
| } |
| - if (put_devmap_managed_folio_refs(folio, nr_refs)) |
| - continue; |
| if (folio_ref_sub_and_test(folio, nr_refs)) |
| free_zone_device_folio(folio); |
| continue; |
| _ |