| From: Alistair Popple <apopple@nvidia.com> |
| Subject: fs/dax: properly refcount fs dax pages |
| Date: Fri, 28 Feb 2025 14:31:14 +1100 |
| |
| Currently FS DAX pages are considered free when their refcount drops to |
| one, and their refcounts are not increased when they are mapped via PTEs |
| or decreased when they are unmapped. This requires special logic in mm |
| paths to detect that these pages are not refcounted normally, and to |
| treat a refcount of one, rather than zero, as the point at which a page |
| becomes free. |
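| |
| As a concrete illustration of the old convention, a hedged sketch is |
| given below; the _old suffix is hypothetical and the snippet simply |
| mirrors the dax_page_is_idle() hunk in include/linux/dax.h further down: |
| |
| /* |
|  * Old convention: an FS DAX page with no users other than its |
|  * pagecache entry still reported a refcount of one, so "idle" had |
|  * to be detected as a refcount of one rather than zero. |
|  */ |
| static inline bool dax_page_is_idle_old(struct page *page) |
| { |
|         return page && page_ref_count(page) == 1; |
| } |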
| |
| On the other hand, get_user_pages() and friends do refcount FS DAX pages |
| properly, taking a reference when a page is pinned and dropping it when |
| the page is unpinned. |
| |
| Tracking this special behaviour requires extra PTE bits (e.g. pte_devmap) |
| and introduces rules that are potentially confusing and specific to FS |
| DAX pages. To fix this, and to possibly allow removal of the special PTE |
| bits in future, convert FS DAX page refcounts to be zero-based and |
| instead take a reference on the page each time it is mapped, as is |
| already the case for normal pages. |
| |
| This may also allow a future clean-up to remove the pgmap refcounting that |
| is currently done in mm/gup.c. |
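| |
| With this change the check becomes zero-based and each mapping holds a |
| reference; again only a sketch with a hypothetical _new suffix, the real |
| code being in the include/linux/dax.h and fs/dax.c hunks below: |
| |
| /* |
|  * New convention: refcounts are zero-based, as for normal pages, so |
|  * an idle FS DAX page has a refcount of zero. |
|  */ |
| static inline bool dax_page_is_idle_new(struct page *page) |
| { |
|         return page && page_ref_count(page) == 0; |
| } |
| |
| The reference for each new mapping is taken when the page is inserted at |
| fault time, via vmf_insert_page_mkwrite() or vmf_insert_folio_pmd() as |
| used in dax_fault_iter(). |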
| |
| Link: https://lkml.kernel.org/r/c7d886ad7468a20452ef6e0ddab6cfe220874e7c.1740713401.git-series.apopple@nvidia.com |
| Signed-off-by: Alistair Popple <apopple@nvidia.com> |
| Reviewed-by: Dan Williams <dan.j.williams@intel.com> |
| Tested-by: Alison Schofield <alison.schofield@intel.com> |
| Acked-by: David Hildenbrand <david@redhat.com> |
| Cc: Alexander Gordeev <agordeev@linux.ibm.com> |
| Cc: Asahi Lina <lina@asahilina.net> |
| Cc: Balbir Singh <balbirs@nvidia.com> |
| Cc: Bjorn Helgaas <bhelgaas@google.com> |
| Cc: Catalin Marinas <catalin.marinas@arm.com> |
| Cc: Christian Borntraeger <borntraeger@linux.ibm.com> |
| Cc: Christoph Hellwig <hch@lst.de> |
| Cc: Chunyan Zhang <zhang.lyra@gmail.com> |
| Cc: "Darrick J. Wong" <djwong@kernel.org> |
| Cc: Dave Chinner <david@fromorbit.com> |
| Cc: Dave Hansen <dave.hansen@linux.intel.com> |
| Cc: Dave Jiang <dave.jiang@intel.com> |
| Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com> |
| Cc: Heiko Carstens <hca@linux.ibm.com> |
| Cc: Huacai Chen <chenhuacai@kernel.org> |
| Cc: Ira Weiny <ira.weiny@intel.com> |
| Cc: Jan Kara <jack@suse.cz> |
| Cc: Jason Gunthorpe <jgg@nvidia.com> |
| Cc: Jason Gunthorpe <jgg@ziepe.ca> |
| Cc: John Hubbard <jhubbard@nvidia.com> |
| Cc: linmiaohe <linmiaohe@huawei.com> |
| Cc: Logan Gunthorpe <logang@deltatee.com> |
| Cc: Matthew Wilcox (Oracle) <willy@infradead.org> |
| Cc: Michael "Camp Drill Sergeant" Ellerman <mpe@ellerman.id.au> |
| Cc: Nicholas Piggin <npiggin@gmail.com> |
| Cc: Peter Xu <peterx@redhat.com> |
| Cc: Sven Schnelle <svens@linux.ibm.com> |
| Cc: Ted Ts'o <tytso@mit.edu> |
| Cc: Vasily Gorbik <gor@linux.ibm.com> |
| Cc: Vishal Verma <vishal.l.verma@intel.com> |
| Cc: Vivek Goyal <vgoyal@redhat.com> |
| Cc: WANG Xuerui <kernel@xen0n.name> |
| Cc: Will Deacon <will@kernel.org> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| drivers/nvdimm/pmem.c | 4 |
| fs/dax.c | 188 ++++++++++++++++++++++--------------- |
| fs/fuse/virtio_fs.c | 3 |
| include/linux/dax.h | 2 |
| include/linux/mm.h | 27 ----- |
| include/linux/mm_types.h | 7 + |
| mm/gup.c | 9 - |
| mm/huge_memory.c | 6 - |
| mm/internal.h | 2 |
| mm/memory-failure.c | 6 - |
| mm/memory.c | 6 - |
| mm/memremap.c | 47 ++++----- |
| mm/mm_init.c | 9 - |
| mm/swap.c | 2 |
| 14 files changed, 166 insertions(+), 152 deletions(-) |
| |
| --- a/drivers/nvdimm/pmem.c~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/drivers/nvdimm/pmem.c |
| @@ -513,7 +513,7 @@ static int pmem_attach_disk(struct devic |
| |
| pmem->disk = disk; |
| pmem->pgmap.owner = pmem; |
| - pmem->pfn_flags = PFN_DEV; |
| + pmem->pfn_flags = 0; |
| if (is_nd_pfn(dev)) { |
| pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; |
| pmem->pgmap.ops = &fsdax_pagemap_ops; |
| @@ -522,7 +522,6 @@ static int pmem_attach_disk(struct devic |
| pmem->data_offset = le64_to_cpu(pfn_sb->dataoff); |
| pmem->pfn_pad = resource_size(res) - |
| range_len(&pmem->pgmap.range); |
| - pmem->pfn_flags |= PFN_MAP; |
| bb_range = pmem->pgmap.range; |
| bb_range.start += pmem->data_offset; |
| } else if (pmem_should_map_pages(dev)) { |
| @@ -532,7 +531,6 @@ static int pmem_attach_disk(struct devic |
| pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; |
| pmem->pgmap.ops = &fsdax_pagemap_ops; |
| addr = devm_memremap_pages(dev, &pmem->pgmap); |
| - pmem->pfn_flags |= PFN_MAP; |
| bb_range = pmem->pgmap.range; |
| } else { |
| addr = devm_memremap(dev, pmem->phys_addr, |
| --- a/fs/dax.c~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/fs/dax.c |
| @@ -71,6 +71,11 @@ static unsigned long dax_to_pfn(void *en |
| return xa_to_value(entry) >> DAX_SHIFT; |
| } |
| |
| +static struct folio *dax_to_folio(void *entry) |
| +{ |
| + return page_folio(pfn_to_page(dax_to_pfn(entry))); |
| +} |
| + |
| static void *dax_make_entry(pfn_t pfn, unsigned long flags) |
| { |
| return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT)); |
| @@ -338,19 +343,6 @@ static unsigned long dax_entry_size(void |
| return PAGE_SIZE; |
| } |
| |
| -static unsigned long dax_end_pfn(void *entry) |
| -{ |
| - return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; |
| -} |
| - |
| -/* |
| - * Iterate through all mapped pfns represented by an entry, i.e. skip |
| - * 'empty' and 'zero' entries. |
| - */ |
| -#define for_each_mapped_pfn(entry, pfn) \ |
| - for (pfn = dax_to_pfn(entry); \ |
| - pfn < dax_end_pfn(entry); pfn++) |
| - |
| /* |
| * A DAX folio is considered shared if it has no mapping set and ->share (which |
| * shares the ->index field) is non-zero. Note this may return false even if the |
| @@ -359,7 +351,7 @@ static unsigned long dax_end_pfn(void *e |
| */ |
| static inline bool dax_folio_is_shared(struct folio *folio) |
| { |
| - return !folio->mapping && folio->page.share; |
| + return !folio->mapping && folio->share; |
| } |
| |
| /* |
| @@ -384,75 +376,117 @@ static void dax_folio_make_shared(struct |
| * folio has previously been mapped into one address space so set the |
| * share count. |
| */ |
| - folio->page.share = 1; |
| + folio->share = 1; |
| } |
| |
| -static inline unsigned long dax_folio_share_put(struct folio *folio) |
| +static inline unsigned long dax_folio_put(struct folio *folio) |
| { |
| - return --folio->page.share; |
| + unsigned long ref; |
| + int order, i; |
| + |
| + if (!dax_folio_is_shared(folio)) |
| + ref = 0; |
| + else |
| + ref = --folio->share; |
| + |
| + if (ref) |
| + return ref; |
| + |
| + folio->mapping = NULL; |
| + order = folio_order(folio); |
| + if (!order) |
| + return 0; |
| + |
| + for (i = 0; i < (1UL << order); i++) { |
| + struct dev_pagemap *pgmap = page_pgmap(&folio->page); |
| + struct page *page = folio_page(folio, i); |
| + struct folio *new_folio = (struct folio *)page; |
| + |
| + ClearPageHead(page); |
| + clear_compound_head(page); |
| + |
| + new_folio->mapping = NULL; |
| + /* |
| + * Reset pgmap, which was overwritten by |
| + * prep_compound_page(). |
| + */ |
| + new_folio->pgmap = pgmap; |
| + new_folio->share = 0; |
| + WARN_ON_ONCE(folio_ref_count(new_folio)); |
| + } |
| + |
| + return ref; |
| +} |
| + |
| +static void dax_folio_init(void *entry) |
| +{ |
| + struct folio *folio = dax_to_folio(entry); |
| + int order = dax_entry_order(entry); |
| + |
| + /* |
| + * The folio should have been split back to order-0 pages in |
| + * dax_folio_put() when it was removed from its final |
| + * mapping. |
| + */ |
| + WARN_ON_ONCE(folio_order(folio)); |
| + |
| + if (order > 0) { |
| + prep_compound_page(&folio->page, order); |
| + if (order > 1) |
| + INIT_LIST_HEAD(&folio->_deferred_list); |
| + WARN_ON_ONCE(folio_ref_count(folio)); |
| + } |
| } |
| |
| static void dax_associate_entry(void *entry, struct address_space *mapping, |
| - struct vm_area_struct *vma, unsigned long address, bool shared) |
| + struct vm_area_struct *vma, |
| + unsigned long address, bool shared) |
| { |
| - unsigned long size = dax_entry_size(entry), pfn, index; |
| - int i = 0; |
| + unsigned long size = dax_entry_size(entry), index; |
| + struct folio *folio = dax_to_folio(entry); |
| |
| if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) |
| return; |
| |
| index = linear_page_index(vma, address & ~(size - 1)); |
| - for_each_mapped_pfn(entry, pfn) { |
| - struct folio *folio = pfn_folio(pfn); |
| - |
| - if (shared && (folio->mapping || folio->page.share)) { |
| - if (folio->mapping) |
| - dax_folio_make_shared(folio); |
| - |
| - WARN_ON_ONCE(!folio->page.share); |
| - folio->page.share++; |
| - } else { |
| - WARN_ON_ONCE(folio->mapping); |
| - folio->mapping = mapping; |
| - folio->index = index + i++; |
| - } |
| + if (shared && (folio->mapping || dax_folio_is_shared(folio))) { |
| + if (folio->mapping) |
| + dax_folio_make_shared(folio); |
| + |
| + WARN_ON_ONCE(!folio->share); |
| + WARN_ON_ONCE(dax_entry_order(entry) != folio_order(folio)); |
| + folio->share++; |
| + } else { |
| + WARN_ON_ONCE(folio->mapping); |
| + dax_folio_init(entry); |
| + folio = dax_to_folio(entry); |
| + folio->mapping = mapping; |
| + folio->index = index; |
| } |
| } |
| |
| static void dax_disassociate_entry(void *entry, struct address_space *mapping, |
| - bool trunc) |
| + bool trunc) |
| { |
| - unsigned long pfn; |
| + struct folio *folio = dax_to_folio(entry); |
| |
| if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) |
| return; |
| |
| - for_each_mapped_pfn(entry, pfn) { |
| - struct folio *folio = pfn_folio(pfn); |
| - |
| - WARN_ON_ONCE(trunc && folio_ref_count(folio) > 1); |
| - if (dax_folio_is_shared(folio)) { |
| - /* keep the shared flag if this page is still shared */ |
| - if (dax_folio_share_put(folio) > 0) |
| - continue; |
| - } else |
| - WARN_ON_ONCE(folio->mapping && folio->mapping != mapping); |
| - folio->mapping = NULL; |
| - folio->index = 0; |
| - } |
| + dax_folio_put(folio); |
| } |
| |
| static struct page *dax_busy_page(void *entry) |
| { |
| - unsigned long pfn; |
| + struct folio *folio = dax_to_folio(entry); |
| |
| - for_each_mapped_pfn(entry, pfn) { |
| - struct page *page = pfn_to_page(pfn); |
| + if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) |
| + return NULL; |
| |
| - if (page_ref_count(page) > 1) |
| - return page; |
| - } |
| - return NULL; |
| + if (folio_ref_count(folio) - folio_mapcount(folio)) |
| + return &folio->page; |
| + else |
| + return NULL; |
| } |
| |
| /** |
| @@ -785,7 +819,7 @@ struct page *dax_layout_busy_page(struct |
| EXPORT_SYMBOL_GPL(dax_layout_busy_page); |
| |
| static int __dax_invalidate_entry(struct address_space *mapping, |
| - pgoff_t index, bool trunc) |
| + pgoff_t index, bool trunc) |
| { |
| XA_STATE(xas, &mapping->i_pages, index); |
| int ret = 0; |
| @@ -953,7 +987,8 @@ void dax_break_layout_final(struct inode |
| wait_page_idle_uninterruptible(page, inode); |
| } while (true); |
| |
| - dax_delete_mapping_range(inode->i_mapping, 0, LLONG_MAX); |
| + if (!page) |
| + dax_delete_mapping_range(inode->i_mapping, 0, LLONG_MAX); |
| } |
| EXPORT_SYMBOL_GPL(dax_break_layout_final); |
| |
| @@ -1039,8 +1074,10 @@ static void *dax_insert_entry(struct xa_ |
| void *old; |
| |
| dax_disassociate_entry(entry, mapping, false); |
| - dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address, |
| - shared); |
| + if (!(flags & DAX_ZERO_PAGE)) |
| + dax_associate_entry(new_entry, mapping, vmf->vma, |
| + vmf->address, shared); |
| + |
| /* |
| * Only swap our new entry into the page cache if the current |
| * entry is a zero page or an empty entry. If a normal PTE or |
| @@ -1228,9 +1265,7 @@ static int dax_iomap_direct_access(const |
| goto out; |
| if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1)) |
| goto out; |
| - /* For larger pages we need devmap */ |
| - if (length > 1 && !pfn_t_devmap(*pfnp)) |
| - goto out; |
| + |
| rc = 0; |
| |
| out_check_addr: |
| @@ -1337,7 +1372,7 @@ static vm_fault_t dax_load_hole(struct x |
| |
| *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE); |
| |
| - ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); |
| + ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), false); |
| trace_dax_load_hole(inode, vmf, ret); |
| return ret; |
| } |
| @@ -1808,7 +1843,8 @@ static vm_fault_t dax_fault_iter(struct |
| loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT; |
| bool write = iter->flags & IOMAP_WRITE; |
| unsigned long entry_flags = pmd ? DAX_PMD : 0; |
| - int err = 0; |
| + struct folio *folio; |
| + int ret, err = 0; |
| pfn_t pfn; |
| void *kaddr; |
| |
| @@ -1840,17 +1876,19 @@ static vm_fault_t dax_fault_iter(struct |
| return dax_fault_return(err); |
| } |
| |
| + folio = dax_to_folio(*entry); |
| if (dax_fault_is_synchronous(iter, vmf->vma)) |
| return dax_fault_synchronous_pfnp(pfnp, pfn); |
| |
| - /* insert PMD pfn */ |
| + folio_ref_inc(folio); |
| if (pmd) |
| - return vmf_insert_pfn_pmd(vmf, pfn, write); |
| + ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn_t_to_pfn(pfn)), |
| + write); |
| + else |
| + ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), write); |
| + folio_put(folio); |
| |
| - /* insert PTE pfn */ |
| - if (write) |
| - return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); |
| - return vmf_insert_mixed(vmf->vma, vmf->address, pfn); |
| + return ret; |
| } |
| |
| static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, |
| @@ -2089,6 +2127,7 @@ dax_insert_pfn_mkwrite(struct vm_fault * |
| { |
| struct address_space *mapping = vmf->vma->vm_file->f_mapping; |
| XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order); |
| + struct folio *folio; |
| void *entry; |
| vm_fault_t ret; |
| |
| @@ -2106,14 +2145,17 @@ dax_insert_pfn_mkwrite(struct vm_fault * |
| xas_set_mark(&xas, PAGECACHE_TAG_DIRTY); |
| dax_lock_entry(&xas, entry); |
| xas_unlock_irq(&xas); |
| + folio = pfn_folio(pfn_t_to_pfn(pfn)); |
| + folio_ref_inc(folio); |
| if (order == 0) |
| - ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); |
| + ret = vmf_insert_page_mkwrite(vmf, &folio->page, true); |
| #ifdef CONFIG_FS_DAX_PMD |
| else if (order == PMD_ORDER) |
| - ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE); |
| + ret = vmf_insert_folio_pmd(vmf, folio, FAULT_FLAG_WRITE); |
| #endif |
| else |
| ret = VM_FAULT_FALLBACK; |
| + folio_put(folio); |
| dax_unlock_entry(&xas, entry); |
| trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret); |
| return ret; |
| --- a/fs/fuse/virtio_fs.c~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/fs/fuse/virtio_fs.c |
| @@ -1017,8 +1017,7 @@ static long virtio_fs_direct_access(stru |
| if (kaddr) |
| *kaddr = fs->window_kaddr + offset; |
| if (pfn) |
| - *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, |
| - PFN_DEV | PFN_MAP); |
| + *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, 0); |
| return nr_pages > max_nr_pages ? max_nr_pages : nr_pages; |
| } |
| |
| --- a/include/linux/dax.h~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/include/linux/dax.h |
| @@ -209,7 +209,7 @@ int dax_truncate_page(struct inode *inod |
| |
| static inline bool dax_page_is_idle(struct page *page) |
| { |
| - return page && page_ref_count(page) == 1; |
| + return page && page_ref_count(page) == 0; |
| } |
| |
| #if IS_ENABLED(CONFIG_DAX) |
| --- a/include/linux/mm.h~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/include/linux/mm.h |
| @@ -1192,6 +1192,8 @@ int vma_is_stack_for_current(struct vm_a |
| struct mmu_gather; |
| struct inode; |
| |
| +extern void prep_compound_page(struct page *page, unsigned int order); |
| + |
| /* |
| * compound_order() can be called without holding a reference, which means |
| * that niceties like page_folio() don't work. These callers should be |
| @@ -1513,25 +1515,6 @@ vm_fault_t finish_fault(struct vm_fault |
| * back into memory. |
| */ |
| |
| -#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX) |
| -DECLARE_STATIC_KEY_FALSE(devmap_managed_key); |
| - |
| -bool __put_devmap_managed_folio_refs(struct folio *folio, int refs); |
| -static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs) |
| -{ |
| - if (!static_branch_unlikely(&devmap_managed_key)) |
| - return false; |
| - if (!folio_is_zone_device(folio)) |
| - return false; |
| - return __put_devmap_managed_folio_refs(folio, refs); |
| -} |
| -#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ |
| -static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs) |
| -{ |
| - return false; |
| -} |
| -#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ |
| - |
| /* 127: arbitrary random number, small enough to assemble well */ |
| #define folio_ref_zero_or_close_to_overflow(folio) \ |
| ((unsigned int) folio_ref_count(folio) + 127u <= 127u) |
| @@ -1652,12 +1635,6 @@ static inline void put_page(struct page |
| if (folio_test_slab(folio)) |
| return; |
| |
| - /* |
| - * For some devmap managed pages we need to catch refcount transition |
| - * from 2 to 1: |
| - */ |
| - if (put_devmap_managed_folio_refs(folio, 1)) |
| - return; |
| folio_put(folio); |
| } |
| |
| --- a/include/linux/mm_types.h~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/include/linux/mm_types.h |
| @@ -296,6 +296,8 @@ typedef struct { |
| * anonymous memory. |
| * @index: Offset within the file, in units of pages. For anonymous memory, |
| * this is the index from the beginning of the mmap. |
| + * @share: number of DAX mappings that reference this folio. See |
| + * dax_associate_entry. |
| * @private: Filesystem per-folio data (see folio_attach_private()). |
| * @swap: Used for swp_entry_t if folio_test_swapcache(). |
| * @_mapcount: Do not access this member directly. Use folio_mapcount() to |
| @@ -345,7 +347,10 @@ struct folio { |
| struct dev_pagemap *pgmap; |
| }; |
| struct address_space *mapping; |
| - pgoff_t index; |
| + union { |
| + pgoff_t index; |
| + unsigned long share; |
| + }; |
| union { |
| void *private; |
| swp_entry_t swap; |
| --- a/mm/gup.c~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/mm/gup.c |
| @@ -96,8 +96,7 @@ retry: |
| * belongs to this folio. |
| */ |
| if (unlikely(page_folio(page) != folio)) { |
| - if (!put_devmap_managed_folio_refs(folio, refs)) |
| - folio_put_refs(folio, refs); |
| + folio_put_refs(folio, refs); |
| goto retry; |
| } |
| |
| @@ -116,8 +115,7 @@ static void gup_put_folio(struct folio * |
| refs *= GUP_PIN_COUNTING_BIAS; |
| } |
| |
| - if (!put_devmap_managed_folio_refs(folio, refs)) |
| - folio_put_refs(folio, refs); |
| + folio_put_refs(folio, refs); |
| } |
| |
| /** |
| @@ -565,8 +563,7 @@ static struct folio *try_grab_folio_fast |
| */ |
| if (unlikely((flags & FOLL_LONGTERM) && |
| !folio_is_longterm_pinnable(folio))) { |
| - if (!put_devmap_managed_folio_refs(folio, refs)) |
| - folio_put_refs(folio, refs); |
| + folio_put_refs(folio, refs); |
| return NULL; |
| } |
| |
| --- a/mm/huge_memory.c~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/mm/huge_memory.c |
| @@ -2225,7 +2225,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, |
| tlb->fullmm); |
| arch_check_zapped_pmd(vma, orig_pmd); |
| tlb_remove_pmd_tlb_entry(tlb, pmd, addr); |
| - if (vma_is_special_huge(vma)) { |
| + if (!vma_is_dax(vma) && vma_is_special_huge(vma)) { |
| if (arch_needs_pgtable_deposit()) |
| zap_deposited_table(tlb->mm, pmd); |
| spin_unlock(ptl); |
| @@ -2882,13 +2882,15 @@ static void __split_huge_pmd_locked(stru |
| */ |
| if (arch_needs_pgtable_deposit()) |
| zap_deposited_table(mm, pmd); |
| - if (vma_is_special_huge(vma)) |
| + if (!vma_is_dax(vma) && vma_is_special_huge(vma)) |
| return; |
| if (unlikely(is_pmd_migration_entry(old_pmd))) { |
| swp_entry_t entry; |
| |
| entry = pmd_to_swp_entry(old_pmd); |
| folio = pfn_swap_entry_folio(entry); |
| + } else if (is_huge_zero_pmd(old_pmd)) { |
| + return; |
| } else { |
| page = pmd_page(old_pmd); |
| folio = page_folio(page); |
| --- a/mm/internal.h~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/mm/internal.h |
| @@ -737,8 +737,6 @@ static inline void prep_compound_tail(st |
| set_page_private(p, 0); |
| } |
| |
| -extern void prep_compound_page(struct page *page, unsigned int order); |
| - |
| void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); |
| extern bool free_pages_prepare(struct page *page, unsigned int order); |
| |
| --- a/mm/memory.c~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/mm/memory.c |
| @@ -3848,13 +3848,15 @@ static vm_fault_t do_wp_page(struct vm_f |
| if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { |
| /* |
| * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a |
| - * VM_PFNMAP VMA. |
| + * VM_PFNMAP VMA. FS DAX also wants ops->pfn_mkwrite called. |
| * |
| * We should not cow pages in a shared writeable mapping. |
| * Just mark the pages writable and/or call ops->pfn_mkwrite. |
| */ |
| - if (!vmf->page) |
| + if (!vmf->page || is_fsdax_page(vmf->page)) { |
| + vmf->page = NULL; |
| return wp_pfn_shared(vmf); |
| + } |
| return wp_page_shared(vmf, folio); |
| } |
| |
| --- a/mm/memory-failure.c~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/mm/memory-failure.c |
| @@ -419,18 +419,18 @@ static unsigned long dev_pagemap_mapping |
| pud = pud_offset(p4d, address); |
| if (!pud_present(*pud)) |
| return 0; |
| - if (pud_devmap(*pud)) |
| + if (pud_trans_huge(*pud)) |
| return PUD_SHIFT; |
| pmd = pmd_offset(pud, address); |
| if (!pmd_present(*pmd)) |
| return 0; |
| - if (pmd_devmap(*pmd)) |
| + if (pmd_trans_huge(*pmd)) |
| return PMD_SHIFT; |
| pte = pte_offset_map(pmd, address); |
| if (!pte) |
| return 0; |
| ptent = ptep_get(pte); |
| - if (pte_present(ptent) && pte_devmap(ptent)) |
| + if (pte_present(ptent)) |
| ret = PAGE_SHIFT; |
| pte_unmap(pte); |
| return ret; |
| --- a/mm/memremap.c~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/mm/memremap.c |
| @@ -458,8 +458,13 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap); |
| |
| void free_zone_device_folio(struct folio *folio) |
| { |
| - if (WARN_ON_ONCE(!folio->pgmap->ops || |
| - !folio->pgmap->ops->page_free)) |
| + struct dev_pagemap *pgmap = folio->pgmap; |
| + |
| + if (WARN_ON_ONCE(!pgmap->ops)) |
| + return; |
| + |
| + if (WARN_ON_ONCE(pgmap->type != MEMORY_DEVICE_FS_DAX && |
| + !pgmap->ops->page_free)) |
| return; |
| |
| mem_cgroup_uncharge(folio); |
| @@ -484,26 +489,36 @@ void free_zone_device_folio(struct folio |
| * For other types of ZONE_DEVICE pages, migration is either |
| * handled differently or not done at all, so there is no need |
| * to clear folio->mapping. |
| + * |
| + * FS DAX pages clear the mapping when the folio->share count hits |
| + * zero, which indicates the page has been removed from the file |
| + * system mapping. |
| */ |
| - folio->mapping = NULL; |
| - folio->pgmap->ops->page_free(folio_page(folio, 0)); |
| + if (pgmap->type != MEMORY_DEVICE_FS_DAX) |
| + folio->mapping = NULL; |
| |
| - switch (folio->pgmap->type) { |
| + switch (pgmap->type) { |
| case MEMORY_DEVICE_PRIVATE: |
| case MEMORY_DEVICE_COHERENT: |
| - put_dev_pagemap(folio->pgmap); |
| + pgmap->ops->page_free(folio_page(folio, 0)); |
| + put_dev_pagemap(pgmap); |
| break; |
| |
| - case MEMORY_DEVICE_FS_DAX: |
| case MEMORY_DEVICE_GENERIC: |
| /* |
| * Reset the refcount to 1 to prepare for handing out the page |
| * again. |
| */ |
| + pgmap->ops->page_free(folio_page(folio, 0)); |
| folio_set_count(folio, 1); |
| break; |
| |
| + case MEMORY_DEVICE_FS_DAX: |
| + wake_up_var(&folio->page); |
| + break; |
| + |
| case MEMORY_DEVICE_PCI_P2PDMA: |
| + pgmap->ops->page_free(folio_page(folio, 0)); |
| break; |
| } |
| } |
| @@ -519,21 +534,3 @@ void zone_device_page_init(struct page * |
| lock_page(page); |
| } |
| EXPORT_SYMBOL_GPL(zone_device_page_init); |
| - |
| -#ifdef CONFIG_FS_DAX |
| -bool __put_devmap_managed_folio_refs(struct folio *folio, int refs) |
| -{ |
| - if (folio->pgmap->type != MEMORY_DEVICE_FS_DAX) |
| - return false; |
| - |
| - /* |
| - * fsdax page refcounts are 1-based, rather than 0-based: if |
| - * refcount is 1, then the page is free and the refcount is |
| - * stable because nobody holds a reference on the page. |
| - */ |
| - if (folio_ref_sub_return(folio, refs) == 1) |
| - wake_up_var(&folio->_refcount); |
| - return true; |
| -} |
| -EXPORT_SYMBOL(__put_devmap_managed_folio_refs); |
| -#endif /* CONFIG_FS_DAX */ |
| --- a/mm/mm_init.c~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/mm/mm_init.c |
| @@ -1026,23 +1026,22 @@ static void __ref __init_zone_device_pag |
| } |
| |
| /* |
| - * ZONE_DEVICE pages other than MEMORY_TYPE_GENERIC and |
| - * MEMORY_TYPE_FS_DAX pages are released directly to the driver page |
| - * allocator which will set the page count to 1 when allocating the |
| - * page. |
| + * ZONE_DEVICE pages other than MEMORY_TYPE_GENERIC are released |
| + * directly to the driver page allocator which will set the page count |
| + * to 1 when allocating the page. |
| * |
| * MEMORY_TYPE_GENERIC and MEMORY_TYPE_FS_DAX pages automatically have |
| * their refcount reset to one whenever they are freed (ie. after |
| * their refcount drops to 0). |
| */ |
| switch (pgmap->type) { |
| + case MEMORY_DEVICE_FS_DAX: |
| case MEMORY_DEVICE_PRIVATE: |
| case MEMORY_DEVICE_COHERENT: |
| case MEMORY_DEVICE_PCI_P2PDMA: |
| set_page_count(page, 0); |
| break; |
| |
| - case MEMORY_DEVICE_FS_DAX: |
| case MEMORY_DEVICE_GENERIC: |
| break; |
| } |
| --- a/mm/swap.c~fs-dax-properly-refcount-fs-dax-pages |
| +++ a/mm/swap.c |
| @@ -956,8 +956,6 @@ void folios_put_refs(struct folio_batch |
| unlock_page_lruvec_irqrestore(lruvec, flags); |
| lruvec = NULL; |
| } |
| - if (put_devmap_managed_folio_refs(folio, nr_refs)) |
| - continue; |
| if (folio_ref_sub_and_test(folio, nr_refs)) |
| free_zone_device_folio(folio); |
| continue; |
| _ |