|  | // SPDX-License-Identifier: GPL-2.0-only | 
|  | /* | 
|  | *  mm/userfaultfd.c | 
|  | * | 
|  | *  Copyright (C) 2015  Red Hat, Inc. | 
|  | */ | 
|  |  | 
|  | #include <linux/mm.h> | 
|  | #include <linux/sched/signal.h> | 
|  | #include <linux/pagemap.h> | 
|  | #include <linux/rmap.h> | 
|  | #include <linux/swap.h> | 
|  | #include <linux/swapops.h> | 
|  | #include <linux/userfaultfd_k.h> | 
|  | #include <linux/mmu_notifier.h> | 
|  | #include <linux/hugetlb.h> | 
|  | #include <linux/shmem_fs.h> | 
|  | #include <asm/tlbflush.h> | 
|  | #include <asm/tlb.h> | 
|  | #include "internal.h" | 
|  | #include "swap.h" | 
|  |  | 
|  | static __always_inline | 
|  | bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end) | 
|  | { | 
|  | /* Make sure that the dst range is fully within dst_vma. */ | 
|  | if (dst_end > dst_vma->vm_end) | 
|  | return false; | 
|  |  | 
|  | /* | 
|  | * Check that the vma is registered with uffd; this is required to |
|  | * enforce the VM_MAYWRITE check done at uffd registration | 
|  | * time. | 
|  | */ | 
|  | if (!dst_vma->vm_userfaultfd_ctx.ctx) | 
|  | return false; | 
|  |  | 
|  | return true; | 
|  | } | 
|  |  | 
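|  | /* |
|  | * Look up the vma containing @addr (mmap_lock must be held) and, for |
|  | * private mappings, make sure an anon_vma is prepared. Returns the vma, |
|  | * ERR_PTR(-ENOENT) if no vma contains @addr, or ERR_PTR(-ENOMEM) if the |
|  | * anon_vma allocation failed. |
|  | */ |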
|  | static __always_inline | 
|  | struct vm_area_struct *find_vma_and_prepare_anon(struct mm_struct *mm, | 
|  | unsigned long addr) | 
|  | { | 
|  | struct vm_area_struct *vma; | 
|  |  | 
|  | mmap_assert_locked(mm); | 
|  | vma = vma_lookup(mm, addr); | 
|  | if (!vma) | 
|  | vma = ERR_PTR(-ENOENT); | 
|  | else if (!(vma->vm_flags & VM_SHARED) && | 
|  | unlikely(anon_vma_prepare(vma))) | 
|  | vma = ERR_PTR(-ENOMEM); | 
|  |  | 
|  | return vma; | 
|  | } | 
|  |  | 
|  | #ifdef CONFIG_PER_VMA_LOCK | 
|  | /* | 
|  | * uffd_lock_vma() - Lookup and lock vma corresponding to @address. | 
|  | * @mm: mm to search vma in. | 
|  | * @address: address that the vma should contain. | 
|  | * | 
|  | * Should be called without holding mmap_lock. | 
|  | * | 
|  | * Return: A locked vma containing @address, -ENOENT if no vma is found, |
|  | * -ENOMEM if anon_vma couldn't be allocated, or -EAGAIN if the vma could |
|  | * not be read-locked. |
|  | */ | 
|  | static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm, | 
|  | unsigned long address) | 
|  | { | 
|  | struct vm_area_struct *vma; | 
|  |  | 
|  | vma = lock_vma_under_rcu(mm, address); | 
|  | if (vma) { | 
|  | /* | 
|  | * We know we're going to need to use anon_vma, so check | 
|  | * that early. | 
|  | */ | 
|  | if (!(vma->vm_flags & VM_SHARED) && unlikely(!vma->anon_vma)) | 
|  | vma_end_read(vma); | 
|  | else | 
|  | return vma; | 
|  | } | 
|  |  | 
|  | mmap_read_lock(mm); | 
|  | vma = find_vma_and_prepare_anon(mm, address); | 
|  | if (!IS_ERR(vma)) { | 
|  | bool locked = vma_start_read_locked(vma); | 
|  |  | 
|  | if (!locked) | 
|  | vma = ERR_PTR(-EAGAIN); | 
|  | } | 
|  |  | 
|  | mmap_read_unlock(mm); | 
|  | return vma; | 
|  | } | 
|  |  | 
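|  | /* |
|  | * Lock and return the vma that will back an mfill operation: the vma |
|  | * containing dst_start, checked via validate_dst_vma() to be registered |
|  | * with uffd and to fully contain [dst_start, dst_start + len). Returns |
|  | * an ERR_PTR and drops the lock on failure. |
|  | */ |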
|  | static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm, | 
|  | unsigned long dst_start, | 
|  | unsigned long len) | 
|  | { | 
|  | struct vm_area_struct *dst_vma; | 
|  |  | 
|  | dst_vma = uffd_lock_vma(dst_mm, dst_start); | 
|  | if (IS_ERR(dst_vma) || validate_dst_vma(dst_vma, dst_start + len)) | 
|  | return dst_vma; | 
|  |  | 
|  | vma_end_read(dst_vma); | 
|  | return ERR_PTR(-ENOENT); | 
|  | } | 
|  |  | 
|  | static void uffd_mfill_unlock(struct vm_area_struct *vma) | 
|  | { | 
|  | vma_end_read(vma); | 
|  | } | 
|  |  | 
|  | #else | 
|  |  | 
|  | static struct vm_area_struct *uffd_mfill_lock(struct mm_struct *dst_mm, | 
|  | unsigned long dst_start, | 
|  | unsigned long len) | 
|  | { | 
|  | struct vm_area_struct *dst_vma; | 
|  |  | 
|  | mmap_read_lock(dst_mm); | 
|  | dst_vma = find_vma_and_prepare_anon(dst_mm, dst_start); | 
|  | if (IS_ERR(dst_vma)) | 
|  | goto out_unlock; | 
|  |  | 
|  | if (validate_dst_vma(dst_vma, dst_start + len)) | 
|  | return dst_vma; | 
|  |  | 
|  | dst_vma = ERR_PTR(-ENOENT); | 
|  | out_unlock: | 
|  | mmap_read_unlock(dst_mm); | 
|  | return dst_vma; | 
|  | } | 
|  |  | 
|  | static void uffd_mfill_unlock(struct vm_area_struct *vma) | 
|  | { | 
|  | mmap_read_unlock(vma->vm_mm); | 
|  | } | 
|  | #endif | 
|  |  | 
|  | /* Check if dst_addr is outside of file's size. Must be called with ptl held. */ | 
|  | static bool mfill_file_over_size(struct vm_area_struct *dst_vma, | 
|  | unsigned long dst_addr) | 
|  | { | 
|  | struct inode *inode; | 
|  | pgoff_t offset, max_off; | 
|  |  | 
|  | if (!dst_vma->vm_file) | 
|  | return false; | 
|  |  | 
|  | inode = dst_vma->vm_file->f_inode; | 
|  | offset = linear_page_index(dst_vma, dst_addr); | 
|  | max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); | 
|  | return offset >= max_off; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Install a PTE to map dst_addr (within dst_vma) to page. |
|  | * |
|  | * This function handles both MFILL_ATOMIC_COPY and MFILL_ATOMIC_CONTINUE |
|  | * for both shmem and anon, and for both shared and private VMAs. |
|  | */ | 
|  | int mfill_atomic_install_pte(pmd_t *dst_pmd, | 
|  | struct vm_area_struct *dst_vma, | 
|  | unsigned long dst_addr, struct page *page, | 
|  | bool newly_allocated, uffd_flags_t flags) | 
|  | { | 
|  | int ret; | 
|  | struct mm_struct *dst_mm = dst_vma->vm_mm; | 
|  | pte_t _dst_pte, *dst_pte; | 
|  | bool writable = dst_vma->vm_flags & VM_WRITE; | 
|  | bool vm_shared = dst_vma->vm_flags & VM_SHARED; | 
|  | spinlock_t *ptl; | 
|  | struct folio *folio = page_folio(page); | 
|  | bool page_in_cache = folio_mapping(folio); | 
|  |  | 
|  | _dst_pte = mk_pte(page, dst_vma->vm_page_prot); | 
|  | _dst_pte = pte_mkdirty(_dst_pte); | 
|  | if (page_in_cache && !vm_shared) | 
|  | writable = false; | 
|  | if (writable) | 
|  | _dst_pte = pte_mkwrite(_dst_pte, dst_vma); | 
|  | if (flags & MFILL_ATOMIC_WP) | 
|  | _dst_pte = pte_mkuffd_wp(_dst_pte); | 
|  |  | 
|  | ret = -EAGAIN; | 
|  | dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); | 
|  | if (!dst_pte) | 
|  | goto out; | 
|  |  | 
|  | if (mfill_file_over_size(dst_vma, dst_addr)) { | 
|  | ret = -EFAULT; | 
|  | goto out_unlock; | 
|  | } | 
|  |  | 
|  | ret = -EEXIST; | 
|  | /* | 
|  | * We allow overwriting a pte marker: consider the case where both MISSING|WP |
|  | * are registered, and we first wr-protect a none pte which has no page cache |
|  | * page backing it, then access the page. |
|  | */ | 
|  | if (!pte_none_mostly(ptep_get(dst_pte))) | 
|  | goto out_unlock; | 
|  |  | 
|  | if (page_in_cache) { | 
|  | /* Usually, cache pages are already added to LRU */ | 
|  | if (newly_allocated) | 
|  | folio_add_lru(folio); | 
|  | folio_add_file_rmap_pte(folio, page, dst_vma); | 
|  | } else { | 
|  | folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE); | 
|  | folio_add_lru_vma(folio, dst_vma); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Must happen after rmap, as mm_counter() checks mapping (via | 
|  | * PageAnon()), which is set when the folio is added to the anon rmap. |
|  | */ | 
|  | inc_mm_counter(dst_mm, mm_counter(folio)); | 
|  |  | 
|  | set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); | 
|  |  | 
|  | /* No need to invalidate - it was non-present before */ | 
|  | update_mmu_cache(dst_vma, dst_addr, dst_pte); | 
|  | ret = 0; | 
|  | out_unlock: | 
|  | pte_unmap_unlock(dst_pte, ptl); | 
|  | out: | 
|  | return ret; | 
|  | } | 
|  |  | 
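|  | /* |
|  | * Handle one page of UFFDIO_COPY for anonymous memory: allocate a folio, |
|  | * copy the source data into it with page faults disabled, and install the |
|  | * PTE. If the copy faults, the folio is handed back via *foliop and |
|  | * -ENOENT tells the caller to redo the copy after dropping the locks. |
|  | */ |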
|  | static int mfill_atomic_pte_copy(pmd_t *dst_pmd, | 
|  | struct vm_area_struct *dst_vma, | 
|  | unsigned long dst_addr, | 
|  | unsigned long src_addr, | 
|  | uffd_flags_t flags, | 
|  | struct folio **foliop) | 
|  | { | 
|  | void *kaddr; | 
|  | int ret; | 
|  | struct folio *folio; | 
|  |  | 
|  | if (!*foliop) { | 
|  | ret = -ENOMEM; | 
|  | folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma, | 
|  | dst_addr); | 
|  | if (!folio) | 
|  | goto out; | 
|  |  | 
|  | kaddr = kmap_local_folio(folio, 0); | 
|  | /* | 
|  | * The read mmap_lock is held here.  Despite the | 
|  | * mmap_lock being read recursive a deadlock is still | 
|  | * possible if a writer has taken a lock.  For example: | 
|  | * | 
|  | * process A thread 1 takes read lock on own mmap_lock | 
|  | * process A thread 2 calls mmap, blocks taking write lock | 
|  | * process B thread 1 takes page fault, read lock on own mmap lock | 
|  | * process B thread 2 calls mmap, blocks taking write lock | 
|  | * process A thread 1 blocks taking read lock on process B | 
|  | * process B thread 1 blocks taking read lock on process A | 
|  | * | 
|  | * Disable page faults to prevent potential deadlock | 
|  | * and retry the copy outside the mmap_lock. | 
|  | */ | 
|  | pagefault_disable(); | 
|  | ret = copy_from_user(kaddr, (const void __user *) src_addr, | 
|  | PAGE_SIZE); | 
|  | pagefault_enable(); | 
|  | kunmap_local(kaddr); | 
|  |  | 
|  | /* fallback to copy_from_user outside mmap_lock */ | 
|  | if (unlikely(ret)) { | 
|  | ret = -ENOENT; | 
|  | *foliop = folio; | 
|  | /* don't free the page */ | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | flush_dcache_folio(folio); | 
|  | } else { | 
|  | folio = *foliop; | 
|  | *foliop = NULL; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * The memory barrier inside __folio_mark_uptodate makes sure that | 
|  | * preceding stores to the page contents become visible before | 
|  | * the set_pte_at() write. | 
|  | */ | 
|  | __folio_mark_uptodate(folio); | 
|  |  | 
|  | ret = -ENOMEM; | 
|  | if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL)) | 
|  | goto out_release; | 
|  |  | 
|  | ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, | 
|  | &folio->page, true, flags); | 
|  | if (ret) | 
|  | goto out_release; | 
|  | out: | 
|  | return ret; | 
|  | out_release: | 
|  | folio_put(folio); | 
|  | goto out; | 
|  | } | 
|  |  | 
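|  | /* |
|  | * UFFDIO_ZEROPAGE fallback for when the shared zeropage cannot be used |
|  | * (mm_forbids_zeropage()): install a freshly zeroed anonymous folio. |
|  | */ |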
|  | static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd, | 
|  | struct vm_area_struct *dst_vma, | 
|  | unsigned long dst_addr) | 
|  | { | 
|  | struct folio *folio; | 
|  | int ret = -ENOMEM; | 
|  |  | 
|  | folio = vma_alloc_zeroed_movable_folio(dst_vma, dst_addr); | 
|  | if (!folio) | 
|  | return ret; | 
|  |  | 
|  | if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL)) | 
|  | goto out_put; | 
|  |  | 
|  | /* | 
|  | * The memory barrier inside __folio_mark_uptodate makes sure that | 
|  | * zeroing out the folio becomes visible before mapping the page |
|  | * using set_pte_at(). See do_anonymous_page(). | 
|  | */ | 
|  | __folio_mark_uptodate(folio); | 
|  |  | 
|  | ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, | 
|  | &folio->page, true, 0); | 
|  | if (ret) | 
|  | goto out_put; | 
|  |  | 
|  | return 0; | 
|  | out_put: | 
|  | folio_put(folio); | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, | 
|  | struct vm_area_struct *dst_vma, | 
|  | unsigned long dst_addr) | 
|  | { | 
|  | pte_t _dst_pte, *dst_pte; | 
|  | spinlock_t *ptl; | 
|  | int ret; | 
|  |  | 
|  | if (mm_forbids_zeropage(dst_vma->vm_mm)) | 
|  | return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr); | 
|  |  | 
|  | _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), | 
|  | dst_vma->vm_page_prot)); | 
|  | ret = -EAGAIN; | 
|  | dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl); | 
|  | if (!dst_pte) | 
|  | goto out; | 
|  | if (mfill_file_over_size(dst_vma, dst_addr)) { | 
|  | ret = -EFAULT; | 
|  | goto out_unlock; | 
|  | } | 
|  | ret = -EEXIST; | 
|  | if (!pte_none(ptep_get(dst_pte))) | 
|  | goto out_unlock; | 
|  | set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte); | 
|  | /* No need to invalidate - it was non-present before */ | 
|  | update_mmu_cache(dst_vma, dst_addr, dst_pte); | 
|  | ret = 0; | 
|  | out_unlock: | 
|  | pte_unmap_unlock(dst_pte, ptl); | 
|  | out: | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | /* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */ | 
|  | static int mfill_atomic_pte_continue(pmd_t *dst_pmd, | 
|  | struct vm_area_struct *dst_vma, | 
|  | unsigned long dst_addr, | 
|  | uffd_flags_t flags) | 
|  | { | 
|  | struct inode *inode = file_inode(dst_vma->vm_file); | 
|  | pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); | 
|  | struct folio *folio; | 
|  | struct page *page; | 
|  | int ret; | 
|  |  | 
|  | ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC); | 
|  | /* Our caller expects us to return -EFAULT if we failed to find folio */ | 
|  | if (ret == -ENOENT) | 
|  | ret = -EFAULT; | 
|  | if (ret) | 
|  | goto out; | 
|  | if (!folio) { | 
|  | ret = -EFAULT; | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | page = folio_file_page(folio, pgoff); | 
|  | if (PageHWPoison(page)) { | 
|  | ret = -EIO; | 
|  | goto out_release; | 
|  | } | 
|  |  | 
|  | ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, | 
|  | page, false, flags); | 
|  | if (ret) | 
|  | goto out_release; | 
|  |  | 
|  | folio_unlock(folio); | 
|  | ret = 0; | 
|  | out: | 
|  | return ret; | 
|  | out_release: | 
|  | folio_unlock(folio); | 
|  | folio_put(folio); | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | /* Handles UFFDIO_POISON for all non-hugetlb VMAs. */ | 
|  | static int mfill_atomic_pte_poison(pmd_t *dst_pmd, | 
|  | struct vm_area_struct *dst_vma, | 
|  | unsigned long dst_addr, | 
|  | uffd_flags_t flags) | 
|  | { | 
|  | int ret; | 
|  | struct mm_struct *dst_mm = dst_vma->vm_mm; | 
|  | pte_t _dst_pte, *dst_pte; | 
|  | spinlock_t *ptl; | 
|  |  | 
|  | _dst_pte = make_pte_marker(PTE_MARKER_POISONED); | 
|  | ret = -EAGAIN; | 
|  | dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); | 
|  | if (!dst_pte) | 
|  | goto out; | 
|  |  | 
|  | if (mfill_file_over_size(dst_vma, dst_addr)) { | 
|  | ret = -EFAULT; | 
|  | goto out_unlock; | 
|  | } | 
|  |  | 
|  | ret = -EEXIST; | 
|  | /* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */ | 
|  | if (!pte_none(ptep_get(dst_pte))) | 
|  | goto out_unlock; | 
|  |  | 
|  | set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); | 
|  |  | 
|  | /* No need to invalidate - it was non-present before */ | 
|  | update_mmu_cache(dst_vma, dst_addr, dst_pte); | 
|  | ret = 0; | 
|  | out_unlock: | 
|  | pte_unmap_unlock(dst_pte, ptl); | 
|  | out: | 
|  | return ret; | 
|  | } | 
|  |  | 
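|  | /* |
|  | * Walk, allocating if necessary, the page table levels down to the pmd |
|  | * for @address. Returns NULL if a table allocation fails. |
|  | */ |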
|  | static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) | 
|  | { | 
|  | pgd_t *pgd; | 
|  | p4d_t *p4d; | 
|  | pud_t *pud; | 
|  |  | 
|  | pgd = pgd_offset(mm, address); | 
|  | p4d = p4d_alloc(mm, pgd, address); | 
|  | if (!p4d) | 
|  | return NULL; | 
|  | pud = pud_alloc(mm, p4d, address); | 
|  | if (!pud) | 
|  | return NULL; | 
|  | /* | 
|  | * Note that running this does not mean the pmd was |
|  | * missing: the *pmd may already be established and in |
|  | * turn it may also be a trans_huge_pmd. |
|  | */ | 
|  | return pmd_alloc(mm, pud, address); | 
|  | } | 
|  |  | 
|  | #ifdef CONFIG_HUGETLB_PAGE | 
|  | /* | 
|  | * mfill_atomic processing for HUGETLB vmas.  Note that this routine is | 
|  | * called with either the vma lock or mmap_lock held; it will release |
|  | * the lock before returning. |
|  | */ | 
|  | static __always_inline ssize_t mfill_atomic_hugetlb( | 
|  | struct userfaultfd_ctx *ctx, | 
|  | struct vm_area_struct *dst_vma, | 
|  | unsigned long dst_start, | 
|  | unsigned long src_start, | 
|  | unsigned long len, | 
|  | uffd_flags_t flags) | 
|  | { | 
|  | struct mm_struct *dst_mm = dst_vma->vm_mm; | 
|  | ssize_t err; | 
|  | pte_t *dst_pte; | 
|  | unsigned long src_addr, dst_addr; | 
|  | long copied; | 
|  | struct folio *folio; | 
|  | unsigned long vma_hpagesize; | 
|  | pgoff_t idx; | 
|  | u32 hash; | 
|  | struct address_space *mapping; | 
|  |  | 
|  | /* | 
|  | * There is no default zero huge page for all huge page sizes as | 
|  | * supported by hugetlb.  A PMD_SIZE zero huge page may exist, but only as |
|  | * used by THP.  Since we cannot reliably insert a zero page, this |
|  | * feature is not supported. | 
|  | */ | 
|  | if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { | 
|  | up_read(&ctx->map_changing_lock); | 
|  | uffd_mfill_unlock(dst_vma); | 
|  | return -EINVAL; | 
|  | } | 
|  |  | 
|  | src_addr = src_start; | 
|  | dst_addr = dst_start; | 
|  | copied = 0; | 
|  | folio = NULL; | 
|  | vma_hpagesize = vma_kernel_pagesize(dst_vma); | 
|  |  | 
|  | /* | 
|  | * Validate alignment based on huge page size | 
|  | */ | 
|  | err = -EINVAL; | 
|  | if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1)) | 
|  | goto out_unlock; | 
|  |  | 
|  | retry: | 
|  | /* | 
|  | * On routine entry dst_vma is set.  If we had to drop the lock and |
|  | * retry, dst_vma will be set to NULL and we must look it up again. |
|  | */ | 
|  | if (!dst_vma) { | 
|  | dst_vma = uffd_mfill_lock(dst_mm, dst_start, len); | 
|  | if (IS_ERR(dst_vma)) { | 
|  | err = PTR_ERR(dst_vma); | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | err = -ENOENT; | 
|  | if (!is_vm_hugetlb_page(dst_vma)) | 
|  | goto out_unlock_vma; | 
|  |  | 
|  | err = -EINVAL; | 
|  | if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) | 
|  | goto out_unlock_vma; | 
|  |  | 
|  | /* | 
|  | * If memory mappings are changing because of non-cooperative | 
|  | * operation (e.g. mremap) running in parallel, bail out and | 
|  | * request the user to retry later | 
|  | */ | 
|  | down_read(&ctx->map_changing_lock); | 
|  | err = -EAGAIN; | 
|  | if (atomic_read(&ctx->mmap_changing)) | 
|  | goto out_unlock; | 
|  | } | 
|  |  | 
|  | while (src_addr < src_start + len) { | 
|  | VM_WARN_ON_ONCE(dst_addr >= dst_start + len); | 
|  |  | 
|  | /* | 
|  | * Serialize via vma_lock and hugetlb_fault_mutex. | 
|  | * vma_lock ensures the dst_pte remains valid even | 
|  | * in the case of shared pmds.  The fault mutex prevents |
|  | * races with other faulting threads. | 
|  | */ | 
|  | idx = linear_page_index(dst_vma, dst_addr); | 
|  | mapping = dst_vma->vm_file->f_mapping; | 
|  | hash = hugetlb_fault_mutex_hash(mapping, idx); | 
|  | mutex_lock(&hugetlb_fault_mutex_table[hash]); | 
|  | hugetlb_vma_lock_read(dst_vma); | 
|  |  | 
|  | err = -ENOMEM; | 
|  | dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize); | 
|  | if (!dst_pte) { | 
|  | hugetlb_vma_unlock_read(dst_vma); | 
|  | mutex_unlock(&hugetlb_fault_mutex_table[hash]); | 
|  | goto out_unlock; | 
|  | } | 
|  |  | 
|  | if (!uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) && | 
|  | !huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte))) { | 
|  | err = -EEXIST; | 
|  | hugetlb_vma_unlock_read(dst_vma); | 
|  | mutex_unlock(&hugetlb_fault_mutex_table[hash]); | 
|  | goto out_unlock; | 
|  | } | 
|  |  | 
|  | err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr, | 
|  | src_addr, flags, &folio); | 
|  |  | 
|  | hugetlb_vma_unlock_read(dst_vma); | 
|  | mutex_unlock(&hugetlb_fault_mutex_table[hash]); | 
|  |  | 
|  | cond_resched(); | 
|  |  | 
|  | if (unlikely(err == -ENOENT)) { | 
|  | up_read(&ctx->map_changing_lock); | 
|  | uffd_mfill_unlock(dst_vma); | 
|  | VM_WARN_ON_ONCE(!folio); | 
|  |  | 
|  | err = copy_folio_from_user(folio, | 
|  | (const void __user *)src_addr, true); | 
|  | if (unlikely(err)) { | 
|  | err = -EFAULT; | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | dst_vma = NULL; | 
|  | goto retry; | 
|  | } else | 
|  | VM_WARN_ON_ONCE(folio); | 
|  |  | 
|  | if (!err) { | 
|  | dst_addr += vma_hpagesize; | 
|  | src_addr += vma_hpagesize; | 
|  | copied += vma_hpagesize; | 
|  |  | 
|  | if (fatal_signal_pending(current)) | 
|  | err = -EINTR; | 
|  | } | 
|  | if (err) | 
|  | break; | 
|  | } | 
|  |  | 
|  | out_unlock: | 
|  | up_read(&ctx->map_changing_lock); | 
|  | out_unlock_vma: | 
|  | uffd_mfill_unlock(dst_vma); | 
|  | out: | 
|  | if (folio) | 
|  | folio_put(folio); | 
|  | VM_WARN_ON_ONCE(copied < 0); | 
|  | VM_WARN_ON_ONCE(err > 0); | 
|  | VM_WARN_ON_ONCE(!copied && !err); | 
|  | return copied ? copied : err; | 
|  | } | 
|  | #else /* !CONFIG_HUGETLB_PAGE */ | 
|  | /* fail at build time if gcc attempts to use this */ | 
|  | extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx, | 
|  | struct vm_area_struct *dst_vma, | 
|  | unsigned long dst_start, | 
|  | unsigned long src_start, | 
|  | unsigned long len, | 
|  | uffd_flags_t flags); | 
|  | #endif /* CONFIG_HUGETLB_PAGE */ | 
|  |  | 
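|  | /* |
|  | * Fill a single pte: dispatch based on the uffd_flags mode and on whether |
|  | * dst_vma is shared (shmem) or private. |
|  | */ |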
|  | static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd, | 
|  | struct vm_area_struct *dst_vma, | 
|  | unsigned long dst_addr, | 
|  | unsigned long src_addr, | 
|  | uffd_flags_t flags, | 
|  | struct folio **foliop) | 
|  | { | 
|  | ssize_t err; | 
|  |  | 
|  | if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) { | 
|  | return mfill_atomic_pte_continue(dst_pmd, dst_vma, | 
|  | dst_addr, flags); | 
|  | } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { | 
|  | return mfill_atomic_pte_poison(dst_pmd, dst_vma, | 
|  | dst_addr, flags); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * The normal page fault path for a shmem mapping will invoke the |
|  | * fault handler, fill the hole in the file and COW it right away. The |
|  | * result generates plain anonymous memory. So when we are |
|  | * asked to fill a hole in a MAP_PRIVATE shmem mapping, we'll |
|  | * generate anonymous memory directly without actually filling |
|  | * the hole. For the MAP_PRIVATE case the robustness check |
|  | * only happens in the pagetable (to verify it's still none) |
|  | * and not in the page cache. |
|  | */ | 
|  | if (!(dst_vma->vm_flags & VM_SHARED)) { | 
|  | if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) | 
|  | err = mfill_atomic_pte_copy(dst_pmd, dst_vma, | 
|  | dst_addr, src_addr, | 
|  | flags, foliop); | 
|  | else | 
|  | err = mfill_atomic_pte_zeropage(dst_pmd, | 
|  | dst_vma, dst_addr); | 
|  | } else { | 
|  | err = shmem_mfill_atomic_pte(dst_pmd, dst_vma, | 
|  | dst_addr, src_addr, | 
|  | flags, foliop); | 
|  | } | 
|  |  | 
|  | return err; | 
|  | } | 
|  |  | 
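|  | /* |
|  | * Common implementation of UFFDIO_COPY/ZEROPAGE/CONTINUE/POISON: lock and |
|  | * validate the destination range, then fill it one page at a time, |
|  | * retrying the userspace copy outside the locks when needed. Returns the |
|  | * number of bytes filled, or a negative error if nothing was filled. |
|  | */ |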
|  | static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, | 
|  | unsigned long dst_start, | 
|  | unsigned long src_start, | 
|  | unsigned long len, | 
|  | uffd_flags_t flags) | 
|  | { | 
|  | struct mm_struct *dst_mm = ctx->mm; | 
|  | struct vm_area_struct *dst_vma; | 
|  | ssize_t err; | 
|  | pmd_t *dst_pmd; | 
|  | unsigned long src_addr, dst_addr; | 
|  | long copied; | 
|  | struct folio *folio; | 
|  |  | 
|  | /* | 
|  | * Sanitize the command parameters: | 
|  | */ | 
|  | VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK); | 
|  | VM_WARN_ON_ONCE(len & ~PAGE_MASK); | 
|  |  | 
|  | /* Does the address range wrap, or is the span zero-sized? */ | 
|  | VM_WARN_ON_ONCE(src_start + len <= src_start); | 
|  | VM_WARN_ON_ONCE(dst_start + len <= dst_start); | 
|  |  | 
|  | src_addr = src_start; | 
|  | dst_addr = dst_start; | 
|  | copied = 0; | 
|  | folio = NULL; | 
|  | retry: | 
|  | /* | 
|  | * Make sure the vma is not shared, that the dst range is | 
|  | * both valid and fully within a single existing vma. | 
|  | */ | 
|  | dst_vma = uffd_mfill_lock(dst_mm, dst_start, len); | 
|  | if (IS_ERR(dst_vma)) { | 
|  | err = PTR_ERR(dst_vma); | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * If memory mappings are changing because of non-cooperative | 
|  | * operation (e.g. mremap) running in parallel, bail out and | 
|  | * request the user to retry later | 
|  | */ | 
|  | down_read(&ctx->map_changing_lock); | 
|  | err = -EAGAIN; | 
|  | if (atomic_read(&ctx->mmap_changing)) | 
|  | goto out_unlock; | 
|  |  | 
|  | err = -EINVAL; | 
|  | /* | 
|  | * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but | 
|  | * it will overwrite vm_ops, so vma_is_anonymous must return false. | 
|  | */ | 
|  | if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && | 
|  | dst_vma->vm_flags & VM_SHARED)) | 
|  | goto out_unlock; | 
|  |  | 
|  | /* | 
|  | * validate 'mode' now that we know the dst_vma: don't allow | 
|  | * a wrprotect copy if the userfaultfd didn't register as WP. | 
|  | */ | 
|  | if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP)) | 
|  | goto out_unlock; | 
|  |  | 
|  | /* | 
|  | * If this is a HUGETLB vma, pass off to the appropriate routine. |
|  | */ |
|  | if (is_vm_hugetlb_page(dst_vma)) |
|  | return mfill_atomic_hugetlb(ctx, dst_vma, dst_start, |
|  | src_start, len, flags); | 
|  |  | 
|  | if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) | 
|  | goto out_unlock; | 
|  | if (!vma_is_shmem(dst_vma) && | 
|  | uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) | 
|  | goto out_unlock; | 
|  |  | 
|  | while (src_addr < src_start + len) { | 
|  | pmd_t dst_pmdval; | 
|  |  | 
|  | VM_WARN_ON_ONCE(dst_addr >= dst_start + len); | 
|  |  | 
|  | dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); | 
|  | if (unlikely(!dst_pmd)) { | 
|  | err = -ENOMEM; | 
|  | break; | 
|  | } | 
|  |  | 
|  | dst_pmdval = pmdp_get_lockless(dst_pmd); | 
|  | if (unlikely(pmd_none(dst_pmdval)) && | 
|  | unlikely(__pte_alloc(dst_mm, dst_pmd))) { | 
|  | err = -ENOMEM; | 
|  | break; | 
|  | } | 
|  | dst_pmdval = pmdp_get_lockless(dst_pmd); | 
|  | /* | 
|  | * If the dst_pmd is THP don't override it and just be strict. | 
|  | * (This includes the case where the PMD used to be THP and | 
|  | * changed back to none after __pte_alloc().) | 
|  | */ | 
|  | if (unlikely(!pmd_present(dst_pmdval) || | 
|  | pmd_trans_huge(dst_pmdval))) { | 
|  | err = -EEXIST; | 
|  | break; | 
|  | } | 
|  | if (unlikely(pmd_bad(dst_pmdval))) { | 
|  | err = -EFAULT; | 
|  | break; | 
|  | } | 
|  | /* | 
|  | * For shmem mappings, khugepaged is allowed to remove page | 
|  | * tables under us; pte_offset_map_lock() will deal with that. | 
|  | */ | 
|  |  | 
|  | err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, | 
|  | src_addr, flags, &folio); | 
|  | cond_resched(); | 
|  |  | 
|  | if (unlikely(err == -ENOENT)) { | 
|  | void *kaddr; | 
|  |  | 
|  | up_read(&ctx->map_changing_lock); | 
|  | uffd_mfill_unlock(dst_vma); | 
|  | VM_WARN_ON_ONCE(!folio); | 
|  |  | 
|  | kaddr = kmap_local_folio(folio, 0); | 
|  | err = copy_from_user(kaddr, | 
|  | (const void __user *) src_addr, | 
|  | PAGE_SIZE); | 
|  | kunmap_local(kaddr); | 
|  | if (unlikely(err)) { | 
|  | err = -EFAULT; | 
|  | goto out; | 
|  | } | 
|  | flush_dcache_folio(folio); | 
|  | goto retry; | 
|  | } else | 
|  | VM_WARN_ON_ONCE(folio); | 
|  |  | 
|  | if (!err) { | 
|  | dst_addr += PAGE_SIZE; | 
|  | src_addr += PAGE_SIZE; | 
|  | copied += PAGE_SIZE; | 
|  |  | 
|  | if (fatal_signal_pending(current)) | 
|  | err = -EINTR; | 
|  | } | 
|  | if (err) | 
|  | break; | 
|  | } | 
|  |  | 
|  | out_unlock: | 
|  | up_read(&ctx->map_changing_lock); | 
|  | uffd_mfill_unlock(dst_vma); | 
|  | out: | 
|  | if (folio) | 
|  | folio_put(folio); | 
|  | VM_WARN_ON_ONCE(copied < 0); | 
|  | VM_WARN_ON_ONCE(err > 0); | 
|  | VM_WARN_ON_ONCE(!copied && !err); | 
|  | return copied ? copied : err; | 
|  | } | 
|  |  | 
|  | ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start, | 
|  | unsigned long src_start, unsigned long len, | 
|  | uffd_flags_t flags) | 
|  | { | 
|  | return mfill_atomic(ctx, dst_start, src_start, len, | 
|  | uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY)); | 
|  | } | 
|  |  | 
|  | ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx, | 
|  | unsigned long start, | 
|  | unsigned long len) | 
|  | { | 
|  | return mfill_atomic(ctx, start, 0, len, | 
|  | uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE)); | 
|  | } | 
|  |  | 
|  | ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start, | 
|  | unsigned long len, uffd_flags_t flags) | 
|  | { | 
|  |  | 
|  | /* | 
|  | * A caller might reasonably assume that UFFDIO_CONTINUE contains an | 
|  | * smp_wmb() to ensure that any writes to the about-to-be-mapped page by | 
|  | * the thread doing the UFFDIO_CONTINUE are guaranteed to be visible to | 
|  | * subsequent loads from the page through the newly mapped address range. | 
|  | */ | 
|  | smp_wmb(); | 
|  |  | 
|  | return mfill_atomic(ctx, start, 0, len, | 
|  | uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE)); | 
|  | } | 
|  |  | 
|  | ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start, | 
|  | unsigned long len, uffd_flags_t flags) | 
|  | { | 
|  | return mfill_atomic(ctx, start, 0, len, | 
|  | uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON)); | 
|  | } | 
|  |  | 
|  | long uffd_wp_range(struct vm_area_struct *dst_vma, | 
|  | unsigned long start, unsigned long len, bool enable_wp) | 
|  | { | 
|  | unsigned int mm_cp_flags; | 
|  | struct mmu_gather tlb; | 
|  | long ret; | 
|  |  | 
|  | VM_WARN_ONCE(start < dst_vma->vm_start || start + len > dst_vma->vm_end, | 
|  | "The address range exceeds VMA boundary.\n"); | 
|  | if (enable_wp) | 
|  | mm_cp_flags = MM_CP_UFFD_WP; | 
|  | else | 
|  | mm_cp_flags = MM_CP_UFFD_WP_RESOLVE; | 
|  |  | 
|  | /* | 
|  | * vma->vm_page_prot already reflects that uffd-wp is enabled for this | 
|  | * VMA (see userfaultfd_set_vm_flags()) and that all PTEs are supposed | 
|  | * to be write-protected as default whenever protection changes. | 
|  | * Try upgrading write permissions manually. | 
|  | */ | 
|  | if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma)) | 
|  | mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; | 
|  | tlb_gather_mmu(&tlb, dst_vma->vm_mm); | 
|  | ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags); | 
|  | tlb_finish_mmu(&tlb); | 
|  |  | 
|  | return ret; | 
|  | } | 
|  |  | 
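|  | /* |
|  | * Handle UFFDIO_WRITEPROTECT: walk every vma in [start, start + len) and |
|  | * apply or clear uffd-wp protection via uffd_wp_range(). Bails out with |
|  | * -EAGAIN if the mappings are changing due to a non-cooperative event. |
|  | */ |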
|  | int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start, | 
|  | unsigned long len, bool enable_wp) | 
|  | { | 
|  | struct mm_struct *dst_mm = ctx->mm; | 
|  | unsigned long end = start + len; | 
|  | unsigned long _start, _end; | 
|  | struct vm_area_struct *dst_vma; | 
|  | unsigned long page_mask; | 
|  | long err; | 
|  | VMA_ITERATOR(vmi, dst_mm, start); | 
|  |  | 
|  | /* | 
|  | * Sanitize the command parameters: | 
|  | */ | 
|  | VM_WARN_ON_ONCE(start & ~PAGE_MASK); | 
|  | VM_WARN_ON_ONCE(len & ~PAGE_MASK); | 
|  |  | 
|  | /* Does the address range wrap, or is the span zero-sized? */ | 
|  | VM_WARN_ON_ONCE(start + len <= start); | 
|  |  | 
|  | mmap_read_lock(dst_mm); | 
|  |  | 
|  | /* | 
|  | * If memory mappings are changing because of non-cooperative | 
|  | * operation (e.g. mremap) running in parallel, bail out and | 
|  | * request the user to retry later | 
|  | */ | 
|  | down_read(&ctx->map_changing_lock); | 
|  | err = -EAGAIN; | 
|  | if (atomic_read(&ctx->mmap_changing)) | 
|  | goto out_unlock; | 
|  |  | 
|  | err = -ENOENT; | 
|  | for_each_vma_range(vmi, dst_vma, end) { | 
|  |  | 
|  | if (!userfaultfd_wp(dst_vma)) { | 
|  | err = -ENOENT; | 
|  | break; | 
|  | } | 
|  |  | 
|  | if (is_vm_hugetlb_page(dst_vma)) { | 
|  | err = -EINVAL; | 
|  | page_mask = vma_kernel_pagesize(dst_vma) - 1; | 
|  | if ((start & page_mask) || (len & page_mask)) | 
|  | break; | 
|  | } | 
|  |  | 
|  | _start = max(dst_vma->vm_start, start); | 
|  | _end = min(dst_vma->vm_end, end); | 
|  |  | 
|  | err = uffd_wp_range(dst_vma, _start, _end - _start, enable_wp); | 
|  |  | 
|  | /* Return 0 on success, <0 on failures */ | 
|  | if (err < 0) | 
|  | break; | 
|  | err = 0; | 
|  | } | 
|  | out_unlock: | 
|  | up_read(&ctx->map_changing_lock); | 
|  | mmap_read_unlock(dst_mm); | 
|  | return err; | 
|  | } | 
|  |  | 
|  |  | 
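|  | /* |
|  | * Take two page table locks in a fixed order so that concurrent movers |
|  | * cannot deadlock; also handles both PTEs sharing the same lock. |
|  | */ |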
|  | void double_pt_lock(spinlock_t *ptl1, | 
|  | spinlock_t *ptl2) | 
|  | __acquires(ptl1) | 
|  | __acquires(ptl2) | 
|  | { | 
|  | if (ptl1 > ptl2) | 
|  | swap(ptl1, ptl2); | 
|  | /* lock in virtual address order to avoid lock inversion */ | 
|  | spin_lock(ptl1); | 
|  | if (ptl1 != ptl2) | 
|  | spin_lock_nested(ptl2, SINGLE_DEPTH_NESTING); | 
|  | else | 
|  | __acquire(ptl2); | 
|  | } | 
|  |  | 
|  | void double_pt_unlock(spinlock_t *ptl1, | 
|  | spinlock_t *ptl2) | 
|  | __releases(ptl1) | 
|  | __releases(ptl2) | 
|  | { | 
|  | spin_unlock(ptl1); | 
|  | if (ptl1 != ptl2) | 
|  | spin_unlock(ptl2); | 
|  | else | 
|  | __release(ptl2); | 
|  | } | 
|  |  | 
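|  | /* |
|  | * Recheck, after taking the page table locks, that neither the source nor |
|  | * the destination pte (nor the destination pmd) changed since they were |
|  | * first read without the locks held. |
|  | */ |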
|  | static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte, | 
|  | pte_t orig_dst_pte, pte_t orig_src_pte, | 
|  | pmd_t *dst_pmd, pmd_t dst_pmdval) | 
|  | { | 
|  | return pte_same(ptep_get(src_pte), orig_src_pte) && | 
|  | pte_same(ptep_get(dst_pte), orig_dst_pte) && | 
|  | pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd)); | 
|  | } | 
|  |  | 
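|  | /* |
|  | * Move a present anonymous pte and its folio from src to dst: recheck |
|  | * stability under both page table locks, refuse large, pinned or |
|  | * non-exclusive folios, rewire the folio's anon rmap and index to |
|  | * dst_vma, and install a writable pte at dst_addr. |
|  | */ |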
|  | static int move_present_pte(struct mm_struct *mm, | 
|  | struct vm_area_struct *dst_vma, | 
|  | struct vm_area_struct *src_vma, | 
|  | unsigned long dst_addr, unsigned long src_addr, | 
|  | pte_t *dst_pte, pte_t *src_pte, | 
|  | pte_t orig_dst_pte, pte_t orig_src_pte, | 
|  | pmd_t *dst_pmd, pmd_t dst_pmdval, | 
|  | spinlock_t *dst_ptl, spinlock_t *src_ptl, | 
|  | struct folio *src_folio) | 
|  | { | 
|  | int err = 0; | 
|  |  | 
|  | double_pt_lock(dst_ptl, src_ptl); | 
|  |  | 
|  | if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, | 
|  | dst_pmd, dst_pmdval)) { | 
|  | err = -EAGAIN; | 
|  | goto out; | 
|  | } | 
|  | if (folio_test_large(src_folio) || | 
|  | folio_maybe_dma_pinned(src_folio) || | 
|  | !PageAnonExclusive(&src_folio->page)) { | 
|  | err = -EBUSY; | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte); | 
|  | /* Folio got pinned from under us. Put it back and fail the move. */ | 
|  | if (folio_maybe_dma_pinned(src_folio)) { | 
|  | set_pte_at(mm, src_addr, src_pte, orig_src_pte); | 
|  | err = -EBUSY; | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | folio_move_anon_rmap(src_folio, dst_vma); | 
|  | src_folio->index = linear_page_index(dst_vma, dst_addr); | 
|  |  | 
|  | orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot); | 
|  | /* Set soft dirty bit so userspace can notice the pte was moved */ | 
|  | #ifdef CONFIG_MEM_SOFT_DIRTY | 
|  | orig_dst_pte = pte_mksoft_dirty(orig_dst_pte); | 
|  | #endif | 
|  | if (pte_dirty(orig_src_pte)) | 
|  | orig_dst_pte = pte_mkdirty(orig_dst_pte); | 
|  | orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma); | 
|  |  | 
|  | set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte); | 
|  | out: | 
|  | double_pt_unlock(dst_ptl, src_ptl); | 
|  | return err; | 
|  | } | 
|  |  | 
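|  | /* |
|  | * Move a swap pte from src to dst. If the entry has a swapcache folio, |
|  | * the caller passes it in locked and its anon rmap and index are moved to |
|  | * dst_vma; otherwise fail with -EAGAIN if a swapcache folio appeared |
|  | * after the pte was sampled. |
|  | */ |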
|  | static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, | 
|  | unsigned long dst_addr, unsigned long src_addr, | 
|  | pte_t *dst_pte, pte_t *src_pte, | 
|  | pte_t orig_dst_pte, pte_t orig_src_pte, | 
|  | pmd_t *dst_pmd, pmd_t dst_pmdval, | 
|  | spinlock_t *dst_ptl, spinlock_t *src_ptl, | 
|  | struct folio *src_folio, | 
|  | struct swap_info_struct *si, swp_entry_t entry) | 
|  | { | 
|  | /* | 
|  | * Check if the folio still belongs to the target swap entry after | 
|  | * acquiring the lock. Folio can be freed in the swap cache while | 
|  | * not locked. | 
|  | */ | 
|  | if (src_folio && unlikely(!folio_test_swapcache(src_folio) || | 
|  | entry.val != src_folio->swap.val)) | 
|  | return -EAGAIN; | 
|  |  | 
|  | double_pt_lock(dst_ptl, src_ptl); | 
|  |  | 
|  | if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, | 
|  | dst_pmd, dst_pmdval)) { | 
|  | double_pt_unlock(dst_ptl, src_ptl); | 
|  | return -EAGAIN; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * The src_folio resides in the swapcache, requiring an update to its | 
|  | * index and mapping to align with the dst_vma, where a swap-in may | 
|  | * occur and hit the swapcache after moving the PTE. | 
|  | */ | 
|  | if (src_folio) { | 
|  | folio_move_anon_rmap(src_folio, dst_vma); | 
|  | src_folio->index = linear_page_index(dst_vma, dst_addr); | 
|  | } else { | 
|  | /* | 
|  | * Check if the swap entry is cached after acquiring the src_pte | 
|  | * lock. Otherwise, we might miss a newly loaded swap cache folio. | 
|  | * | 
|  | * Check swap_map directly to minimize overhead, READ_ONCE is sufficient. | 
|  | * We are trying to catch newly added swap cache, the only possible case is | 
|  | * when a folio is swapped in and out again staying in swap cache, using the | 
|  | * same entry before the PTE check above. The PTL is acquired and released | 
|  | * twice, each time after updating the swap_map's flag. So holding | 
|  | * the PTL here ensures we see the updated value. False positive is possible, | 
|  | * e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching the | 
|  | * cache, or during the tiny synchronization window between swap cache and | 
|  | * swap_map, but it will be gone very quickly, worst result is retry jitters. | 
|  | */ | 
|  | if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) { | 
|  | double_pt_unlock(dst_ptl, src_ptl); | 
|  | return -EAGAIN; | 
|  | } | 
|  | } | 
|  |  | 
|  | orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); | 
|  | #ifdef CONFIG_MEM_SOFT_DIRTY | 
|  | orig_src_pte = pte_swp_mksoft_dirty(orig_src_pte); | 
|  | #endif | 
|  | set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); | 
|  | double_pt_unlock(dst_ptl, src_ptl); | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
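|  | /* |
|  | * Move a mapping of the shared zeropage: clear the source pte and install |
|  | * a fresh zeropage pte at dst_addr once both ptes are known to be stable. |
|  | */ |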
|  | static int move_zeropage_pte(struct mm_struct *mm, | 
|  | struct vm_area_struct *dst_vma, | 
|  | struct vm_area_struct *src_vma, | 
|  | unsigned long dst_addr, unsigned long src_addr, | 
|  | pte_t *dst_pte, pte_t *src_pte, | 
|  | pte_t orig_dst_pte, pte_t orig_src_pte, | 
|  | pmd_t *dst_pmd, pmd_t dst_pmdval, | 
|  | spinlock_t *dst_ptl, spinlock_t *src_ptl) | 
|  | { | 
|  | pte_t zero_pte; | 
|  |  | 
|  | double_pt_lock(dst_ptl, src_ptl); | 
|  | if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, | 
|  | dst_pmd, dst_pmdval)) { | 
|  | double_pt_unlock(dst_ptl, src_ptl); | 
|  | return -EAGAIN; | 
|  | } | 
|  |  | 
|  | zero_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), | 
|  | dst_vma->vm_page_prot)); | 
|  | ptep_clear_flush(src_vma, src_addr, src_pte); | 
|  | set_pte_at(mm, dst_addr, dst_pte, zero_pte); | 
|  | double_pt_unlock(dst_ptl, src_ptl); | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  |  | 
|  | /* | 
|  | * The caller holds the mmap_lock for reading (or the relevant per-vma |
|  | * locks). Just move the page from src_pmd to dst_pmd if possible, and |
|  | * return 0 if the move succeeded, a negative error code otherwise. |
|  | */ | 
|  | static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, | 
|  | struct vm_area_struct *dst_vma, | 
|  | struct vm_area_struct *src_vma, | 
|  | unsigned long dst_addr, unsigned long src_addr, | 
|  | __u64 mode) | 
|  | { | 
|  | swp_entry_t entry; | 
|  | struct swap_info_struct *si = NULL; | 
|  | pte_t orig_src_pte, orig_dst_pte; | 
|  | pte_t src_folio_pte; | 
|  | spinlock_t *src_ptl, *dst_ptl; | 
|  | pte_t *src_pte = NULL; | 
|  | pte_t *dst_pte = NULL; | 
|  | pmd_t dummy_pmdval; | 
|  | pmd_t dst_pmdval; | 
|  | struct folio *src_folio = NULL; | 
|  | struct anon_vma *src_anon_vma = NULL; | 
|  | struct mmu_notifier_range range; | 
|  | int err = 0; | 
|  |  | 
|  | flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE); | 
|  | mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, | 
|  | src_addr, src_addr + PAGE_SIZE); | 
|  | mmu_notifier_invalidate_range_start(&range); | 
|  | retry: | 
|  | /* | 
|  | * Use the maywrite version to indicate that dst_pte will be modified. |
|  | * Since dst_pte needs to be none, the subsequent pte_same() check |
|  | * cannot prevent the dst_pte page from being freed concurrently, so we |
|  | * also need to obtain dst_pmdval and recheck pmd_same() later. |
|  | */ | 
|  | dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dst_pmdval, | 
|  | &dst_ptl); | 
|  |  | 
|  | /* Retry if a huge pmd materialized from under us */ | 
|  | if (unlikely(!dst_pte)) { | 
|  | err = -EAGAIN; | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Unlike for dst_pte, the subsequent pte_same() check can ensure the |
|  | * stability of the src_pte page, so there is no need to get the pmdval; |
|  | * just pass a dummy variable for it. |
|  | */ | 
|  | src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval, | 
|  | &src_ptl); | 
|  |  | 
|  | /* | 
|  | * The mmap_lock is not held for writing, so MADV_DONTNEED |
|  | * can zap transparent huge pages under us, or the | 
|  | * transparent huge page fault can establish new | 
|  | * transparent huge pages under us. | 
|  | */ | 
|  | if (unlikely(!src_pte)) { | 
|  | err = -EAGAIN; | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | /* Sanity checks before the operation */ | 
|  | if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) || | 
|  | pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) { | 
|  | err = -EINVAL; | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | spin_lock(dst_ptl); | 
|  | orig_dst_pte = ptep_get(dst_pte); | 
|  | spin_unlock(dst_ptl); | 
|  | if (!pte_none(orig_dst_pte)) { | 
|  | err = -EEXIST; | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | spin_lock(src_ptl); | 
|  | orig_src_pte = ptep_get(src_pte); | 
|  | spin_unlock(src_ptl); | 
|  | if (pte_none(orig_src_pte)) { | 
|  | if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) | 
|  | err = -ENOENT; | 
|  | else /* nothing to do to move a hole */ | 
|  | err = 0; | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | /* If PTE changed after we locked the folio then start over */ |
|  | if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) { | 
|  | err = -EAGAIN; | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | if (pte_present(orig_src_pte)) { | 
|  | if (is_zero_pfn(pte_pfn(orig_src_pte))) { | 
|  | err = move_zeropage_pte(mm, dst_vma, src_vma, | 
|  | dst_addr, src_addr, dst_pte, src_pte, | 
|  | orig_dst_pte, orig_src_pte, | 
|  | dst_pmd, dst_pmdval, dst_ptl, src_ptl); | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Pin and lock both source folio and anon_vma. Since we are in | 
|  | * an RCU read section, we can't block, so on contention we have to |
|  | * unmap the ptes, obtain the lock and retry. | 
|  | */ | 
|  | if (!src_folio) { | 
|  | struct folio *folio; | 
|  | bool locked; | 
|  |  | 
|  | /* | 
|  | * Pin the page while holding the lock to be sure the | 
|  | * page isn't freed under us | 
|  | */ | 
|  | spin_lock(src_ptl); | 
|  | if (!pte_same(orig_src_pte, ptep_get(src_pte))) { | 
|  | spin_unlock(src_ptl); | 
|  | err = -EAGAIN; | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); | 
|  | if (!folio || !PageAnonExclusive(&folio->page)) { | 
|  | spin_unlock(src_ptl); | 
|  | err = -EBUSY; | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | locked = folio_trylock(folio); | 
|  | /* | 
|  | * We avoid waiting for folio lock with a raised | 
|  | * refcount for large folios because extra refcounts | 
|  | * will result in split_folio() failing later and | 
|  | * retrying.  If multiple tasks are trying to move a | 
|  | * large folio we can end up livelocking. | 
|  | */ | 
|  | if (!locked && folio_test_large(folio)) { | 
|  | spin_unlock(src_ptl); | 
|  | err = -EAGAIN; | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | folio_get(folio); | 
|  | src_folio = folio; | 
|  | src_folio_pte = orig_src_pte; | 
|  | spin_unlock(src_ptl); | 
|  |  | 
|  | if (!locked) { | 
|  | pte_unmap(src_pte); | 
|  | pte_unmap(dst_pte); | 
|  | src_pte = dst_pte = NULL; | 
|  | /* now we can block and wait */ | 
|  | folio_lock(src_folio); | 
|  | goto retry; | 
|  | } | 
|  |  | 
|  | if (WARN_ON_ONCE(!folio_test_anon(src_folio))) { | 
|  | err = -EBUSY; | 
|  | goto out; | 
|  | } | 
|  | } | 
|  |  | 
|  | /* at this point we have src_folio locked */ | 
|  | if (folio_test_large(src_folio)) { | 
|  | /* split_folio() can block */ | 
|  | pte_unmap(src_pte); | 
|  | pte_unmap(dst_pte); | 
|  | src_pte = dst_pte = NULL; | 
|  | err = split_folio(src_folio); | 
|  | if (err) | 
|  | goto out; | 
|  | /* have to reacquire the folio after it got split */ | 
|  | folio_unlock(src_folio); | 
|  | folio_put(src_folio); | 
|  | src_folio = NULL; | 
|  | goto retry; | 
|  | } | 
|  |  | 
|  | if (!src_anon_vma) { | 
|  | /* | 
|  | * folio_referenced walks the anon_vma chain | 
|  | * without the folio lock. Serialize against it with | 
|  | * the anon_vma lock; the folio lock is not enough. |
|  | */ | 
|  | src_anon_vma = folio_get_anon_vma(src_folio); | 
|  | if (!src_anon_vma) { | 
|  | /* page was unmapped from under us */ | 
|  | err = -EAGAIN; | 
|  | goto out; | 
|  | } | 
|  | if (!anon_vma_trylock_write(src_anon_vma)) { | 
|  | pte_unmap(src_pte); | 
|  | pte_unmap(dst_pte); | 
|  | src_pte = dst_pte = NULL; | 
|  | /* now we can block and wait */ | 
|  | anon_vma_lock_write(src_anon_vma); | 
|  | goto retry; | 
|  | } | 
|  | } | 
|  |  | 
|  | err = move_present_pte(mm,  dst_vma, src_vma, | 
|  | dst_addr, src_addr, dst_pte, src_pte, | 
|  | orig_dst_pte, orig_src_pte, dst_pmd, | 
|  | dst_pmdval, dst_ptl, src_ptl, src_folio); | 
|  | } else { | 
|  | struct folio *folio = NULL; | 
|  |  | 
|  | entry = pte_to_swp_entry(orig_src_pte); | 
|  | if (non_swap_entry(entry)) { | 
|  | if (is_migration_entry(entry)) { | 
|  | pte_unmap(src_pte); | 
|  | pte_unmap(dst_pte); | 
|  | src_pte = dst_pte = NULL; | 
|  | migration_entry_wait(mm, src_pmd, src_addr); | 
|  | err = -EAGAIN; | 
|  | } else | 
|  | err = -EFAULT; | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | if (!pte_swp_exclusive(orig_src_pte)) { | 
|  | err = -EBUSY; | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | si = get_swap_device(entry); | 
|  | if (unlikely(!si)) { | 
|  | err = -EAGAIN; | 
|  | goto out; | 
|  | } | 
|  | /* | 
|  | * Verify the existence of the swapcache. If present, the folio's | 
|  | * index and mapping must be updated even when the PTE is a swap | 
|  | * entry. The anon_vma lock is not taken during this process since | 
|  | * the folio has already been unmapped, and the swap entry is | 
|  | * exclusive, preventing rmap walks. | 
|  | * | 
|  | * For large folios, return -EBUSY immediately, as split_folio() | 
|  | * also returns -EBUSY when attempting to split unmapped large | 
|  | * folios in the swapcache. This issue needs to be resolved | 
|  | * separately to allow proper handling. | 
|  | */ | 
|  | if (!src_folio) | 
|  | folio = filemap_get_folio(swap_address_space(entry), | 
|  | swap_cache_index(entry)); | 
|  | if (!IS_ERR_OR_NULL(folio)) { | 
|  | if (folio_test_large(folio)) { | 
|  | err = -EBUSY; | 
|  | folio_put(folio); | 
|  | goto out; | 
|  | } | 
|  | src_folio = folio; | 
|  | src_folio_pte = orig_src_pte; | 
|  | if (!folio_trylock(src_folio)) { | 
|  | pte_unmap(src_pte); | 
|  | pte_unmap(dst_pte); | 
|  | src_pte = dst_pte = NULL; | 
|  | put_swap_device(si); | 
|  | si = NULL; | 
|  | /* now we can block and wait */ | 
|  | folio_lock(src_folio); | 
|  | goto retry; | 
|  | } | 
|  | } | 
|  | err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, | 
|  | orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, | 
|  | dst_ptl, src_ptl, src_folio, si, entry); | 
|  | } | 
|  |  | 
|  | out: | 
|  | if (src_anon_vma) { | 
|  | anon_vma_unlock_write(src_anon_vma); | 
|  | put_anon_vma(src_anon_vma); | 
|  | } | 
|  | if (src_folio) { | 
|  | folio_unlock(src_folio); | 
|  | folio_put(src_folio); | 
|  | } | 
|  | if (dst_pte) | 
|  | pte_unmap(dst_pte); | 
|  | if (src_pte) | 
|  | pte_unmap(src_pte); | 
|  | mmu_notifier_invalidate_range_end(&range); | 
|  | if (si) | 
|  | put_swap_device(si); | 
|  |  | 
|  | return err; | 
|  | } | 
|  |  | 
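|  | /* |
|  | * Return true if moving [src_addr, src_end) to dst_addr would require |
|  | * splitting a PMD-mapped THP, i.e. either address is not PMD-aligned or |
|  | * the remaining length is smaller than a PMD. |
|  | */ |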
|  | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 
|  | static inline bool move_splits_huge_pmd(unsigned long dst_addr, | 
|  | unsigned long src_addr, | 
|  | unsigned long src_end) | 
|  | { | 
|  | return (src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) || | 
|  | src_end - src_addr < HPAGE_PMD_SIZE; | 
|  | } | 
|  | #else | 
|  | static inline bool move_splits_huge_pmd(unsigned long dst_addr, | 
|  | unsigned long src_addr, | 
|  | unsigned long src_end) | 
|  | { | 
|  | /* This is unreachable anyway, just to avoid warnings when HPAGE_PMD_SIZE==0 */ | 
|  | return false; | 
|  | } | 
|  | #endif | 
|  |  | 
|  | static inline bool vma_move_compatible(struct vm_area_struct *vma) | 
|  | { | 
|  | return !(vma->vm_flags & (VM_PFNMAP | VM_IO | VM_HUGETLB | |
|  | VM_MIXEDMAP | VM_SHADOW_STACK)); | 
|  | } | 
|  |  | 
|  | static int validate_move_areas(struct userfaultfd_ctx *ctx, | 
|  | struct vm_area_struct *src_vma, | 
|  | struct vm_area_struct *dst_vma) | 
|  | { | 
|  | /* Only allow moving if both have the same access and protection */ | 
|  | if ((src_vma->vm_flags & VM_ACCESS_FLAGS) != (dst_vma->vm_flags & VM_ACCESS_FLAGS) || | 
|  | pgprot_val(src_vma->vm_page_prot) != pgprot_val(dst_vma->vm_page_prot)) | 
|  | return -EINVAL; | 
|  |  | 
|  | /* Only allow moving if both are mlocked or both aren't */ | 
|  | if ((src_vma->vm_flags & VM_LOCKED) != (dst_vma->vm_flags & VM_LOCKED)) | 
|  | return -EINVAL; | 
|  |  | 
|  | /* | 
|  | * For now, we keep it simple and only move between writable VMAs. | 
|  | * Access flags are equal, therefore checking only the source is enough. |
|  | */ | 
|  | if (!(src_vma->vm_flags & VM_WRITE)) | 
|  | return -EINVAL; | 
|  |  | 
|  | /* Check if vma flags indicate content which can be moved */ | 
|  | if (!vma_move_compatible(src_vma) || !vma_move_compatible(dst_vma)) | 
|  | return -EINVAL; | 
|  |  | 
|  | /* Ensure dst_vma is registered in uffd we are operating on */ | 
|  | if (!dst_vma->vm_userfaultfd_ctx.ctx || | 
|  | dst_vma->vm_userfaultfd_ctx.ctx != ctx) | 
|  | return -EINVAL; | 
|  |  | 
|  | /* Only allow moving across anonymous vmas */ | 
|  | if (!vma_is_anonymous(src_vma) || !vma_is_anonymous(dst_vma)) | 
|  | return -EINVAL; | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
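|  | /* |
|  | * With mmap_lock held, look up the destination vma (preparing its |
|  | * anon_vma) and the source vma; the same vma is returned for both when |
|  | * src_start also falls inside it. |
|  | */ |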
|  | static __always_inline | 
|  | int find_vmas_mm_locked(struct mm_struct *mm, | 
|  | unsigned long dst_start, | 
|  | unsigned long src_start, | 
|  | struct vm_area_struct **dst_vmap, | 
|  | struct vm_area_struct **src_vmap) | 
|  | { | 
|  | struct vm_area_struct *vma; | 
|  |  | 
|  | mmap_assert_locked(mm); | 
|  | vma = find_vma_and_prepare_anon(mm, dst_start); | 
|  | if (IS_ERR(vma)) | 
|  | return PTR_ERR(vma); | 
|  |  | 
|  | *dst_vmap = vma; | 
|  | /* Skip finding src_vma if src_start is in dst_vma */ | 
|  | if (src_start >= vma->vm_start && src_start < vma->vm_end) | 
|  | goto out_success; | 
|  |  | 
|  | vma = vma_lookup(mm, src_start); | 
|  | if (!vma) | 
|  | return -ENOENT; | 
|  | out_success: | 
|  | *src_vmap = vma; | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | #ifdef CONFIG_PER_VMA_LOCK | 
|  | static int uffd_move_lock(struct mm_struct *mm, | 
|  | unsigned long dst_start, | 
|  | unsigned long src_start, | 
|  | struct vm_area_struct **dst_vmap, | 
|  | struct vm_area_struct **src_vmap) | 
|  | { | 
|  | struct vm_area_struct *vma; | 
|  | int err; | 
|  |  | 
|  | vma = uffd_lock_vma(mm, dst_start); | 
|  | if (IS_ERR(vma)) | 
|  | return PTR_ERR(vma); | 
|  |  | 
|  | *dst_vmap = vma; | 
|  | /* | 
|  | * Skip finding src_vma if src_start is in dst_vma. This also ensures | 
|  | * that we don't lock the same vma twice. | 
|  | */ | 
|  | if (src_start >= vma->vm_start && src_start < vma->vm_end) { | 
|  | *src_vmap = vma; | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Using uffd_lock_vma() to get src_vma can lead to following deadlock: | 
|  | * | 
|  | * Thread1				Thread2 | 
|  | * -------				------- | 
|  | * vma_start_read(dst_vma) | 
|  | *					mmap_write_lock(mm) | 
|  | *					vma_start_write(src_vma) | 
|  | * vma_start_read(src_vma) | 
|  | * mmap_read_lock(mm) | 
|  | *					vma_start_write(dst_vma) | 
|  | */ | 
|  | *src_vmap = lock_vma_under_rcu(mm, src_start); | 
|  | if (likely(*src_vmap)) | 
|  | return 0; | 
|  |  | 
|  | /* Undo any locking and retry in mmap_lock critical section */ | 
|  | vma_end_read(*dst_vmap); | 
|  |  | 
|  | mmap_read_lock(mm); | 
|  | err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); | 
|  | if (err) | 
|  | goto out; | 
|  |  | 
|  | if (!vma_start_read_locked(*dst_vmap)) { | 
|  | err = -EAGAIN; | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | /* Nothing further to do if both ranges are in the same, already locked, vma. */ |
|  | if (*dst_vmap == *src_vmap) | 
|  | goto out; | 
|  |  | 
|  | if (!vma_start_read_locked_nested(*src_vmap, SINGLE_DEPTH_NESTING)) { | 
|  | /* Undo dst_vmap locking if src_vmap failed to lock */ | 
|  | vma_end_read(*dst_vmap); | 
|  | err = -EAGAIN; | 
|  | } | 
|  | out: | 
|  | mmap_read_unlock(mm); | 
|  | return err; | 
|  | } | 
|  |  | 
|  | static void uffd_move_unlock(struct vm_area_struct *dst_vma, | 
|  | struct vm_area_struct *src_vma) | 
|  | { | 
|  | vma_end_read(src_vma); | 
|  | if (src_vma != dst_vma) | 
|  | vma_end_read(dst_vma); | 
|  | } | 
|  |  | 
|  | #else | 
|  |  | 
|  | static int uffd_move_lock(struct mm_struct *mm, | 
|  | unsigned long dst_start, | 
|  | unsigned long src_start, | 
|  | struct vm_area_struct **dst_vmap, | 
|  | struct vm_area_struct **src_vmap) | 
|  | { | 
|  | int err; | 
|  |  | 
|  | mmap_read_lock(mm); | 
|  | err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); | 
|  | if (err) | 
|  | mmap_read_unlock(mm); | 
|  | return err; | 
|  | } | 
|  |  | 
|  | static void uffd_move_unlock(struct vm_area_struct *dst_vma, | 
|  | struct vm_area_struct *src_vma) | 
|  | { | 
|  | mmap_assert_locked(src_vma->vm_mm); | 
|  | mmap_read_unlock(dst_vma->vm_mm); | 
|  | } | 
|  | #endif | 
|  |  | 
|  | /** | 
|  | * move_pages - move arbitrary anonymous pages of an existing vma | 
|  | * @ctx: pointer to the userfaultfd context | 
|  | * @dst_start: start of the destination virtual memory range | 
|  | * @src_start: start of the source virtual memory range | 
|  | * @len: length of the virtual memory range | 
|  | * @mode: flags from uffdio_move.mode | 
|  | * | 
|  | * It will either use the mmap_lock in read mode or per-vma locks. |
|  | * | 
|  | * move_pages() remaps arbitrary anonymous pages atomically in zero | 
|  | * copy. It only works on non shared anonymous pages because those can | 
|  | * be relocated without generating non linear anon_vmas in the rmap | 
|  | * code. | 
|  | * | 
|  | * It provides a zero copy mechanism to handle userspace page faults. | 
|  | * The source vma pages should have mapcount == 1, which can be | 
|  | * enforced by using madvise(MADV_DONTFORK) on src vma. | 
|  | * | 
|  | * The thread receiving the page during the userland page fault | 
|  | * will receive the faulting page in the source vma through the network, | 
|  | * storage or any other I/O device (MADV_DONTFORK in the source vma | 
|  | * prevents move_pages() from failing with -EBUSY if the process forks before |
|  | * move_pages() is called), then it will call move_pages() to map the | 
|  | * page in the faulting address in the destination vma. | 
|  | * | 
|  | * This userfaultfd command works purely via pagetables, so it's the | 
|  | * most efficient way to move physical non shared anonymous pages | 
|  | * across different virtual addresses. Unlike mremap()/mmap()/munmap() | 
|  | * it does not create any new vmas. The mapping in the destination | 
|  | * address is atomic. | 
|  | * | 
|  | * It only works if the vma protection bits are identical from the | 
|  | * source and destination vma. | 
|  | * | 
|  | * It can remap non shared anonymous pages within the same vma too. | 
|  | * | 
|  | * If the source virtual memory range has any unmapped holes, or if | 
|  | * the destination virtual memory range is not a whole unmapped hole, | 
|  | * move_pages() will fail respectively with -ENOENT or -EEXIST. This | 
|  | * provides a very strict behavior to avoid any chance of memory | 
|  | * corruption going unnoticed if there are userland race conditions. | 
|  | * Only one thread should resolve the userland page fault at any given | 
|  | * time for any given faulting address. This means that if two threads | 
|  | * try to both call move_pages() on the same destination address at the | 
|  | * same time, the second thread will get an explicit error from this | 
|  | * command. | 
|  | * | 
|  | * The command retval will return "len" if successful. The command |
|  | * however can be interrupted by fatal signals or errors. If | 
|  | * interrupted it will return the number of bytes successfully | 
|  | * remapped before the interruption if any, or the negative error if | 
|  | * none. It will never return zero. Either it will return an error or | 
|  | * an amount of bytes successfully moved. If the retval reports a | 
|  | * "short" remap, the move_pages() command should be repeated by | 
|  | * userland with src+retval, dst+retval, len-retval if it wants to know |
|  | * about the error that interrupted it. | 
|  | * | 
|  | * The UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES flag can be specified to | 
|  | * prevent -ENOENT errors from materializing if there are holes in the |
|  | * source virtual range that is being remapped. The holes will be | 
|  | * accounted as successfully remapped in the retval of the | 
|  | * command. This is mostly useful to remap hugepage naturally aligned | 
|  | * virtual regions without knowing if there are transparent hugepage | 
|  | * in the regions or not, but preventing the risk of having to split | 
|  | * the hugepmd during the remap. | 
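|  | * | 
|  | * For instance (sketch only, reusing the assumed "uffd"/dst/src from | 
|  | * the example above), moving a 2MB-aligned region that may or may not | 
|  | * be backed by a THP could look like: | 
|  | * | 
|  | *     struct uffdio_move mv = { | 
|  | *             .dst = dst, .src = src, .len = 2UL << 20, | 
|  | *             .mode = UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES, | 
|  | *     }; | 
|  | *     ioctl(uffd, UFFDIO_MOVE, &mv); | 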
|  | * | 
|  | * If there's any rmap walk that is taking the anon_vma locks without | 
|  | * first obtaining the folio lock (the only current instance is | 
|  | * folio_referenced), it will have to verify whether folio->mapping | 
|  | * has changed after taking the anon_vma lock. If it changed, it | 
|  | * should release the lock and retry obtaining a new anon_vma, because | 
|  | * it means the anon_vma was changed by move_pages() before the lock | 
|  | * could be obtained. This is the only additional complexity added to | 
|  | * the rmap code to provide this anonymous page remapping functionality. | 
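|  | * | 
|  | * Conceptually, the recheck such a walker performs looks roughly like | 
|  | * the following simplified sketch (the real in-tree logic lives in | 
|  | * folio_lock_anon_vma_read() in mm/rmap.c and additionally handles | 
|  | * RCU, trylock and refcounting): | 
|  | * | 
|  | *     unsigned long mapping = (unsigned long)READ_ONCE(folio->mapping); | 
|  | *     struct anon_vma *anon_vma; | 
|  | * | 
|  | *     anon_vma = (struct anon_vma *)(mapping - PAGE_MAPPING_ANON); | 
|  | *     anon_vma_lock_read(anon_vma); | 
|  | *     if ((unsigned long)READ_ONCE(folio->mapping) != mapping) { | 
|  | *             anon_vma_unlock_read(anon_vma);  // moved: retry lookup | 
|  | *             goto retry; | 
|  | *     } | 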
|  | */ | 
|  | ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, | 
|  | unsigned long src_start, unsigned long len, __u64 mode) | 
|  | { | 
|  | struct mm_struct *mm = ctx->mm; | 
|  | struct vm_area_struct *src_vma, *dst_vma; | 
|  | unsigned long src_addr, dst_addr; | 
|  | pmd_t *src_pmd, *dst_pmd; | 
|  | long err = -EINVAL; | 
|  | ssize_t moved = 0; | 
|  |  | 
|  | /* Sanitize the command parameters. */ | 
|  | VM_WARN_ON_ONCE(src_start & ~PAGE_MASK); | 
|  | VM_WARN_ON_ONCE(dst_start & ~PAGE_MASK); | 
|  | VM_WARN_ON_ONCE(len & ~PAGE_MASK); | 
|  |  | 
|  | /* Does the address range wrap, or is the span zero-sized? */ | 
|  | VM_WARN_ON_ONCE(src_start + len <= src_start); | 
|  | VM_WARN_ON_ONCE(dst_start + len <= dst_start); | 
|  |  | 
|  | err = uffd_move_lock(mm, dst_start, src_start, &dst_vma, &src_vma); | 
|  | if (err) | 
|  | goto out; | 
|  |  | 
|  | /* Re-check after taking map_changing_lock */ | 
|  | err = -EAGAIN; | 
|  | down_read(&ctx->map_changing_lock); | 
|  | if (unlikely(atomic_read(&ctx->mmap_changing))) | 
|  | goto out_unlock; | 
|  | /* | 
|  | * Make sure the vma is not shared, that the src and dst remap | 
|  | * ranges are both valid and fully within a single existing | 
|  | * vma. | 
|  | */ | 
|  | err = -EINVAL; | 
|  | if (src_vma->vm_flags & VM_SHARED) | 
|  | goto out_unlock; | 
|  | if (src_start + len > src_vma->vm_end) | 
|  | goto out_unlock; | 
|  |  | 
|  | if (dst_vma->vm_flags & VM_SHARED) | 
|  | goto out_unlock; | 
|  | if (dst_start + len > dst_vma->vm_end) | 
|  | goto out_unlock; | 
|  |  | 
|  | err = validate_move_areas(ctx, src_vma, dst_vma); | 
|  | if (err) | 
|  | goto out_unlock; | 
|  |  | 
|  | for (src_addr = src_start, dst_addr = dst_start; | 
|  | src_addr < src_start + len;) { | 
|  | spinlock_t *ptl; | 
|  | pmd_t dst_pmdval; | 
|  | unsigned long step_size; | 
|  |  | 
|  | /* | 
|  | * Below works because an anonymous area would not have a | 
|  | * transparent huge PUD. If file-backed support is added, | 
|  | * that case would need to be handled here. | 
|  | */ | 
|  | src_pmd = mm_find_pmd(mm, src_addr); | 
|  | if (unlikely(!src_pmd)) { | 
|  | if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) { | 
|  | err = -ENOENT; | 
|  | break; | 
|  | } | 
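|  | /* | 
|  | * A missing source page table with ALLOW_SRC_HOLES is treated | 
|  | * as a hole: allocate the pmd so the pte-level walk below can | 
|  | * run over the (empty) range and account it as moved. | 
|  | */ | 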
|  | src_pmd = mm_alloc_pmd(mm, src_addr); | 
|  | if (unlikely(!src_pmd)) { | 
|  | err = -ENOMEM; | 
|  | break; | 
|  | } | 
|  | } | 
|  | dst_pmd = mm_alloc_pmd(mm, dst_addr); | 
|  | if (unlikely(!dst_pmd)) { | 
|  | err = -ENOMEM; | 
|  | break; | 
|  | } | 
|  |  | 
|  | dst_pmdval = pmdp_get_lockless(dst_pmd); | 
|  | /* | 
|  | * If the dst_pmd is mapped as THP, don't override it and just | 
|  | * be strict. If dst_pmd changes into THP after this check, the | 
|  | * move_pages_huge_pmd() will detect the change and retry | 
|  | * while move_pages_pte() will detect the change and fail. | 
|  | */ | 
|  | if (unlikely(pmd_trans_huge(dst_pmdval))) { | 
|  | err = -EEXIST; | 
|  | break; | 
|  | } | 
|  |  | 
|  | ptl = pmd_trans_huge_lock(src_pmd, src_vma); | 
|  | if (ptl) { | 
|  | /* Check if we can move the pmd without splitting it. */ | 
|  | if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) || | 
|  | !pmd_none(dst_pmdval)) { | 
|  | /* Can be a migration entry */ | 
|  | if (pmd_present(*src_pmd)) { | 
|  | struct folio *folio = pmd_folio(*src_pmd); | 
|  |  | 
|  | if (!is_huge_zero_folio(folio) && | 
|  | !PageAnonExclusive(&folio->page)) { | 
|  | spin_unlock(ptl); | 
|  | err = -EBUSY; | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | spin_unlock(ptl); | 
|  | split_huge_pmd(src_vma, src_pmd, src_addr); | 
|  | /* The folio will be split by move_pages_pte() */ | 
|  | continue; | 
|  | } | 
|  |  | 
|  | err = move_pages_huge_pmd(mm, dst_pmd, src_pmd, | 
|  | dst_pmdval, dst_vma, src_vma, | 
|  | dst_addr, src_addr); | 
|  | step_size = HPAGE_PMD_SIZE; | 
|  | } else { | 
|  | if (pmd_none(*src_pmd)) { | 
|  | if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) { | 
|  | err = -ENOENT; | 
|  | break; | 
|  | } | 
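|  | /* | 
|  | * Empty source pmd with ALLOW_SRC_HOLES: allocate the pte | 
|  | * page so move_pages_pte() can walk the none ptes and | 
|  | * account the hole as successfully moved. | 
|  | */ | 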
|  | if (unlikely(__pte_alloc(mm, src_pmd))) { | 
|  | err = -ENOMEM; | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (unlikely(pte_alloc(mm, dst_pmd))) { | 
|  | err = -ENOMEM; | 
|  | break; | 
|  | } | 
|  |  | 
|  | err = move_pages_pte(mm, dst_pmd, src_pmd, | 
|  | dst_vma, src_vma, | 
|  | dst_addr, src_addr, mode); | 
|  | step_size = PAGE_SIZE; | 
|  | } | 
|  |  | 
|  | cond_resched(); | 
|  |  | 
|  | if (fatal_signal_pending(current)) { | 
|  | /* Do not override an error */ | 
|  | if (!err || err == -EAGAIN) | 
|  | err = -EINTR; | 
|  | break; | 
|  | } | 
|  |  | 
|  | if (err) { | 
|  | if (err == -EAGAIN) | 
|  | continue; | 
|  | break; | 
|  | } | 
|  |  | 
|  | /* Proceed to the next page */ | 
|  | dst_addr += step_size; | 
|  | src_addr += step_size; | 
|  | moved += step_size; | 
|  | } | 
|  |  | 
|  | out_unlock: | 
|  | up_read(&ctx->map_changing_lock); | 
|  | uffd_move_unlock(dst_vma, src_vma); | 
|  | out: | 
|  | VM_WARN_ON_ONCE(moved < 0); | 
|  | VM_WARN_ON_ONCE(err > 0); | 
|  | VM_WARN_ON_ONCE(!moved && !err); | 
|  | return moved ? moved : err; | 
|  | } | 
|  |  | 
|  | static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, | 
|  | vm_flags_t vm_flags) | 
|  | { | 
|  | const bool uffd_wp_changed = (vma->vm_flags ^ vm_flags) & VM_UFFD_WP; | 
|  |  | 
|  | vm_flags_reset(vma, vm_flags); | 
|  | /* | 
|  | * For shared mappings, we want to enable writenotify while | 
|  | * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply | 
|  | * recalculate vma->vm_page_prot whenever userfaultfd-wp changes. | 
|  | */ | 
|  | if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed) | 
|  | vma_set_page_prot(vma); | 
|  | } | 
|  |  | 
|  | static void userfaultfd_set_ctx(struct vm_area_struct *vma, | 
|  | struct userfaultfd_ctx *ctx, | 
|  | vm_flags_t vm_flags) | 
|  | { | 
|  | vma_start_write(vma); | 
|  | vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx}; | 
|  | userfaultfd_set_vm_flags(vma, | 
|  | (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags); | 
|  | } | 
|  |  | 
|  | void userfaultfd_reset_ctx(struct vm_area_struct *vma) | 
|  | { | 
|  | userfaultfd_set_ctx(vma, NULL, 0); | 
|  | } | 
|  |  | 
|  | struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, | 
|  | struct vm_area_struct *prev, | 
|  | struct vm_area_struct *vma, | 
|  | unsigned long start, | 
|  | unsigned long end) | 
|  | { | 
|  | struct vm_area_struct *ret; | 
|  | bool give_up_on_oom = false; | 
|  |  | 
|  | /* | 
|  | * If we are modifying only and not splitting, just give up on the merge | 
|  | * if OOM prevents us from merging successfully. | 
|  | */ | 
|  | if (start == vma->vm_start && end == vma->vm_end) | 
|  | give_up_on_oom = true; | 
|  |  | 
|  | /* Reset ptes for the whole vma range if wr-protected */ | 
|  | if (userfaultfd_wp(vma)) | 
|  | uffd_wp_range(vma, start, end - start, false); | 
|  |  | 
|  | ret = vma_modify_flags_uffd(vmi, prev, vma, start, end, | 
|  | vma->vm_flags & ~__VM_UFFD_FLAGS, | 
|  | NULL_VM_UFFD_CTX, give_up_on_oom); | 
|  |  | 
|  | /* | 
|  | * In the vma_merge() successful mprotect-like case 8: | 
|  | * the next vma was merged into the current one and | 
|  | * the current one has not been updated yet. | 
|  | */ | 
|  | if (!IS_ERR(ret)) | 
|  | userfaultfd_reset_ctx(ret); | 
|  |  | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | /* Assumes mmap write lock taken, and mm_struct pinned. */ | 
|  | int userfaultfd_register_range(struct userfaultfd_ctx *ctx, | 
|  | struct vm_area_struct *vma, | 
|  | vm_flags_t vm_flags, | 
|  | unsigned long start, unsigned long end, | 
|  | bool wp_async) | 
|  | { | 
|  | VMA_ITERATOR(vmi, ctx->mm, start); | 
|  | struct vm_area_struct *prev = vma_prev(&vmi); | 
|  | unsigned long vma_end; | 
|  | vm_flags_t new_flags; | 
|  |  | 
|  | if (vma->vm_start < start) | 
|  | prev = vma; | 
|  |  | 
|  | for_each_vma_range(vmi, vma, end) { | 
|  | cond_resched(); | 
|  |  | 
|  | VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async)); | 
|  | VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx && | 
|  | vma->vm_userfaultfd_ctx.ctx != ctx); | 
|  | VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)); | 
|  |  | 
|  | /* | 
|  | * Nothing to do: this vma is already registered into this | 
|  | * userfaultfd and with the right tracking mode too. | 
|  | */ | 
|  | if (vma->vm_userfaultfd_ctx.ctx == ctx && | 
|  | (vma->vm_flags & vm_flags) == vm_flags) | 
|  | goto skip; | 
|  |  | 
|  | if (vma->vm_start > start) | 
|  | start = vma->vm_start; | 
|  | vma_end = min(end, vma->vm_end); | 
|  |  | 
|  | new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; | 
|  | vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, | 
|  | new_flags, | 
|  | (struct vm_userfaultfd_ctx){ctx}, | 
|  | /* give_up_on_oom = */false); | 
|  | if (IS_ERR(vma)) | 
|  | return PTR_ERR(vma); | 
|  |  | 
|  | /* | 
|  | * In the vma_merge() successful mprotect-like case 8: | 
|  | * the next vma was merged into the current one and | 
|  | * the current one has not been updated yet. | 
|  | */ | 
|  | userfaultfd_set_ctx(vma, ctx, vm_flags); | 
|  |  | 
|  | if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) | 
|  | hugetlb_unshare_all_pmds(vma); | 
|  |  | 
|  | skip: | 
|  | prev = vma; | 
|  | start = vma->vm_end; | 
|  | } | 
|  |  | 
|  | return 0; | 
|  | } | 
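|  |  | 
|  | /* | 
|  | * Userspace reaches userfaultfd_register_range() through the | 
|  | * UFFDIO_REGISTER ioctl handled in fs/userfaultfd.c. As an | 
|  | * illustrative sketch only (error handling omitted, "uffd" is an | 
|  | * assumed userfaultfd file descriptor after the UFFDIO_API | 
|  | * handshake), registering a range for missing-page and | 
|  | * write-protect tracking looks like: | 
|  | * | 
|  | *     struct uffdio_register reg = { | 
|  | *             .range = { .start = (unsigned long)addr, .len = len }, | 
|  | *             .mode = UFFDIO_REGISTER_MODE_MISSING | | 
|  | *                     UFFDIO_REGISTER_MODE_WP, | 
|  | *     }; | 
|  | * | 
|  | *     ioctl(uffd, UFFDIO_REGISTER, &reg); | 
|  | * | 
|  | * On success, reg.ioctls reports which UFFDIO_* operations the | 
|  | * registered range supports. | 
|  | */ | 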
|  |  | 
|  | void userfaultfd_release_new(struct userfaultfd_ctx *ctx) | 
|  | { | 
|  | struct mm_struct *mm = ctx->mm; | 
|  | struct vm_area_struct *vma; | 
|  | VMA_ITERATOR(vmi, mm, 0); | 
|  |  | 
|  | /* the various vma->vm_userfaultfd_ctx still point to it */ | 
|  | mmap_write_lock(mm); | 
|  | for_each_vma(vmi, vma) { | 
|  | if (vma->vm_userfaultfd_ctx.ctx == ctx) | 
|  | userfaultfd_reset_ctx(vma); | 
|  | } | 
|  | mmap_write_unlock(mm); | 
|  | } | 
|  |  | 
|  | void userfaultfd_release_all(struct mm_struct *mm, | 
|  | struct userfaultfd_ctx *ctx) | 
|  | { | 
|  | struct vm_area_struct *vma, *prev; | 
|  | VMA_ITERATOR(vmi, mm, 0); | 
|  |  | 
|  | if (!mmget_not_zero(mm)) | 
|  | return; | 
|  |  | 
|  | /* | 
|  | * Flush page faults out of all CPUs. NOTE: all page faults | 
|  | * must be retried without returning VM_FAULT_SIGBUS if | 
|  | * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx | 
|  | * changes while handle_userfault released the mmap_lock. So | 
|  | * it's critical that released is set to true (above), before | 
|  | * taking the mmap_lock for writing. | 
|  | */ | 
|  | mmap_write_lock(mm); | 
|  | prev = NULL; | 
|  | for_each_vma(vmi, vma) { | 
|  | cond_resched(); | 
|  | VM_WARN_ON_ONCE(!!vma->vm_userfaultfd_ctx.ctx ^ | 
|  | !!(vma->vm_flags & __VM_UFFD_FLAGS)); | 
|  | if (vma->vm_userfaultfd_ctx.ctx != ctx) { | 
|  | prev = vma; | 
|  | continue; | 
|  | } | 
|  |  | 
|  | vma = userfaultfd_clear_vma(&vmi, prev, vma, | 
|  | vma->vm_start, vma->vm_end); | 
|  | prev = vma; | 
|  | } | 
|  | mmap_write_unlock(mm); | 
|  | mmput(mm); | 
|  | } |