| // SPDX-License-Identifier: GPL-2.0 |
| #include <linux/pagemap.h> |
| #include <linux/mm.h> |
| #include <linux/hugetlb.h> |
| #include <linux/kernel.h> |
| #include <linux/sched.h> |
| #include <linux/proc_fs.h> |
| #include <linux/uaccess.h> |
| #include <linux/kvm.h> |
| #include <linux/kvm_host.h> |
| #include <linux/bitmap.h> |
| #include <linux/sched/mm.h> |
| #include <linux/version.h> |
| #include <linux/module.h> |
| #include <linux/io.h> |
| #include <linux/pagewalk.h> |
| #include <linux/uaccess.h> |
| #include <asm/cacheflush.h> |
| #include <asm/page.h> |
| #include <asm/pgalloc.h> |
| #include <asm/pgtable.h> |
| #include <linux/huge_mm.h> |
| #ifdef CONFIG_ARM64 |
| #include <asm/pgtable-types.h> |
| #include <asm/memory.h> |
| #include <asm/kvm_mmu.h> |
| #include <asm/kvm_arm.h> |
| #include <asm/stage2_pgtable.h> |
| #endif |
| #include "etmem_scan.h" |
| #include <linux/hugetlb_inline.h> |
| |
| #ifdef CONFIG_X86_64 |
| /* |
| * Fall back to false when the kernel does not support KVM_INVALID_SPTE. |
| * ept_idle can still work in this situation, but the scan accuracy may |
| * drop depending on the access frequency of the workload. |
| */ |
| #ifdef KVM_INVALID_SPTE |
| #define KVM_CHECK_INVALID_SPTE(val) ((val) == KVM_INVALID_SPTE) |
| #else |
| #define KVM_CHECK_INVALID_SPTE(val) (0) |
| #endif |
| |
| # define kvm_arch_mmu_pointer(vcpu) (vcpu->arch.mmu) |
| # define kvm_mmu_ad_disabled(mmu) (mmu->mmu_role.base.ad_disabled) |
| #endif /* CONFIG_X86_64 */ |
| |
| #ifdef CONFIG_ARM64 |
| #define if_pmd_thp_or_huge(pmd) (if_pmd_huge(pmd) || pmd_trans_huge(pmd)) |
| #endif /* CONFIG_ARM64 */ |
| |
| #ifdef DEBUG |
| |
| #define debug_printk trace_printk |
| |
| #define set_restart_gpa(val, note) ({ \ |
| unsigned long old_val = pic->restart_gpa; \ |
| pic->restart_gpa = (val); \ |
| trace_printk("restart_gpa=%lx %luK %s %s %d\n", \ |
| (val), (pic->restart_gpa - old_val) >> 10, \ |
| note, __func__, __LINE__); \ |
| }) |
| |
| #define set_next_hva(val, note) ({ \ |
| unsigned long old_val = pic->next_hva; \ |
| pic->next_hva = (val); \ |
| trace_printk(" next_hva=%lx %luK %s %s %d\n", \ |
| (val), (pic->next_hva - old_val) >> 10, \ |
| note, __func__, __LINE__); \ |
| }) |
| |
| #else |
| |
| #define debug_printk(...) |
| |
| #define set_restart_gpa(val, note) ({ \ |
| pic->restart_gpa = (val); \ |
| }) |
| |
| #define set_next_hva(val, note) ({ \ |
| pic->next_hva = (val); \ |
| }) |
| |
| #endif |
| |
| #define RET_RESCAN_FLAG 0x10000 |
| |
| /* Returns IDLE_PAGE_TYPE_MAX on error, otherwise a valid page type. */ |
| enum ProcIdlePageType (*vm_handle_pte_hole)(unsigned long addr, |
| unsigned long next, int depth, struct mm_walk *walk) = NULL; |
| EXPORT_SYMBOL_GPL(vm_handle_pte_hole); |
| |
| static int set_walk_step(const char *val, const struct kernel_param *kp) |
| { |
| int ret; |
| unsigned int n; |
| |
| ret = kstrtouint(val, 0, &n); |
| if (ret != 0 || n == 0) |
| return -EINVAL; |
| |
| return param_set_uint(val, kp); |
| } |
| |
| static struct kernel_param_ops walk_step_ops = { |
| .set = set_walk_step, |
| .get = param_get_uint, |
| }; |
| |
| static unsigned int __read_mostly walk_step = 512; /* in units of PAGE_SIZE */ |
| module_param_cb(walk_step, &walk_step_ops, &walk_step, 0644); |
| |
| static unsigned int resched_step = 10; |
| module_param(resched_step, uint, 0644); |
| |
| static unsigned long pagetype_size[16] = { |
| [PTE_ACCESSED] = PAGE_SIZE, /* 4k page */ |
| [PMD_ACCESSED] = PMD_SIZE, /* 2M page */ |
| [PUD_PRESENT] = PUD_SIZE, /* 1G page */ |
| |
| [PTE_DIRTY_M] = PAGE_SIZE, |
| [PMD_DIRTY_M] = PMD_SIZE, |
| |
| [PTE_IDLE] = PAGE_SIZE, |
| [PMD_IDLE] = PMD_SIZE, |
| [PMD_IDLE_PTES] = PMD_SIZE, |
| |
| [PTE_HOLE] = PAGE_SIZE, |
| [PMD_HOLE] = PMD_SIZE, |
| }; |
| |
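| /* Store a 64-bit value into the report buffer as 8 bytes, most significant byte first. */ |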
| static void u64_to_u8(uint64_t n, uint8_t *p) |
| { |
| p += sizeof(uint64_t) - 1; |
| |
| *p-- = n; n >>= 8; |
| *p-- = n; n >>= 8; |
| *p-- = n; n >>= 8; |
| *p-- = n; n >>= 8; |
| |
| *p-- = n; n >>= 8; |
| *p-- = n; n >>= 8; |
| *p-- = n; n >>= 8; |
| *p = n; |
| } |
| |
| static void dump_pic(struct page_idle_ctrl *pic) |
| { |
| debug_printk("page_idle_ctrl: pie_read=%d pie_read_max=%d", |
| pic->pie_read, |
| pic->pie_read_max); |
| debug_printk(" buf_size=%d bytes_copied=%d next_hva=%pK", |
| pic->buf_size, |
| pic->bytes_copied, |
| pic->next_hva); |
| debug_printk(" restart_gpa=%pK pa_to_hva=%pK\n", |
| pic->restart_gpa, |
| pic->gpa_to_hva); |
| } |
| |
| #ifdef CONFIG_ARM64 |
| static int if_pmd_huge(pmd_t pmd) |
| { |
| return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); |
| } |
| |
| static int if_pud_huge(pud_t pud) |
| { |
| #ifndef __PAGETABLE_PMD_FOLDED |
| return pud_val(pud) && !(pud_val(pud) & PUD_TABLE_BIT); |
| #else |
| return 0; |
| #endif |
| } |
| #endif |
| |
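| /* |
| * Emit a PIP_CMD_SET_HVA command followed by the 8-byte host virtual |
| * address, so userspace knows where the following page records start. |
| */ |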
| static void pic_report_addr(struct page_idle_ctrl *pic, unsigned long addr) |
| { |
| unsigned long hva; |
| |
| pic->kpie[pic->pie_read++] = PIP_CMD_SET_HVA; |
| hva = addr; |
| u64_to_u8(hva, &pic->kpie[pic->pie_read]); |
| pic->pie_read += sizeof(uint64_t); |
| dump_pic(pic); |
| } |
| |
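| /* |
| * Record one page (or huge page) of @page_type at @addr. Contiguous pages |
| * of the same type are merged into the previous record when possible. |
| * Returns PAGE_IDLE_KBUF_FULL when the kernel buffer cannot take more. |
| */ |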
| static int pic_add_page(struct page_idle_ctrl *pic, |
| unsigned long addr, |
| unsigned long next, |
| enum ProcIdlePageType page_type) |
| { |
| unsigned long page_size = pagetype_size[page_type]; |
| |
| dump_pic(pic); |
| |
| /* align the kernel/user view of the cursor position */ |
| next = round_up(next, page_size); |
| |
| if (!pic->pie_read || |
| addr + pic->gpa_to_hva != pic->next_hva) { |
| /* merge hole */ |
| if (page_type == PTE_HOLE || |
| page_type == PMD_HOLE) { |
| set_restart_gpa(next, "PTE_HOLE|PMD_HOLE"); |
| return 0; |
| } |
| |
| if (addr + pic->gpa_to_hva < pic->next_hva) { |
| debug_printk("page_idle: addr moves backwards\n"); |
| WARN_ONCE(1, "page_idle: addr moves backwards"); |
| } |
| |
| if (pic->pie_read + sizeof(uint64_t) + 2 >= pic->pie_read_max) { |
| set_restart_gpa(addr, "PAGE_IDLE_KBUF_FULL"); |
| return PAGE_IDLE_KBUF_FULL; |
| } |
| |
| pic_report_addr(pic, round_down(addr, page_size) + |
| pic->gpa_to_hva); |
| } else { |
| if (PIP_TYPE(pic->kpie[pic->pie_read - 1]) == page_type && |
| PIP_SIZE(pic->kpie[pic->pie_read - 1]) < 0xF) { |
| set_next_hva(next + pic->gpa_to_hva, "IN-PLACE INC"); |
| set_restart_gpa(next, "IN-PLACE INC"); |
| pic->kpie[pic->pie_read - 1]++; |
| WARN_ONCE(page_size < next-addr, "next-addr too large"); |
| return 0; |
| } |
| if (pic->pie_read >= pic->pie_read_max) { |
| set_restart_gpa(addr, "PAGE_IDLE_KBUF_FULL"); |
| return PAGE_IDLE_KBUF_FULL; |
| } |
| } |
| |
| set_next_hva(next + pic->gpa_to_hva, "NEW-ITEM"); |
| set_restart_gpa(next, "NEW-ITEM"); |
| pic->kpie[pic->pie_read] = PIP_COMPOSE(page_type, 1); |
| pic->pie_read++; |
| |
| return 0; |
| } |
| |
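| /* Reset the kernel report buffer for the next chunk of results. */ |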
| static int init_page_idle_ctrl_buffer(struct page_idle_ctrl *pic) |
| { |
| pic->pie_read = 0; |
| pic->pie_read_max = min(PAGE_IDLE_KBUF_SIZE, |
| pic->buf_size - pic->bytes_copied); |
| /* reserve space for PIP_CMD_SET_HVA at the end */ |
| pic->pie_read_max -= sizeof(uint64_t) + 1; |
| |
| /* |
| * Align with the PAGE_IDLE_KBUF_FULL logic in pic_add_page(), so that |
| * pic->pie_read is never 0 when PAGE_IDLE_KBUF_FULL is returned. |
| */ |
| if (pic->pie_read_max <= sizeof(uint64_t) + 2) |
| return PAGE_IDLE_KBUF_FULL; |
| |
| memset(pic->kpie, 0, sizeof(pic->kpie)); |
| return 0; |
| } |
| |
| static void setup_page_idle_ctrl(struct page_idle_ctrl *pic, void *buf, |
| int buf_size, unsigned int flags) |
| { |
| pic->buf = buf; |
| pic->buf_size = buf_size; |
| pic->bytes_copied = 0; |
| pic->next_hva = 0; |
| pic->gpa_to_hva = 0; |
| pic->restart_gpa = 0; |
| pic->last_va = 0; |
| pic->flags = flags; |
| } |
| |
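| /* |
| * Flush the kernel report buffer to the user buffer and re-initialise it. |
| * Returns PAGE_IDLE_BUF_FULL once the user buffer has been filled. |
| */ |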
| static int page_idle_copy_user(struct page_idle_ctrl *pic, |
| unsigned long start, unsigned long end) |
| { |
| int bytes_read; |
| int ret; |
| |
| dump_pic(pic); |
| |
| bytes_read = pic->pie_read; |
| if (!bytes_read) |
| return 0; |
| |
| ret = copy_to_user(pic->buf, pic->kpie, bytes_read); |
| if (ret) |
| return -EFAULT; |
| |
| pic->buf += bytes_read; |
| pic->bytes_copied += bytes_read; |
| if (pic->bytes_copied >= pic->buf_size) |
| return PAGE_IDLE_BUF_FULL; |
| |
| ret = init_page_idle_ctrl_buffer(pic); |
| if (ret) |
| return ret; |
| |
| cond_resched(); |
| return 0; |
| } |
| |
| #ifdef CONFIG_X86_64 |
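| /* |
| * The EPT does not map this range, so scan the corresponding host virtual |
| * range through the process page tables instead. mmu_lock is dropped here, |
| * which is why the caller must restart from the EPT root afterwards. |
| */ |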
| static int vm_walk_host_range(unsigned long start, |
| unsigned long end, |
| struct mm_walk *walk) |
| { |
| int ret; |
| struct page_idle_ctrl *pic = walk->private; |
| unsigned long tmp_gpa_to_hva = pic->gpa_to_hva; |
| |
| pic->gpa_to_hva = 0; |
| spin_unlock_irq(&pic->kvm->mmu_lock); |
| down_read(&walk->mm->mmap_lock); |
| local_irq_disable(); |
| ret = walk_page_range(walk->mm, start + tmp_gpa_to_hva, end + tmp_gpa_to_hva, |
| walk->ops, walk->private); |
| local_irq_enable(); |
| up_read(&walk->mm->mmap_lock); |
| pic->gpa_to_hva = tmp_gpa_to_hva; |
| if (pic->flags & VM_SCAN_HOST) { |
| pic->restart_gpa -= tmp_gpa_to_hva; |
| pic->flags &= ~VM_SCAN_HOST; |
| } |
| if (ret != PAGE_IDLE_KBUF_FULL && end > pic->restart_gpa) |
| pic->restart_gpa = end; |
| |
| /* The EPT page table may change after spin_unlock(), so rescan the VM from the EPT root. */ |
| ret |= RET_RESCAN_FLAG; |
| |
| return ret; |
| } |
| |
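| /* Scan one PTE-level range of the EPT, clearing accessed (and optionally dirty) bits. */ |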
| static int ept_pte_range(struct page_idle_ctrl *pic, |
| pmd_t *pmd, unsigned long addr, unsigned long end, |
| struct mm_walk *walk) |
| { |
| pte_t *pte; |
| enum ProcIdlePageType page_type; |
| int err = 0; |
| |
| pte = pte_offset_kernel(pmd, addr); |
| do { |
| if (KVM_CHECK_INVALID_SPTE(pte->pte)) { |
| page_type = PTE_IDLE; |
| } else if (!ept_pte_present(*pte)) { |
| err = vm_walk_host_range(addr, end, walk); |
| goto next; |
| } else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, |
| (unsigned long *) &pte->pte)) |
| page_type = PTE_IDLE; |
| else { |
| page_type = PTE_ACCESSED; |
| if (pic->flags & SCAN_DIRTY_PAGE) { |
| if (test_and_clear_bit(_PAGE_BIT_EPT_DIRTY, |
| (unsigned long *) &pte->pte)) |
| page_type = PTE_DIRTY_M; |
| } |
| } |
| |
| err = pic_add_page(pic, addr, addr + PAGE_SIZE, page_type); |
| next: |
| if (err) |
| break; |
| } while (pte++, addr += PAGE_SIZE, addr != end); |
| |
| return err; |
| } |
| |
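| /* |
| * SCAN_AS_HUGE: report a non-huge PMD range as a single 2M record. It is |
| * accessed if any constituent 4K SPTE has its accessed bit set; invalid or |
| * non-present SPTEs are treated as idle. |
| */ |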
| static enum ProcIdlePageType ept_huge_accessed(pmd_t *pmd, unsigned long addr, |
| unsigned long end) |
| { |
| enum ProcIdlePageType accessed = PMD_IDLE; |
| pte_t *pte; |
| |
| pte = pte_offset_kernel(pmd, addr); |
| do { |
| if (KVM_CHECK_INVALID_SPTE(pte->pte)) |
| continue; |
| if (!ept_pte_present(*pte)) |
| continue; |
| if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, |
| (unsigned long *)&pte->pte)) |
| continue; |
| accessed = PMD_ACCESSED; |
| } while (pte++, addr += PAGE_SIZE, addr != end); |
| |
| return accessed; |
| } |
| |
| static int ept_pmd_range(struct page_idle_ctrl *pic, |
| pud_t *pud, unsigned long addr, unsigned long end, |
| struct mm_walk *walk) |
| { |
| pmd_t *pmd; |
| unsigned long next; |
| enum ProcIdlePageType page_type; |
| enum ProcIdlePageType pte_page_type; |
| int err = 0; |
| |
| if (pic->flags & SCAN_HUGE_PAGE) |
| pte_page_type = PMD_IDLE_PTES; |
| else |
| pte_page_type = IDLE_PAGE_TYPE_MAX; |
| |
| pmd = pmd_offset(pud, addr); |
| do { |
| next = pmd_addr_end(addr, end); |
| if (KVM_CHECK_INVALID_SPTE(pmd->pmd)) |
| page_type = PMD_IDLE; |
| else if (!ept_pmd_present(*pmd)) { |
| err = vm_walk_host_range(addr, next, walk); |
| goto next; |
| } else if (!pmd_large(*pmd)) { |
| if (pic->flags & SCAN_AS_HUGE) |
| page_type = ept_huge_accessed(pmd, addr, next); |
| else |
| page_type = pte_page_type; |
| } else if (!test_and_clear_bit(_PAGE_BIT_EPT_ACCESSED, |
| (unsigned long *)pmd)) |
| page_type = PMD_IDLE; |
| else { |
| page_type = PMD_ACCESSED; |
| if ((pic->flags & SCAN_DIRTY_PAGE) && |
| test_and_clear_bit(_PAGE_BIT_EPT_DIRTY, |
| (unsigned long *) pmd)) |
| page_type = PMD_DIRTY_M; |
| } |
| |
| if (page_type != IDLE_PAGE_TYPE_MAX) |
| err = pic_add_page(pic, addr, next, page_type); |
| else |
| err = ept_pte_range(pic, pmd, addr, next, walk); |
| |
| next: |
| if (err) |
| break; |
| } while (pmd++, addr = next, addr != end); |
| |
| return err; |
| } |
| |
| |
| static int ept_pud_range(struct page_idle_ctrl *pic, |
| p4d_t *p4d, unsigned long addr, unsigned long end, |
| struct mm_walk *walk) |
| { |
| pud_t *pud; |
| unsigned long next; |
| int err = 0; |
| |
| pud = pud_offset(p4d, addr); |
| do { |
| next = pud_addr_end(addr, end); |
| |
| if (!ept_pud_present(*pud)) { |
| err = vm_walk_host_range(addr, next, walk); |
| goto next; |
| } |
| |
| if (pud_large(*pud)) |
| err = pic_add_page(pic, addr, next, PUD_PRESENT); |
| else |
| err = ept_pmd_range(pic, pud, addr, next, walk); |
| |
| next: |
| if (err) |
| break; |
| } while (pud++, addr = next, addr != end); |
| |
| return err; |
| } |
| |
| static int ept_p4d_range(struct page_idle_ctrl *pic, |
| p4d_t *p4d, unsigned long addr, unsigned long end, |
| struct mm_walk *walk) |
| { |
| unsigned long next; |
| int err = 0; |
| |
| p4d += p4d_index(addr); |
| do { |
| next = p4d_addr_end(addr, end); |
| if (!ept_p4d_present(*p4d)) { |
| set_restart_gpa(next, "P4D_HOLE"); |
| continue; |
| } |
| |
| err = ept_pud_range(pic, p4d, addr, next, walk); |
| if (err) |
| break; |
| } while (p4d++, addr = next, addr != end); |
| |
| return err; |
| } |
| |
| static int ept_pgd_range(struct page_idle_ctrl *pic, |
| pgd_t *pgd, |
| unsigned long addr, |
| unsigned long end, |
| struct mm_walk *walk) |
| { |
| p4d_t *p4d; |
| unsigned long next; |
| int err = 0; |
| |
| pgd = pgd_offset_pgd(pgd, addr); |
| do { |
| next = pgd_addr_end(addr, end); |
| if (!ept_pgd_present(*pgd)) { |
| set_restart_gpa(next, "PGD_HOLE"); |
| continue; |
| } |
| |
| p4d = (p4d_t *)pgd_page_vaddr(*pgd); |
| err = ept_p4d_range(pic, p4d, addr, next, walk); |
| if (err) |
| break; |
| } while (pgd++, addr = next, addr != end); |
| |
| return err; |
| } |
| |
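| /* |
| * Walk the guest physical range [addr, end) through vcpu0's EPT root. |
| * Takes pic->kvm->mmu_lock; the lock may be dropped via |
| * vm_walk_host_range(), in which case RET_RESCAN_FLAG is set. |
| */ |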
| static int ept_page_range(struct page_idle_ctrl *pic, |
| unsigned long addr, |
| unsigned long end, |
| struct mm_walk *walk) |
| { |
| struct kvm_vcpu *vcpu; |
| struct kvm_mmu *mmu; |
| uint64_t *ept_root; |
| int err = 0; |
| |
| WARN_ON(addr >= end); |
| |
| spin_lock_irq(&pic->kvm->mmu_lock); |
| |
| vcpu = kvm_get_vcpu(pic->kvm, 0); |
| if (!vcpu) { |
| pic->gpa_to_hva = 0; |
| set_restart_gpa(TASK_SIZE, "NO-VCPU"); |
| spin_unlock_irq(&pic->kvm->mmu_lock); |
| return -EINVAL; |
| } |
| |
| mmu = kvm_arch_mmu_pointer(vcpu); |
| if (!VALID_PAGE(mmu->root_hpa)) { |
| pic->gpa_to_hva = 0; |
| set_restart_gpa(TASK_SIZE, "NO-HPA"); |
| spin_unlock_irq(&pic->kvm->mmu_lock); |
| return -EINVAL; |
| } |
| |
| ept_root = __va(mmu->root_hpa); |
| |
| /* |
| * Start the walk at the p4d level when the VM uses 4-level page tables; |
| * with 5-level paging, start from the pgd level. |
| */ |
| if (mmu->shadow_root_level != 4) |
| err = ept_pgd_range(pic, (pgd_t *)ept_root, addr, end, walk); |
| else |
| err = ept_p4d_range(pic, (p4d_t *)ept_root, addr, end, walk); |
| |
| /* |
| * vm_walk_host_range() drops mmu_lock and sets RET_RESCAN_FLAG in the |
| * return value; only unlock here when that flag is not set. |
| */ |
| if (!(err & RET_RESCAN_FLAG)) |
| spin_unlock_irq(&pic->kvm->mmu_lock); |
| else |
| err &= ~RET_RESCAN_FLAG; |
| |
| return err; |
| } |
| |
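| /* Check that the CPU tracks EPT accessed/dirty bits and uses a supported EPT level. */ |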
| static int ept_idle_supports_cpu(struct kvm *kvm) |
| { |
| struct kvm_vcpu *vcpu; |
| struct kvm_mmu *mmu; |
| int ret; |
| |
| vcpu = kvm_get_vcpu(kvm, 0); |
| if (!vcpu) |
| return -EINVAL; |
| |
| spin_lock(&kvm->mmu_lock); |
| mmu = kvm_arch_mmu_pointer(vcpu); |
| if (kvm_mmu_ad_disabled(mmu)) { |
| printk(KERN_NOTICE "CPU does not support EPT A/D bits tracking\n"); |
| ret = -EINVAL; |
| } else if (mmu->shadow_root_level < 4 || |
| (mmu->shadow_root_level == 5 && !pgtable_l5_enabled())) { |
| printk(KERN_NOTICE "Unsupported EPT level %d\n", mmu->shadow_root_level); |
| ret = -EINVAL; |
| } else |
| ret = 0; |
| spin_unlock(&kvm->mmu_lock); |
| |
| return ret; |
| } |
| |
| #else |
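| /* CONFIG_ARM64: scan the stage-2 (IPA) page tables directly. */ |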
| static int arm_pte_range(struct page_idle_ctrl *pic, |
| pmd_t *pmd, unsigned long addr, unsigned long end) |
| { |
| pte_t *pte; |
| enum ProcIdlePageType page_type; |
| int err = 0; |
| |
| pte = pte_offset_kernel(pmd, addr); |
| do { |
| if (!pte_present(*pte)) |
| page_type = PTE_HOLE; |
| else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, |
| (unsigned long *) &pte->pte)) |
| page_type = PTE_IDLE; |
| else |
| page_type = PTE_ACCESSED; |
| |
| err = pic_add_page(pic, addr, addr + PAGE_SIZE, page_type); |
| if (err) |
| break; |
| } while (pte++, addr += PAGE_SIZE, addr != end); |
| |
| return err; |
| } |
| |
| static int arm_pmd_range(struct page_idle_ctrl *pic, |
| pud_t *pud, unsigned long addr, unsigned long end) |
| { |
| pmd_t *pmd; |
| unsigned long next; |
| enum ProcIdlePageType page_type; |
| enum ProcIdlePageType pte_page_type; |
| int err = 0; |
| |
| if (pic->flags & SCAN_HUGE_PAGE) |
| pte_page_type = PMD_IDLE_PTES; |
| else |
| pte_page_type = IDLE_PAGE_TYPE_MAX; |
| |
| pmd = pmd_offset(pud, addr); |
| do { |
| next = pmd_addr_end(addr, end); |
| if (!pmd_present(*pmd)) |
| page_type = PMD_HOLE; |
| else if (!if_pmd_thp_or_huge(*pmd)) |
| page_type = pte_page_type; |
| else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, |
| (unsigned long *)pmd)) |
| page_type = PMD_IDLE; |
| else |
| page_type = PMD_ACCESSED; |
| |
| if (page_type != IDLE_PAGE_TYPE_MAX) |
| err = pic_add_page(pic, addr, next, page_type); |
| else |
| err = arm_pte_range(pic, pmd, addr, next); |
| if (err) |
| break; |
| } while (pmd++, addr = next, addr != end); |
| |
| return err; |
| } |
| |
| static int arm_pud_range(struct page_idle_ctrl *pic, |
| p4d_t *p4d, unsigned long addr, unsigned long end) |
| { |
| pud_t *pud; |
| unsigned long next; |
| int err = 0; |
| |
| pud = pud_offset(p4d, addr); |
| do { |
| next = pud_addr_end(addr, end); |
| if (!pud_present(*pud)) { |
| set_restart_gpa(next, "PUD_HOLE"); |
| continue; |
| } |
| |
| if (if_pud_huge(*pud)) |
| err = pic_add_page(pic, addr, next, PUD_PRESENT); |
| else |
| err = arm_pmd_range(pic, pud, addr, next); |
| if (err) |
| break; |
| } while (pud++, addr = next, addr != end); |
| |
| return err; |
| } |
| |
| static int arm_p4d_range(struct page_idle_ctrl *pic, |
| pgd_t *pgd, unsigned long addr, unsigned long end) |
| { |
| p4d_t *p4d; |
| unsigned long next; |
| int err = 0; |
| |
| p4d = p4d_offset(pgd, addr); |
| do { |
| next = p4d_addr_end(addr, end); |
| if (!p4d_present(*p4d)) { |
| set_restart_gpa(next, "P4D_HOLE"); |
| continue; |
| } |
| |
| err = arm_pud_range(pic, p4d, addr, next); |
| if (err) |
| break; |
| } while (p4d++, addr = next, addr != end); |
| |
| return err; |
| } |
| |
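| /* |
| * Walk the guest physical range [addr, end) through the VM's stage-2 page |
| * tables, clearing accessed bits as we go. |
| */ |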
| static int arm_page_range(struct page_idle_ctrl *pic, |
| unsigned long addr, |
| unsigned long end) |
| { |
| pgd_t *pgd; |
| unsigned long next; |
| struct kvm *kvm = pic->kvm; |
| int err = 0; |
| |
| WARN_ON(addr >= end); |
| |
| spin_lock(&pic->kvm->mmu_lock); |
| pgd = (pgd_t *)kvm->arch.mmu.pgt->pgd + pgd_index(addr); |
| spin_unlock(&pic->kvm->mmu_lock); |
| |
| local_irq_disable(); |
| do { |
| next = stage2_pgd_addr_end(kvm, addr, end); |
| if (!pgd_present(*pgd)) { |
| set_restart_gpa(next, "PGD_HOLE"); |
| continue; |
| } |
| |
| err = arm_p4d_range(pic, pgd, addr, next); |
| if (err) |
| break; |
| } while (pgd++, addr = next, addr != end); |
| |
| local_irq_enable(); |
| return err; |
| } |
| #endif |
| |
| /* |
| * Depending on whether hva falls in a memslot: |
| * |
| * 1) found => return gpa and remaining memslot size in *addr_range |
| * |
| * |<----- addr_range --------->| |
| * [ mem slot ] |
| * ^hva |
| * |
| * 2) not found => return hole size in *addr_range |
| * |
| * |<----- addr_range --------->| |
| * [first mem slot above hva ] |
| * ^hva |
| * |
| * If hva is above all mem slots, *addr_range will be ~0UL. |
| * We can finish read(2). |
| */ |
| static unsigned long vm_idle_find_gpa(struct page_idle_ctrl *pic, |
| unsigned long hva, |
| unsigned long *addr_range) |
| { |
| struct kvm *kvm = pic->kvm; |
| struct kvm_memslots *slots; |
| struct kvm_memory_slot *memslot; |
| unsigned long hva_end; |
| gfn_t gfn; |
| |
| *addr_range = ~0UL; |
| mutex_lock(&kvm->slots_lock); |
| slots = kvm_memslots(pic->kvm); |
| kvm_for_each_memslot(memslot, slots) { |
| hva_end = memslot->userspace_addr + |
| (memslot->npages << PAGE_SHIFT); |
| |
| if (hva >= memslot->userspace_addr && hva < hva_end) { |
| gpa_t gpa; |
| |
| gfn = hva_to_gfn_memslot(hva, memslot); |
| *addr_range = hva_end - hva; |
| gpa = gfn_to_gpa(gfn); |
| mutex_unlock(&kvm->slots_lock); |
| return gpa; |
| } |
| |
| if (memslot->userspace_addr > hva) |
| *addr_range = min(*addr_range, |
| memslot->userspace_addr - hva); |
| } |
| mutex_unlock(&kvm->slots_lock); |
| return INVALID_PAGE; |
| } |
| |
| static inline unsigned long mask_to_size(unsigned long mask) |
| { |
| return ~mask + 1; |
| } |
| |
| static int mm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask, |
| unsigned long addr, unsigned long next, |
| struct mm_walk *walk); |
| static int vm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask, |
| unsigned long addr, unsigned long next, |
| struct mm_walk *walk) |
| { |
| struct page_idle_ctrl *pic = walk->private; |
| enum ProcIdlePageType page_type; |
| |
| pic->flags |= VM_SCAN_HOST; |
| |
| /* |
| * A hugetlb page table entry of the VM may be non-present even though the |
| * page is resident in the address_space. |
| */ |
| if (mask_to_size(hmask) != PUD_SIZE && !pte_present(*pte) && |
| vm_handle_pte_hole != NULL) { |
| page_type = vm_handle_pte_hole(addr, next, -1, walk); |
| if (page_type < IDLE_PAGE_TYPE_MAX) |
| return pic_add_page(pic, addr, next, page_type); |
| } |
| |
| return mm_idle_hugetlb_entry(pte, hmask, addr, next, walk); |
| } |
| |
| static int vm_idle_pte_hole(unsigned long addr, unsigned long next, int depth, struct mm_walk *walk) |
| { |
| struct page_idle_ctrl *pic = walk->private; |
| enum ProcIdlePageType pagetype; |
| |
| if (vm_handle_pte_hole == NULL) |
| return 0; |
| |
| pagetype = vm_handle_pte_hole(addr, next, depth, walk); |
| if (pagetype >= IDLE_PAGE_TYPE_MAX) |
| return 0; |
| |
| debug_printk("scan pte hole addr %pK type %d\n", addr, pagetype); |
| pic->flags |= VM_SCAN_HOST; |
| return pic_add_page(pic, addr, next, pagetype); |
| } |
| |
| static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, |
| unsigned long next, struct mm_walk *walk); |
| static int vm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, |
| unsigned long next, struct mm_walk *walk) |
| { |
| struct page_idle_ctrl *pic = walk->private; |
| |
| pic->flags |= VM_SCAN_HOST; |
| return mm_idle_pmd_entry(pmd, addr, next, walk); |
| } |
| |
| static int mm_idle_pud_entry(pud_t *pud, unsigned long addr, |
| unsigned long next, struct mm_walk *walk); |
| static int vm_idle_pud_entry(pud_t *pud, unsigned long addr, |
| unsigned long next, struct mm_walk *walk) |
| { |
| struct page_idle_ctrl *pic = walk->private; |
| |
| pic->flags |= VM_SCAN_HOST; |
| return mm_idle_pud_entry(pud, addr, next, walk); |
| } |
| |
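| /* |
| * Scan the HVA range [start, end): translate each chunk to guest physical |
| * addresses via the memslots, walk it in walk_step-page steps, and copy |
| * the results to userspace as the kernel buffer fills. |
| */ |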
| static int vm_idle_walk_hva_range(struct page_idle_ctrl *pic, |
| unsigned long start, unsigned long end, |
| struct mm_walk *walk) |
| { |
| unsigned long gpa_addr; |
| unsigned long gpa_next; |
| unsigned long gpa_end; |
| unsigned long addr_range; |
| unsigned long va_end; |
| int ret; |
| int steps; |
| |
| #ifdef CONFIG_X86_64 |
| ret = ept_idle_supports_cpu(pic->kvm); |
| if (ret) |
| return ret; |
| #endif |
| |
| ret = init_page_idle_ctrl_buffer(pic); |
| if (ret) |
| return ret; |
| |
| for (; start < end;) { |
| gpa_addr = vm_idle_find_gpa(pic, start, &addr_range); |
| |
| if (gpa_addr == INVALID_PAGE) { |
| pic->gpa_to_hva = 0; |
| if (addr_range == ~0UL) { |
| set_restart_gpa(TASK_SIZE, "EOF"); |
| va_end = end; |
| } else { |
| start += addr_range; |
| set_restart_gpa(start, "OUT-OF-SLOT"); |
| va_end = start; |
| } |
| } else { |
| pic->gpa_to_hva = start - gpa_addr; |
| gpa_end = gpa_addr + addr_range; |
| steps = 0; |
| for (; gpa_addr < gpa_end;) { |
| gpa_next = min(gpa_end, gpa_addr + walk_step * PAGE_SIZE); |
| #ifdef CONFIG_ARM64 |
| ret = arm_page_range(pic, gpa_addr, gpa_next); |
| #else |
| ret = ept_page_range(pic, gpa_addr, gpa_next, walk); |
| #endif |
| gpa_addr = pic->restart_gpa; |
| |
| if (ret) |
| break; |
| |
| if (++steps >= resched_step) { |
| cond_resched(); |
| steps = 0; |
| } |
| } |
| va_end = pic->gpa_to_hva + gpa_end; |
| } |
| |
| start = pic->restart_gpa + pic->gpa_to_hva; |
| ret = page_idle_copy_user(pic, start, va_end); |
| if (ret) |
| break; |
| } |
| |
| if (start > pic->next_hva) |
| set_next_hva(start, "NEXT-START"); |
| |
| if (pic->bytes_copied) |
| ret = 0; |
| return ret; |
| } |
| |
| static int mm_idle_test_walk(unsigned long start, unsigned long end, |
| struct mm_walk *walk); |
| static ssize_t vm_idle_read(struct file *file, char *buf, |
| size_t count, loff_t *ppos) |
| { |
| struct mm_struct *mm = file->private_data; |
| struct mm_walk mm_walk = {}; |
| struct mm_walk_ops mm_walk_ops = {}; |
| struct page_idle_ctrl *pic; |
| unsigned long hva_start = *ppos; |
| unsigned long hva_end = hva_start + (count << (3 + PAGE_SHIFT)); |
| int ret; |
| |
| pic = kzalloc(sizeof(*pic), GFP_KERNEL); |
| if (!pic) |
| return -ENOMEM; |
| |
| setup_page_idle_ctrl(pic, buf, count, file->f_flags); |
| pic->kvm = mm_kvm(mm); |
| |
| mm_walk_ops.pmd_entry = vm_idle_pmd_entry; |
| mm_walk_ops.pud_entry = vm_idle_pud_entry; |
| mm_walk_ops.hugetlb_entry = vm_idle_hugetlb_entry; |
| mm_walk_ops.pte_hole = vm_idle_pte_hole; |
| mm_walk_ops.test_walk = mm_idle_test_walk; |
| |
| mm_walk.mm = mm; |
| mm_walk.ops = &mm_walk_ops; |
| mm_walk.private = pic; |
| |
| ret = vm_idle_walk_hva_range(pic, hva_start, hva_end, &mm_walk); |
| if (ret) |
| goto out_kvm; |
| |
| ret = pic->bytes_copied; |
| *ppos = pic->next_hva; |
| out_kvm: |
| kfree(pic); |
| return ret; |
| |
| } |
| |
| static ssize_t mm_idle_read(struct file *file, char *buf, |
| size_t count, loff_t *ppos); |
| |
| static ssize_t page_scan_read(struct file *file, char *buf, |
| size_t count, loff_t *ppos) |
| { |
| struct mm_struct *mm = file->private_data; |
| unsigned long hva_start = *ppos; |
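| /* count << (3 + PAGE_SHIFT) bounds the scanned HVA window to count * 8 * PAGE_SIZE, i.e. eight pages per byte of user buffer. */ |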
| unsigned long hva_end = hva_start + (count << (3 + PAGE_SHIFT)); |
| |
| if ((hva_start >= TASK_SIZE) || (hva_end >= TASK_SIZE)) { |
| debug_printk("page_idle_read past TASK_SIZE: %pK %pK %lx\n", |
| hva_start, hva_end, TASK_SIZE); |
| return 0; |
| } |
| if (hva_end <= hva_start) { |
| debug_printk("page_idle_read past EOF: %pK %pK\n", |
| hva_start, hva_end); |
| return 0; |
| } |
| if (*ppos & (PAGE_SIZE - 1)) { |
| debug_printk("page_idle_read unaligned ppos: %pK\n", |
| hva_start); |
| return -EINVAL; |
| } |
| if (count < PAGE_IDLE_BUF_MIN) { |
| debug_printk("page_idle_read small count: %lx\n", |
| (unsigned long)count); |
| return -EINVAL; |
| } |
| |
| if (!mm_kvm(mm)) |
| return mm_idle_read(file, buf, count, ppos); |
| |
| return vm_idle_read(file, buf, count, ppos); |
| } |
| |
| static int page_scan_open(struct inode *inode, struct file *file) |
| { |
| if (!try_module_get(THIS_MODULE)) |
| return -EBUSY; |
| |
| return 0; |
| } |
| |
| static int page_scan_release(struct inode *inode, struct file *file) |
| { |
| struct mm_struct *mm = file->private_data; |
| struct kvm *kvm; |
| int ret = 0; |
| |
| if (!mm) { |
| ret = -EBADF; |
| goto out; |
| } |
| |
| kvm = mm_kvm(mm); |
| if (!kvm) { |
| ret = -EINVAL; |
| goto out; |
| } |
| #ifdef CONFIG_X86_64 |
| spin_lock(&kvm->mmu_lock); |
| kvm_flush_remote_tlbs(kvm); |
| spin_unlock(&kvm->mmu_lock); |
| #endif |
| |
| out: |
| module_put(THIS_MODULE); |
| return ret; |
| } |
| |
| static int mm_idle_pmd_large(pmd_t pmd) |
| { |
| #ifdef CONFIG_ARM64 |
| return if_pmd_thp_or_huge(pmd); |
| #else |
| return pmd_large(pmd); |
| #endif |
| } |
| |
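| /* Scan one PTE-level range of the host page table, clearing accessed bits. */ |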
| static int mm_idle_pte_range(struct page_idle_ctrl *pic, pmd_t *pmd, |
| unsigned long addr, unsigned long next) |
| { |
| enum ProcIdlePageType page_type; |
| pte_t *pte; |
| int err = 0; |
| |
| pte = pte_offset_kernel(pmd, addr); |
| do { |
| if (!pte_present(*pte)) |
| page_type = PTE_HOLE; |
| else if (pic->flags & SCAN_IGN_HOST) |
| page_type = PTE_IDLE; |
| else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, |
| (unsigned long *) &pte->pte)) |
| page_type = PTE_IDLE; |
| else { |
| page_type = PTE_ACCESSED; |
| } |
| |
| err = pic_add_page(pic, addr, addr + PAGE_SIZE, page_type); |
| if (err) |
| break; |
| } while (pte++, addr += PAGE_SIZE, addr != next); |
| |
| return err; |
| } |
| |
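| /* |
| * hugetlb entries: PUD-sized pages are always reported as PUD_PRESENT; |
| * PMD-sized pages are classified by their accessed bit. |
| */ |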
| static int mm_idle_hugetlb_entry(pte_t *pte, unsigned long hmask, |
| unsigned long addr, unsigned long next, |
| struct mm_walk *walk) |
| { |
| struct page_idle_ctrl *pic = walk->private; |
| enum ProcIdlePageType page_type; |
| unsigned long start = addr & hmask; /* hugepage may be splited in vm */ |
| int ret; |
| |
| if (mask_to_size(hmask) == PUD_SIZE) { |
| page_type = PUD_PRESENT; |
| goto add_page; |
| } |
| |
| if (!pte_present(*pte)) |
| page_type = PMD_HOLE; |
| else if (pic->flags & SCAN_IGN_HOST) |
| page_type = PMD_IDLE; |
| else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, (unsigned long *)pte)) |
| page_type = PMD_IDLE; |
| else |
| page_type = PMD_ACCESSED; |
| |
| add_page: |
| ret = pic_add_page(pic, start, start + pagetype_size[page_type], page_type); |
| return ret; |
| } |
| |
| static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, |
| unsigned long next, struct mm_walk *walk) |
| { |
| struct page_idle_ctrl *pic = walk->private; |
| enum ProcIdlePageType page_type; |
| enum ProcIdlePageType pte_page_type; |
| int err; |
| |
| /* |
| * Skip duplicate PMD_IDLE_PTES: when a PMD crosses a VMA boundary, |
| * walk_page_range() can visit the same PMD twice. |
| */ |
| if ((addr & PMD_MASK) == (pic->last_va & PMD_MASK) && (pic->flags & SCAN_HUGE_PAGE)) { |
| debug_printk("ignore duplicate addr %pK %pK\n", |
| addr, pic->last_va); |
| set_restart_gpa(round_up(next, PMD_SIZE), "DUP_ADDR"); |
| return 0; |
| } |
| pic->last_va = addr; |
| |
| if (pic->flags & SCAN_HUGE_PAGE) |
| pte_page_type = PMD_IDLE_PTES; |
| else |
| pte_page_type = IDLE_PAGE_TYPE_MAX; |
| |
| if (!pmd_present(*pmd)) |
| page_type = PMD_HOLE; |
| else if (!mm_idle_pmd_large(*pmd)) |
| page_type = pte_page_type; |
| else if (!test_and_clear_bit(_PAGE_MM_BIT_ACCESSED, |
| (unsigned long *)pmd) || |
| pic->flags & SCAN_IGN_HOST) |
| page_type = PMD_IDLE; |
| else |
| page_type = PMD_ACCESSED; |
| |
| if (page_type != IDLE_PAGE_TYPE_MAX) |
| err = pic_add_page(pic, addr, next, page_type); |
| else |
| err = mm_idle_pte_range(pic, pmd, addr, next); |
| |
| return err; |
| } |
| |
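| /* |
| * Report a transparent-huge PUD as PUD_PRESENT once per PUD, and let |
| * walk_page_range() descend into everything else (return 0). |
| */ |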
| static int mm_idle_pud_entry(pud_t *pud, unsigned long addr, |
| unsigned long next, struct mm_walk *walk) |
| { |
| struct page_idle_ctrl *pic = walk->private; |
| |
| spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma); |
| |
| if (ptl) { |
| if ((addr & PUD_MASK) != (pic->last_va & PUD_MASK)) { |
| pic_add_page(pic, addr, next, PUD_PRESENT); |
| pic->last_va = addr; |
| } |
| spin_unlock(ptl); |
| return 1; |
| } |
| |
| return 0; |
| } |
| |
| static int mm_idle_test_walk(unsigned long start, unsigned long end, |
| struct mm_walk *walk) |
| { |
| struct vm_area_struct *vma = walk->vma; |
| struct page_idle_ctrl *pic = walk->private; |
| |
| /* If VMA-targeted scanning is requested, skip any VMA not tagged with VM_SWAPFLAG. */ |
| if ((pic->flags & VMA_SCAN_FLAG) && !(vma->vm_flags & VM_SWAPFLAG)) |
| return 1; |
| |
| if (vma->vm_file) { |
| if (is_vm_hugetlb_page(vma)) |
| return 0; |
| if ((vma->vm_flags & (VM_WRITE|VM_MAYSHARE)) == VM_WRITE) |
| return 0; |
| return 1; |
| } |
| |
| return 0; |
| } |
| |
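| /* |
| * Scan the host virtual range [start, end) VMA by VMA, copying results to |
| * userspace whenever the kernel buffer fills up. |
| */ |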
| static int mm_idle_walk_range(struct page_idle_ctrl *pic, |
| unsigned long start, |
| unsigned long end, |
| struct mm_walk *walk) |
| { |
| struct vm_area_struct *vma; |
| int ret = 0; |
| |
| ret = init_page_idle_ctrl_buffer(pic); |
| if (ret) |
| return ret; |
| |
| for (; start < end;) { |
| down_read(&walk->mm->mmap_lock); |
| vma = find_vma(walk->mm, start); |
| if (vma) { |
| if (end > vma->vm_start) { |
| local_irq_disable(); |
| ret = walk_page_range(walk->mm, start, end, |
| walk->ops, walk->private); |
| local_irq_enable(); |
| } else |
| set_restart_gpa(vma->vm_start, "VMA-HOLE"); |
| } else |
| set_restart_gpa(TASK_SIZE, "EOF"); |
| up_read(&walk->mm->mmap_lock); |
| |
| WARN_ONCE(pic->gpa_to_hva, "non-zero gpa_to_hva"); |
| if (ret != PAGE_IDLE_KBUF_FULL && end > pic->restart_gpa) |
| pic->restart_gpa = end; |
| start = pic->restart_gpa; |
| ret = page_idle_copy_user(pic, start, end); |
| if (ret) |
| break; |
| } |
| |
| if (start > pic->next_hva) |
| set_next_hva(start, "NEXT-START"); |
| |
| if (pic->bytes_copied) { |
| if (ret != PAGE_IDLE_BUF_FULL && pic->next_hva < end) |
| debug_printk("partial scan: next_hva=%pK end=%pK\n", |
| pic->next_hva, end); |
| ret = 0; |
| } else |
| debug_printk("nothing read"); |
| return ret; |
| } |
| |
| static ssize_t mm_idle_read(struct file *file, char *buf, |
| size_t count, loff_t *ppos) |
| { |
| struct mm_struct *mm = file->private_data; |
| struct mm_walk_ops *mm_walk_ops = NULL; |
| struct mm_walk mm_walk = {}; |
| struct page_idle_ctrl *pic; |
| unsigned long va_start = *ppos; |
| unsigned long va_end = va_start + (count << (3 + PAGE_SHIFT)); |
| int ret; |
| |
| if (va_end <= va_start) { |
| debug_printk("%s past EOF: %pK %pK\n", |
| __func__, va_start, va_end); |
| return 0; |
| } |
| if (*ppos & (PAGE_SIZE - 1)) { |
| debug_printk("%s unaligned ppos: %pK\n", |
| __func__, va_start); |
| return -EINVAL; |
| } |
| if (count < PAGE_IDLE_BUF_MIN) { |
| debug_printk("%s small count: %lx\n", |
| __func__, (unsigned long)count); |
| return -EINVAL; |
| } |
| |
| pic = kzalloc(sizeof(*pic), GFP_KERNEL); |
| if (!pic) |
| return -ENOMEM; |
| |
| mm_walk_ops = kzalloc(sizeof(struct mm_walk_ops), GFP_KERNEL); |
| if (!mm_walk_ops) { |
| kfree(pic); |
| return -ENOMEM; |
| } |
| |
| setup_page_idle_ctrl(pic, buf, count, file->f_flags); |
| |
| mm_walk_ops->pmd_entry = mm_idle_pmd_entry; |
| mm_walk_ops->pud_entry = mm_idle_pud_entry; |
| mm_walk_ops->hugetlb_entry = mm_idle_hugetlb_entry; |
| mm_walk_ops->test_walk = mm_idle_test_walk; |
| |
| mm_walk.mm = mm; |
| mm_walk.ops = mm_walk_ops; |
| mm_walk.private = pic; |
| mm_walk.pgd = NULL; |
| mm_walk.no_vma = false; |
| ret = mm_idle_walk_range(pic, va_start, va_end, &mm_walk); |
| if (ret) |
| goto out_free; |
| |
| ret = pic->bytes_copied; |
| *ppos = pic->next_hva; |
| out_free: |
| kfree(pic); |
| kfree(mm_walk_ops); |
| return ret; |
| } |
| |
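| /* Add or remove scan flags; the flags are stored in filp->f_flags for later reads. */ |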
| static long page_scan_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) |
| { |
| void __user *argp = (void __user *)arg; |
| unsigned int flags; |
| |
| if (get_user(flags, (unsigned int __user *)argp)) |
| return -EFAULT; |
| flags &= ALL_SCAN_FLAGS; |
| |
| switch (cmd) { |
| case IDLE_SCAN_ADD_FLAGS: |
| filp->f_flags |= flags; |
| break; |
| case IDLE_SCAN_REMOVE_FLAGS: |
| filp->f_flags &= ~flags; |
| break; |
| case VMA_SCAN_ADD_FLAGS: |
| filp->f_flags |= flags; |
| break; |
| case VMA_SCAN_REMOVE_FLAGS: |
| filp->f_flags &= ~flags; |
| break; |
| default: |
| return -EOPNOTSUPP; |
| } |
| |
| return 0; |
| } |
| |
| extern struct file_operations proc_page_scan_operations; |
| |
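| /* |
| * proc_page_scan_operations is defined outside this module (see the extern |
| * declaration above). Its ->flock hook appears to be used as a lock/unlock |
| * pair around installing and removing the module's handlers. |
| */ |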
| static int page_scan_entry(void) |
| { |
| proc_page_scan_operations.flock(NULL, 1, NULL); |
| proc_page_scan_operations.owner = THIS_MODULE; |
| proc_page_scan_operations.read = page_scan_read; |
| proc_page_scan_operations.open = page_scan_open; |
| proc_page_scan_operations.release = page_scan_release; |
| proc_page_scan_operations.unlocked_ioctl = page_scan_ioctl; |
| proc_page_scan_operations.flock(NULL, 0, NULL); |
| |
| return 0; |
| } |
| |
| static void page_scan_exit(void) |
| { |
| proc_page_scan_operations.flock(NULL, 1, NULL); |
| proc_page_scan_operations.owner = NULL; |
| proc_page_scan_operations.read = NULL; |
| proc_page_scan_operations.open = NULL; |
| proc_page_scan_operations.release = NULL; |
| proc_page_scan_operations.unlocked_ioctl = NULL; |
| proc_page_scan_operations.flock(NULL, 0, NULL); |
| } |
| |
| MODULE_LICENSE("GPL"); |
| module_init(page_scan_entry); |
| module_exit(page_scan_exit); |