| // SPDX-License-Identifier: GPL-2.0-only |
| /* |
| * Kernel-based Virtual Machine driver for Linux |
| * |
| * derived from drivers/kvm/kvm_main.c |
| * |
| * Copyright (C) 2006 Qumranet, Inc. |
| * Copyright (C) 2008 Qumranet, Inc. |
| * Copyright IBM Corporation, 2008 |
| * Copyright 2010 Red Hat, Inc. and/or its affiliates. |
| * |
| * Authors: |
| * Avi Kivity <avi@qumranet.com> |
| * Yaniv Kamay <yaniv@qumranet.com> |
| * Amit Shah <amit.shah@qumranet.com> |
| * Ben-Ami Yassour <benami@il.ibm.com> |
| */ |
| |
| #include <linux/kvm_host.h> |
| #include "irq.h" |
| #include "ioapic.h" |
| #include "mmu.h" |
| #include "i8254.h" |
| #include "tss.h" |
| #include "kvm_cache_regs.h" |
| #include "kvm_emulate.h" |
| #include "x86.h" |
| #include "cpuid.h" |
| #include "pmu.h" |
| #include "hyperv.h" |
| #include "lapic.h" |
| |
| #include <linux/clocksource.h> |
| #include <linux/interrupt.h> |
| #include <linux/kvm.h> |
| #include <linux/fs.h> |
| #include <linux/vmalloc.h> |
| #include <linux/export.h> |
| #include <linux/moduleparam.h> |
| #include <linux/mman.h> |
| #include <linux/highmem.h> |
| #include <linux/iommu.h> |
| #include <linux/intel-iommu.h> |
| #include <linux/cpufreq.h> |
| #include <linux/user-return-notifier.h> |
| #include <linux/srcu.h> |
| #include <linux/slab.h> |
| #include <linux/perf_event.h> |
| #include <linux/uaccess.h> |
| #include <linux/hash.h> |
| #include <linux/pci.h> |
| #include <linux/timekeeper_internal.h> |
| #include <linux/pvclock_gtod.h> |
| #include <linux/kvm_irqfd.h> |
| #include <linux/irqbypass.h> |
| #include <linux/sched/stat.h> |
| #include <linux/sched/isolation.h> |
| #include <linux/mem_encrypt.h> |
| #include <linux/entry-kvm.h> |
| |
| #include <trace/events/kvm.h> |
| |
| #include <asm/debugreg.h> |
| #include <asm/msr.h> |
| #include <asm/desc.h> |
| #include <asm/mce.h> |
| #include <asm/pkru.h> |
| #include <linux/kernel_stat.h> |
| #include <asm/fpu/api.h> |
| #include <asm/fpu/xcr.h> |
| #include <asm/fpu/xstate.h> |
| #include <asm/pvclock.h> |
| #include <asm/div64.h> |
| #include <asm/irq_remapping.h> |
| #include <asm/mshyperv.h> |
| #include <asm/hypervisor.h> |
| #include <asm/tlbflush.h> |
| #include <asm/intel_pt.h> |
| #include <asm/emulate_prefix.h> |
| #include <asm/sgx.h> |
| #include <clocksource/hyperv_timer.h> |
| |
| #define CREATE_TRACE_POINTS |
| #include "trace.h" |
| |
| #define MAX_IO_MSRS 256 |
| #define KVM_MAX_MCE_BANKS 32 |
| u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; |
| EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); |
| |
| #define emul_to_vcpu(ctxt) \ |
| ((struct kvm_vcpu *)(ctxt)->vcpu) |
| |
| /* EFER defaults: |
| * - enable syscall per default because its emulated by KVM |
| * - enable LME and LMA per default on 64 bit KVM |
| */ |
| #ifdef CONFIG_X86_64 |
| static |
| u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); |
| #else |
| static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); |
| #endif |
| |
| static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; |
| |
| #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ |
| KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) |
| |
/* Forward declarations for static helpers defined later in this file. */

/* Event injection / delivery. */
static void process_nmi(struct kvm_vcpu *vcpu);
static void process_smi(struct kvm_vcpu *vcpu);
static void enter_smm(struct kvm_vcpu *vcpu);
static void update_cr8_intercept(struct kvm_vcpu *vcpu);

/* Register state helpers. */
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
static void store_regs(struct kvm_vcpu *vcpu);
static int sync_regs(struct kvm_vcpu *vcpu);
| |
| struct kvm_x86_ops kvm_x86_ops __read_mostly; |
| EXPORT_SYMBOL_GPL(kvm_x86_ops); |
| |
| static bool __read_mostly ignore_msrs = 0; |
| module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); |
| |
| bool __read_mostly report_ignored_msrs = true; |
| module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR); |
| EXPORT_SYMBOL_GPL(report_ignored_msrs); |
| |
| unsigned int min_timer_period_us = 200; |
| module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); |
| |
| static bool __read_mostly kvmclock_periodic_sync = true; |
| module_param(kvmclock_periodic_sync, bool, S_IRUGO); |
| |
| bool __read_mostly kvm_has_tsc_control; |
| EXPORT_SYMBOL_GPL(kvm_has_tsc_control); |
| u32 __read_mostly kvm_max_guest_tsc_khz; |
| EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); |
| u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits; |
| EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits); |
| u64 __read_mostly kvm_max_tsc_scaling_ratio; |
| EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); |
| u64 __read_mostly kvm_default_tsc_scaling_ratio; |
| EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio); |
| bool __read_mostly kvm_has_bus_lock_exit; |
| EXPORT_SYMBOL_GPL(kvm_has_bus_lock_exit); |
| bool __read_mostly kvm_has_notify_vmexit; |
| EXPORT_SYMBOL_GPL(kvm_has_notify_vmexit); |
| |
| /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ |
| static u32 __read_mostly tsc_tolerance_ppm = 250; |
| module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); |
| |
| /* |
| * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables |
| * adaptive tuning starting from default advancment of 1000ns. '0' disables |
| * advancement entirely. Any other value is used as-is and disables adaptive |
| * tuning, i.e. allows priveleged userspace to set an exact advancement time. |
| */ |
| static int __read_mostly lapic_timer_advance_ns = -1; |
| module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR); |
| |
| static bool __read_mostly vector_hashing = true; |
| module_param(vector_hashing, bool, S_IRUGO); |
| |
| bool __read_mostly enable_vmware_backdoor = false; |
| module_param(enable_vmware_backdoor, bool, S_IRUGO); |
| EXPORT_SYMBOL_GPL(enable_vmware_backdoor); |
| |
| static bool __read_mostly force_emulation_prefix = false; |
| module_param(force_emulation_prefix, bool, S_IRUGO); |
| |
| int __read_mostly pi_inject_timer = -1; |
| module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); |
| |
| /* |
| * Restoring the host value for MSRs that are only consumed when running in |
| * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU |
| * returns to userspace, i.e. the kernel can run with the guest's value. |
| */ |
| #define KVM_MAX_NR_USER_RETURN_MSRS 16 |
| |
| struct kvm_user_return_msrs_global { |
| int nr; |
| u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS]; |
| }; |
| |
| struct kvm_user_return_msrs { |
| struct user_return_notifier urn; |
| bool registered; |
| struct kvm_user_return_msr_values { |
| u64 host; |
| u64 curr; |
| } values[KVM_MAX_NR_USER_RETURN_MSRS]; |
| }; |
| |
| static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global; |
| static struct kvm_user_return_msrs __percpu *user_return_msrs; |
| |
| #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ |
| | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ |
| | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ |
| | XFEATURE_MASK_PKRU) |
| |
| u64 __read_mostly host_efer; |
| EXPORT_SYMBOL_GPL(host_efer); |
| |
| bool __read_mostly allow_smaller_maxphyaddr = 0; |
| EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); |
| |
| static u64 __read_mostly host_xss; |
| u64 __read_mostly supported_xss; |
| EXPORT_SYMBOL_GPL(supported_xss); |
| |
| struct kvm_stats_debugfs_item debugfs_entries[] = { |
| VCPU_STAT("pf_fixed", pf_fixed), |
| VCPU_STAT("pf_guest", pf_guest), |
| VCPU_STAT("tlb_flush", tlb_flush), |
| VCPU_STAT("invlpg", invlpg), |
| VCPU_STAT("exits", exits), |
| VCPU_STAT("io_exits", io_exits), |
| VCPU_STAT("mmio_exits", mmio_exits), |
| VCPU_STAT("signal_exits", signal_exits), |
| VCPU_STAT("irq_window", irq_window_exits), |
| VCPU_STAT("nmi_window", nmi_window_exits), |
| VCPU_STAT("halt_exits", halt_exits), |
| VCPU_STAT("halt_successful_poll", halt_successful_poll), |
| VCPU_STAT("halt_attempted_poll", halt_attempted_poll), |
| VCPU_STAT("halt_poll_invalid", halt_poll_invalid), |
| VCPU_STAT("halt_wakeup", halt_wakeup), |
| VCPU_STAT("hypercalls", hypercalls), |
| VCPU_STAT("request_irq", request_irq_exits), |
| VCPU_STAT("irq_exits", irq_exits), |
| VCPU_STAT("host_state_reload", host_state_reload), |
| VCPU_STAT("fpu_reload", fpu_reload), |
| VCPU_STAT("insn_emulation", insn_emulation), |
| VCPU_STAT("insn_emulation_fail", insn_emulation_fail), |
| VCPU_STAT("irq_injections", irq_injections), |
| VCPU_STAT("nmi_injections", nmi_injections), |
| VCPU_STAT("req_event", req_event), |
| VCPU_STAT("l1d_flush", l1d_flush), |
| VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), |
| VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), |
| VCPU_STAT("preemption_reported", preemption_reported), |
| VCPU_STAT("preemption_other", preemption_other), |
| VCPU_STAT("notify_window_exits", notify_window_exits), |
| VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), |
| VM_STAT("mmu_pte_write", mmu_pte_write), |
| VM_STAT("mmu_pde_zapped", mmu_pde_zapped), |
| VM_STAT("mmu_flooded", mmu_flooded), |
| VM_STAT("mmu_recycled", mmu_recycled), |
| VM_STAT("mmu_cache_miss", mmu_cache_miss), |
| VM_STAT("mmu_unsync", mmu_unsync), |
| VM_STAT("remote_tlb_flush", remote_tlb_flush), |
| VM_STAT("largepages", lpages, .mode = 0444), |
| VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444), |
| VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions), |
| { "vcpu_stat", 0, KVM_STAT_DFX }, |
| { NULL } |
| }; |
| |
| /* debugfs entries of Detail For vcpu stat EXtension */ |
| struct dfx_kvm_stats_debugfs_item dfx_debugfs_entries[] = { |
| DFX_STAT("pid", pid), |
| DFX_STAT("pf_fixed", pf_fixed), |
| DFX_STAT("pf_guest", pf_guest), |
| DFX_STAT("tlb_flush", tlb_flush), |
| DFX_STAT("invlpg", invlpg), |
| DFX_STAT("exits", exits), |
| DFX_STAT("io_exits", io_exits), |
| DFX_STAT("mmio_exits", mmio_exits), |
| DFX_STAT("signal_exits", signal_exits), |
| DFX_STAT("irq_window", irq_window_exits), |
| DFX_STAT("nmi_window", nmi_window_exits), |
| DFX_STAT("halt_exits", halt_exits), |
| DFX_STAT("halt_successful_poll", halt_successful_poll), |
| DFX_STAT("halt_attempted_poll", halt_attempted_poll), |
| DFX_STAT("halt_wakeup", halt_wakeup), |
| DFX_STAT("request_irq", request_irq_exits), |
| DFX_STAT("irq_exits", irq_exits), |
| DFX_STAT("host_state_reload", host_state_reload), |
| DFX_STAT("fpu_reload", fpu_reload), |
| DFX_STAT("insn_emulation", insn_emulation), |
| DFX_STAT("insn_emulation_fail", insn_emulation_fail), |
| DFX_STAT("hypercalls", hypercalls), |
| DFX_STAT("irq_injections", irq_injections), |
| DFX_STAT("nmi_injections", nmi_injections), |
| DFX_STAT("cr_exits", cr_exits), |
| DFX_STAT("msr_rd_exits", msr_rd_exits), |
| DFX_STAT("msr_wr_exits", msr_wr_exits), |
| DFX_STAT("apic_wr_exits", apic_wr_exits), |
| DFX_STAT("ept_vio_exits", ept_vio_exits), |
| DFX_STAT("ept_mis_exits", ept_mis_exits), |
| DFX_STAT("pause_exits", pause_exits), |
| DFX_STAT("steal", steal), |
| DFX_STAT("st_max", st_max), |
| DFX_STAT("utime", utime), |
| DFX_STAT("stime", stime), |
| DFX_STAT("gtime", gtime), |
| DFX_STAT("preemption_timer_exits", preemption_timer_exits), |
| DFX_STAT("preemption_reported", preemption_reported), |
| DFX_STAT("preemption_other", preemption_other), |
| { NULL } |
| }; |
| |
| u64 __read_mostly host_xcr0; |
| u64 __read_mostly supported_xcr0; |
| EXPORT_SYMBOL_GPL(supported_xcr0); |
| |
| static struct kmem_cache *x86_emulator_cache; |
| |
| /* |
| * When called, it means the previous get/set msr reached an invalid msr. |
| * Return true if we want to ignore/silent this failed msr access. |
| */ |
| static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr, |
| u64 data, bool write) |
| { |
| const char *op = write ? "wrmsr" : "rdmsr"; |
| |
| if (ignore_msrs) { |
| if (report_ignored_msrs) |
| kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", |
| op, msr, data); |
| /* Mask the error */ |
| return true; |
| } else { |
| kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n", |
| op, msr, data); |
| return false; |
| } |
| } |
| |
| static struct kmem_cache *kvm_alloc_emulator_cache(void) |
| { |
| unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src); |
| unsigned int size = sizeof(struct x86_emulate_ctxt); |
| |
| return kmem_cache_create_usercopy("x86_emulator", size, |
| __alignof__(struct x86_emulate_ctxt), |
| SLAB_ACCOUNT, useroffset, |
| size - useroffset, NULL); |
| } |
| |
| static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); |
| |
| static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) |
| { |
| int i; |
| for (i = 0; i < ASYNC_PF_PER_VCPU; i++) |
| vcpu->arch.apf.gfns[i] = ~0; |
| } |
| |
| static void kvm_on_user_return(struct user_return_notifier *urn) |
| { |
| unsigned slot; |
| struct kvm_user_return_msrs *msrs |
| = container_of(urn, struct kvm_user_return_msrs, urn); |
| struct kvm_user_return_msr_values *values; |
| unsigned long flags; |
| |
| /* |
| * Disabling irqs at this point since the following code could be |
| * interrupted and executed through kvm_arch_hardware_disable() |
| */ |
| local_irq_save(flags); |
| if (msrs->registered) { |
| msrs->registered = false; |
| user_return_notifier_unregister(urn); |
| } |
| local_irq_restore(flags); |
| for (slot = 0; slot < user_return_msrs_global.nr; ++slot) { |
| values = &msrs->values[slot]; |
| if (values->host != values->curr) { |
| wrmsrl(user_return_msrs_global.msrs[slot], values->host); |
| values->curr = values->host; |
| } |
| } |
| } |
| |
| int kvm_probe_user_return_msr(u32 msr) |
| { |
| u64 val; |
| int ret; |
| |
| preempt_disable(); |
| ret = rdmsrl_safe(msr, &val); |
| if (ret) |
| goto out; |
| ret = wrmsrl_safe(msr, val); |
| out: |
| preempt_enable(); |
| return ret; |
| } |
| EXPORT_SYMBOL_GPL(kvm_probe_user_return_msr); |
| |
| void kvm_define_user_return_msr(unsigned slot, u32 msr) |
| { |
| BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS); |
| user_return_msrs_global.msrs[slot] = msr; |
| if (slot >= user_return_msrs_global.nr) |
| user_return_msrs_global.nr = slot + 1; |
| } |
| EXPORT_SYMBOL_GPL(kvm_define_user_return_msr); |
| |
| static void kvm_user_return_msr_cpu_online(void) |
| { |
| unsigned int cpu = smp_processor_id(); |
| struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); |
| u64 value; |
| int i; |
| |
| for (i = 0; i < user_return_msrs_global.nr; ++i) { |
| rdmsrl_safe(user_return_msrs_global.msrs[i], &value); |
| msrs->values[i].host = value; |
| msrs->values[i].curr = value; |
| } |
| } |
| |
| int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) |
| { |
| unsigned int cpu = smp_processor_id(); |
| struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); |
| int err; |
| |
| value = (value & mask) | (msrs->values[slot].host & ~mask); |
| if (value == msrs->values[slot].curr) |
| return 0; |
| err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value); |
| if (err) |
| return 1; |
| |
| msrs->values[slot].curr = value; |
| if (!msrs->registered) { |
| msrs->urn.on_user_return = kvm_on_user_return; |
| user_return_notifier_register(&msrs->urn); |
| msrs->registered = true; |
| } |
| return 0; |
| } |
| EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); |
| |
| static void drop_user_return_notifiers(void) |
| { |
| unsigned int cpu = smp_processor_id(); |
| struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); |
| |
| if (msrs->registered) |
| kvm_on_user_return(&msrs->urn); |
| } |
| |
| u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) |
| { |
| return vcpu->arch.apic_base; |
| } |
| EXPORT_SYMBOL_GPL(kvm_get_apic_base); |
| |
| enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu) |
| { |
| return kvm_apic_mode(kvm_get_apic_base(vcpu)); |
| } |
| EXPORT_SYMBOL_GPL(kvm_get_apic_mode); |
| |
| int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) |
| { |
| enum lapic_mode old_mode = kvm_get_apic_mode(vcpu); |
| enum lapic_mode new_mode = kvm_apic_mode(msr_info->data); |
| u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) | 0x2ff | |
| (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE); |
| |
| if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID) |
| return 1; |
| if (!msr_info->host_initiated) { |
| if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC) |
| return 1; |
| if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC) |
| return 1; |
| } |
| |
| kvm_lapic_set_base(vcpu, msr_info->data); |
| kvm_recalculate_apic_map(vcpu->kvm); |
| return 0; |
| } |
| EXPORT_SYMBOL_GPL(kvm_set_apic_base); |
| |
| asmlinkage __visible noinstr void kvm_spurious_fault(void) |
| { |
| /* Fault while not rebooting. We want the trace. */ |
| BUG_ON(!kvm_rebooting); |
| } |
| EXPORT_SYMBOL_GPL(kvm_spurious_fault); |
| |
| #define EXCPT_BENIGN 0 |
| #define EXCPT_CONTRIBUTORY 1 |
| #define EXCPT_PF 2 |
| |
| static int exception_class(int vector) |
| { |
| switch (vector) { |
| case PF_VECTOR: |
| return EXCPT_PF; |
| case DE_VECTOR: |
| case TS_VECTOR: |
| case NP_VECTOR: |
| case SS_VECTOR: |
| case GP_VECTOR: |
| return EXCPT_CONTRIBUTORY; |
| default: |
| break; |
| } |
| return EXCPT_BENIGN; |
| } |
| |
| #define EXCPT_FAULT 0 |
| #define EXCPT_TRAP 1 |
| #define EXCPT_ABORT 2 |
| #define EXCPT_INTERRUPT 3 |
| |
| static int exception_type(int vector) |
| { |
| unsigned int mask; |
| |
| if (WARN_ON(vector > 31 || vector == NMI_VECTOR)) |
| return EXCPT_INTERRUPT; |
| |
| mask = 1 << vector; |
| |
| /* #DB is trap, as instruction watchpoints are handled elsewhere */ |
| if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR))) |
| return EXCPT_TRAP; |
| |
| if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR))) |
| return EXCPT_ABORT; |
| |
| /* Reserved exceptions will result in fault */ |
| return EXCPT_FAULT; |
| } |
| |
| void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) |
| { |
| unsigned nr = vcpu->arch.exception.nr; |
| bool has_payload = vcpu->arch.exception.has_payload; |
| unsigned long payload = vcpu->arch.exception.payload; |
| |
| if (!has_payload) |
| return; |
| |
| switch (nr) { |
| case DB_VECTOR: |
| /* |
| * "Certain debug exceptions may clear bit 0-3. The |
| * remaining contents of the DR6 register are never |
| * cleared by the processor". |
| */ |
| vcpu->arch.dr6 &= ~DR_TRAP_BITS; |
| /* |
| * In order to reflect the #DB exception payload in guest |
| * dr6, three components need to be considered: active low |
| * bit, FIXED_1 bits and active high bits (e.g. DR6_BD, |
| * DR6_BS and DR6_BT) |
| * DR6_ACTIVE_LOW contains the FIXED_1 and active low bits. |
| * In the target guest dr6: |
| * FIXED_1 bits should always be set. |
| * Active low bits should be cleared if 1-setting in payload. |
| * Active high bits should be set if 1-setting in payload. |
| * |
| * Note, the payload is compatible with the pending debug |
| * exceptions/exit qualification under VMX, that active_low bits |
| * are active high in payload. |
| * So they need to be flipped for DR6. |
| */ |
| vcpu->arch.dr6 |= DR6_ACTIVE_LOW; |
| vcpu->arch.dr6 |= payload; |
| vcpu->arch.dr6 ^= payload & DR6_ACTIVE_LOW; |
| |
| /* |
| * The #DB payload is defined as compatible with the 'pending |
| * debug exceptions' field under VMX, not DR6. While bit 12 is |
| * defined in the 'pending debug exceptions' field (enabled |
| * breakpoint), it is reserved and must be zero in DR6. |
| */ |
| vcpu->arch.dr6 &= ~BIT(12); |
| break; |
| case PF_VECTOR: |
| vcpu->arch.cr2 = payload; |
| break; |
| } |
| |
| vcpu->arch.exception.has_payload = false; |
| vcpu->arch.exception.payload = 0; |
| } |
| EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload); |
| |
| static void kvm_multiple_exception(struct kvm_vcpu *vcpu, |
| unsigned nr, bool has_error, u32 error_code, |
| bool has_payload, unsigned long payload, bool reinject) |
| { |
| u32 prev_nr; |
| int class1, class2; |
| |
| kvm_make_request(KVM_REQ_EVENT, vcpu); |
| |
| if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) { |
| queue: |
| if (reinject) { |
| /* |
| * On vmentry, vcpu->arch.exception.pending is only |
| * true if an event injection was blocked by |
| * nested_run_pending. In that case, however, |
| * vcpu_enter_guest requests an immediate exit, |
| * and the guest shouldn't proceed far enough to |
| * need reinjection. |
| */ |
| WARN_ON_ONCE(vcpu->arch.exception.pending); |
| vcpu->arch.exception.injected = true; |
| if (WARN_ON_ONCE(has_payload)) { |
| /* |
| * A reinjected event has already |
| * delivered its payload. |
| */ |
| has_payload = false; |
| payload = 0; |
| } |
| } else { |
| vcpu->arch.exception.pending = true; |
| vcpu->arch.exception.injected = false; |
| } |
| vcpu->arch.exception.has_error_code = has_error; |
| vcpu->arch.exception.nr = nr; |
| vcpu->arch.exception.error_code = error_code; |
| vcpu->arch.exception.has_payload = has_payload; |
| vcpu->arch.exception.payload = payload; |
| if (!is_guest_mode(vcpu)) |
| kvm_deliver_exception_payload(vcpu); |
| return; |
| } |
| |
| /* to check exception */ |
| prev_nr = vcpu->arch.exception.nr; |
| if (prev_nr == DF_VECTOR) { |
| /* triple fault -> shutdown */ |
| kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); |
| return; |
| } |
| class1 = exception_class(prev_nr); |
| class2 = exception_class(nr); |
| if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) |
| || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { |
| /* |
| * Generate double fault per SDM Table 5-5. Set |
| * exception.pending = true so that the double fault |
| * can trigger a nested vmexit. |
| */ |
| vcpu->arch.exception.pending = true; |
| vcpu->arch.exception.injected = false; |
| vcpu->arch.exception.has_error_code = true; |
| vcpu->arch.exception.nr = DF_VECTOR; |
| vcpu->arch.exception.error_code = 0; |
| vcpu->arch.exception.has_payload = false; |
| vcpu->arch.exception.payload = 0; |
| } else |
| /* replace previous exception with a new one in a hope |
| that instruction re-execution will regenerate lost |
| exception */ |
| goto queue; |
| } |
| |
| void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) |
| { |
| kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false); |
| } |
| EXPORT_SYMBOL_GPL(kvm_queue_exception); |
| |
| void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) |
| { |
| kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true); |
| } |
| EXPORT_SYMBOL_GPL(kvm_requeue_exception); |
| |
| void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, |
| unsigned long payload) |
| { |
| kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false); |
| } |
| EXPORT_SYMBOL_GPL(kvm_queue_exception_p); |
| |
| static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr, |
| u32 error_code, unsigned long payload) |
| { |
| kvm_multiple_exception(vcpu, nr, true, error_code, |
| true, payload, false); |
| } |
| |
/*
 * Finish an emulated instruction: if @err is zero, skip the instruction and
 * return the result of that; otherwise inject #GP(0) and return 1.
 */
int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
{
	if (!err)
		return kvm_skip_emulated_instruction(vcpu);

	kvm_inject_gp(vcpu, 0);
	return 1;
}
EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
| |
| void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) |
| { |
| ++vcpu->stat.pf_guest; |
| vcpu->arch.exception.nested_apf = |
| is_guest_mode(vcpu) && fault->async_page_fault; |
| if (vcpu->arch.exception.nested_apf) { |
| vcpu->arch.apf.nested_apf_token = fault->address; |
| kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); |
| } else { |
| kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code, |
| fault->address); |
| } |
| } |
| EXPORT_SYMBOL_GPL(kvm_inject_page_fault); |
| |
| bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, |
| struct x86_exception *fault) |
| { |
| struct kvm_mmu *fault_mmu; |
| WARN_ON_ONCE(fault->vector != PF_VECTOR); |
| |
| fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu : |
| vcpu->arch.walk_mmu; |
| |
| /* |
| * Invalidate the TLB entry for the faulting address, if it exists, |
| * else the access will fault indefinitely (and to emulate hardware). |
| */ |
| if ((fault->error_code & PFERR_PRESENT_MASK) && |
| !(fault->error_code & PFERR_RSVD_MASK)) |
| kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address, |
| fault_mmu->root_hpa); |
| |
| fault_mmu->inject_page_fault(vcpu, fault); |
| return fault->nested_page_fault; |
| } |
| EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault); |
| |
| void kvm_inject_nmi(struct kvm_vcpu *vcpu) |
| { |
| atomic_inc(&vcpu->arch.nmi_queued); |
| kvm_make_request(KVM_REQ_NMI, vcpu); |
| } |
| EXPORT_SYMBOL_GPL(kvm_inject_nmi); |
| |
| void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) |
| { |
| kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false); |
| } |
| EXPORT_SYMBOL_GPL(kvm_queue_exception_e); |
| |
| void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) |
| { |
| kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true); |
| } |
| EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); |
| |
| /* |
| * Checks if cpl <= required_cpl; if true, return true. Otherwise queue |
| * a #GP and return false. |
| */ |
| bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) |
| { |
| if (kvm_x86_ops.get_cpl(vcpu) <= required_cpl) |
| return true; |
| kvm_queue_exception_e(vcpu, GP_VECTOR, 0); |
| return false; |
| } |
| EXPORT_SYMBOL_GPL(kvm_require_cpl); |
| |
| bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr) |
| { |
| if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE)) |
| return true; |
| |
| kvm_queue_exception(vcpu, UD_VECTOR); |
| return false; |
| } |
| EXPORT_SYMBOL_GPL(kvm_require_dr); |
| |
| /* |
| * This function will be used to read from the physical memory of the currently |
| * running guest. The difference to kvm_vcpu_read_guest_page is that this function |
| * can read from guest physical or from the guest's guest physical memory. |
| */ |
| int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, |
| gfn_t ngfn, void *data, int offset, int len, |
| u32 access) |
| { |
| struct x86_exception exception; |
| gfn_t real_gfn; |
| gpa_t ngpa; |
| |
| ngpa = gfn_to_gpa(ngfn); |
| real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception); |
| if (real_gfn == UNMAPPED_GVA) |
| return -EFAULT; |
| |
| real_gfn = gpa_to_gfn(real_gfn); |
| |
| return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len); |
| } |
| EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); |
| |
| static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, |
| void *data, int offset, int len, u32 access) |
| { |
| return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, |
| data, offset, len, access); |
| } |
| |
| static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu) |
| { |
| return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) | rsvd_bits(5, 8) | |
| rsvd_bits(1, 2); |
| } |
| |
| /* |
| * Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise. |
| */ |
| int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) |
| { |
| gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; |
| unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; |
| int i; |
| int ret; |
| u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; |
| |
| ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte, |
| offset * sizeof(u64), sizeof(pdpte), |
| PFERR_USER_MASK|PFERR_WRITE_MASK); |
| if (ret < 0) { |
| ret = 0; |
| goto out; |
| } |
| for (i = 0; i < ARRAY_SIZE(pdpte); ++i) { |
| if ((pdpte[i] & PT_PRESENT_MASK) && |
| (pdpte[i] & pdptr_rsvd_bits(vcpu))) { |
| ret = 0; |
| goto out; |
| } |
| } |
| ret = 1; |
| |
| memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); |
| kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); |
| |
| out: |
| |
| return ret; |
| } |
| EXPORT_SYMBOL_GPL(load_pdptrs); |
| |
| bool pdptrs_changed(struct kvm_vcpu *vcpu) |
| { |
| u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; |
| int offset; |
| gfn_t gfn; |
| int r; |
| |
| if (!is_pae_paging(vcpu)) |
| return false; |
| |
| if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR)) |
| return true; |
| |
| gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT; |
| offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1); |
| r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), |
| PFERR_USER_MASK | PFERR_WRITE_MASK); |
| if (r < 0) |
| return true; |
| |
| return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; |
| } |
| EXPORT_SYMBOL_GPL(pdptrs_changed); |
| |
| int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) |
| { |
| unsigned long old_cr0 = kvm_read_cr0(vcpu); |
| unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG; |
| unsigned long update_bits = X86_CR0_PG | X86_CR0_WP; |
| |
| cr0 |= X86_CR0_ET; |
| |
| #ifdef CONFIG_X86_64 |
| if (cr0 & 0xffffffff00000000UL) |
| return 1; |
| #endif |
| |
| cr0 &= ~CR0_RESERVED_BITS; |
| |
| if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) |
| return 1; |
| |
| if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) |
| return 1; |
| |
| #ifdef CONFIG_X86_64 |
| if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) && |
| (cr0 & X86_CR0_PG)) { |
| int cs_db, cs_l; |
| |
| if (!is_pae(vcpu)) |
| return 1; |
| kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l); |
| if (cs_l) |
| return 1; |
| } |
| #endif |
| if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) && |
| is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) && |
| !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu))) |
| return 1; |
| |
| if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) |
| return 1; |
| |
| kvm_x86_ops.set_cr0(vcpu, cr0); |
| |
| if ((cr0 ^ old_cr0) & X86_CR0_PG) { |
| kvm_clear_async_pf_completion_queue(vcpu); |
| kvm_async_pf_hash_reset(vcpu); |
| } |
| |
| if ((cr0 ^ old_cr0) & update_bits) |
| kvm_mmu_reset_context(vcpu); |
| |
| if (((cr0 ^ old_cr0) & X86_CR0_CD) && |
| kvm_arch_has_noncoherent_dma(vcpu->kvm) && |
| !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) |
| kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); |
| |
| return 0; |
| } |
| EXPORT_SYMBOL_GPL(kvm_set_cr0); |
| |
/*
 * Emulate LMSW: combine the low four bits of @msw with the current CR0,
 * keeping every current CR0 bit except bits 3:1, and install the result.
 * The return value of kvm_set_cr0() is deliberately ignored.
 */
void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
	unsigned long cr0 = kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f);

	(void)kvm_set_cr0(vcpu, cr0);
}
EXPORT_SYMBOL_GPL(kvm_lmsw);
| |
| void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) |
| { |
| if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { |
| |
| if (vcpu->arch.xcr0 != host_xcr0) |
| xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); |
| |
| if (vcpu->arch.xsaves_enabled && |
| vcpu->arch.ia32_xss != host_xss) |
| wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); |
| } |
| |
| if (static_cpu_has(X86_FEATURE_PKU) && |
| (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || |
| (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) && |
| vcpu->arch.pkru != vcpu->arch.host_pkru) |
| write_pkru(vcpu->arch.pkru); |
| } |
| EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); |
| |
| void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) |
| { |
| if (static_cpu_has(X86_FEATURE_PKU) && |
| (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || |
| (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) { |
| vcpu->arch.pkru = rdpkru(); |
| if (vcpu->arch.pkru != vcpu->arch.host_pkru) |
| write_pkru(vcpu->arch.host_pkru); |
| } |
| |
| if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { |
| |
| if (vcpu->arch.xcr0 != host_xcr0) |
| xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); |
| |
| if (vcpu->arch.xsaves_enabled && |
| vcpu->arch.ia32_xss != host_xss) |
| wrmsrl(MSR_IA32_XSS, host_xss); |
| } |
| |
| } |
| EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); |
| |
| static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) |
| { |
| u64 xcr0 = xcr; |
| u64 old_xcr0 = vcpu->arch.xcr0; |
| u64 valid_bits; |
| |
| /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ |
| if (index != XCR_XFEATURE_ENABLED_MASK) |
| return 1; |
| if (!(xcr0 & XFEATURE_MASK_FP)) |
| return 1; |
| if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE)) |
| return 1; |
| |
| /* |
| * Do not allow the guest to set bits that we do not support |
| * saving. However, xcr0 bit 0 is always set, even if the |
| * emulated CPU does not support XSAVE (see fx_init). |
| */ |
| valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP; |
| if (xcr0 & ~valid_bits) |
| return 1; |
| |
| if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) != |
| (!(xcr0 & XFEATURE_MASK_BNDCSR))) |
| return 1; |
| |
| if (xcr0 & XFEATURE_MASK_AVX512) { |
| if (!(xcr0 & XFEATURE_MASK_YMM)) |
| return 1; |
| if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512) |
| return 1; |
| } |
| vcpu->arch.xcr0 = xcr0; |
| |
| if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) |
| kvm_update_cpuid_runtime(vcpu); |
| return 0; |
| } |
| |
| int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) |
| { |
| if (kvm_x86_ops.get_cpl(vcpu) != 0 || |
| __kvm_set_xcr(vcpu, index, xcr)) { |
| kvm_inject_gp(vcpu, 0); |
| return 1; |
| } |
| return 0; |
| } |
| EXPORT_SYMBOL_GPL(kvm_set_xcr); |
| |
| int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
| { |
| if (cr4 & cr4_reserved_bits) |
| return -EINVAL; |
| |
| if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) |
| return -EINVAL; |
| |
| return 0; |
| } |
| EXPORT_SYMBOL_GPL(kvm_valid_cr4); |
| |
| int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
| { |
| unsigned long old_cr4 = kvm_read_cr4(vcpu); |
| unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | |
| X86_CR4_SMEP; |
| unsigned long mmu_role_bits = pdptr_bits | X86_CR4_SMAP | X86_CR4_PKE; |
| |
| if (kvm_valid_cr4(vcpu, cr4)) |
| return 1; |
| |
| if (is_long_mode(vcpu)) { |
| if (!(cr4 & X86_CR4_PAE)) |
| return 1; |
| if ((cr4 ^ old_cr4) & X86_CR4_LA57) |
| return 1; |
| } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) |
| && ((cr4 ^ old_cr4) & pdptr_bits) |
| && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, |
| kvm_read_cr3(vcpu))) |
| return 1; |
| |
| if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { |
| if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID)) |
| return 1; |
| |
| /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ |
| if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) |
| return 1; |
| } |
| |
| if (kvm_x86_ops.set_cr4(vcpu, cr4)) |
| return 1; |
| |
| if (((cr4 ^ old_cr4) & mmu_role_bits) || |
| (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) |
| kvm_mmu_reset_context(vcpu); |
| |
| if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) |
| kvm_update_cpuid_runtime(vcpu); |
| |
| return 0; |
| } |
| EXPORT_SYMBOL_GPL(kvm_set_cr4); |
| |
| int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) |
| { |
| bool skip_tlb_flush = false; |
| #ifdef CONFIG_X86_64 |
| bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); |
| |
| if (pcid_enabled) { |
| skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH; |
| cr3 &= ~X86_CR3_PCID_NOFLUSH; |
| } |
| #endif |
| |
| if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { |
| if (!skip_tlb_flush) { |
| kvm_mmu_sync_roots(vcpu); |
| kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); |
| } |
| return 0; |
| } |
| |
| if (is_long_mode(vcpu) && |
| (cr3 & vcpu->arch.cr3_lm_rsvd_bits)) |
| return 1; |
| else if (is_pae_paging(vcpu) && |
| !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) |
| return 1; |
| |
| kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush); |
| vcpu->arch.cr3 = cr3; |
| kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); |
| |
| return 0; |
| } |
| EXPORT_SYMBOL_GPL(kvm_set_cr3); |
| |
| int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) |
| { |
| if (cr8 & CR8_RESERVED_BITS) |
| return 1; |
| if (lapic_in_kernel(vcpu)) |
| kvm_lapic_set_tpr(vcpu, cr8); |
| else |
| vcpu->arch.cr8 = cr8; |
| return 0; |
| } |
| EXPORT_SYMBOL_GPL(kvm_set_cr8); |
| |
| unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) |
| { |
| if (lapic_in_kernel(vcpu)) |
| return kvm_lapic_get_cr8(vcpu); |
| else |
| return vcpu->arch.cr8; |
| } |
| EXPORT_SYMBOL_GPL(kvm_get_cr8); |
| |
| static void kvm_update_dr0123(struct kvm_vcpu *vcpu) |
| { |
| int i; |
| |
| if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { |
| for (i = 0; i < KVM_NR_DB_REGS; i++) |
| vcpu->arch.eff_db[i] = vcpu->arch.db[i]; |
| vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; |
| } |
| } |
| |
| void kvm_update_dr7(struct kvm_vcpu *vcpu) |
| { |
| unsigned long dr7; |
| |
| if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) |
| dr7 = vcpu->arch.guest_debug_dr7; |
| else |
| dr7 = vcpu->arch.dr7; |
| kvm_x86_ops.set_dr7(vcpu, dr7); |
| vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; |
| if (dr7 & DR7_BP_EN_MASK) |
| vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; |
| } |
| EXPORT_SYMBOL_GPL(kvm_update_dr7); |
| |
| static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) |
| { |
| u64 fixed = DR6_FIXED_1; |
| |
| if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM)) |
| fixed |= DR6_RTM; |
| |
| if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)) |
| fixed |= DR6_BUS_LOCK; |
| return fixed; |
| } |
| |
| static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) |
| { |
| size_t size = ARRAY_SIZE(vcpu->arch.db); |
| |
| switch (dr) { |
| case 0 ... 3: |
| vcpu->arch.db[array_index_nospec(dr, size)] = val; |
| if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) |
| vcpu->arch.eff_db[dr] = val; |
| break; |
| case 4: |
| case 6: |
| if (!kvm_dr6_valid(val)) |
| return -1; /* #GP */ |
| vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu); |
| break; |
| case 5: |
| default: /* 7 */ |
| if (!kvm_dr7_valid(val)) |
| return -1; /* #GP */ |
| vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; |
| kvm_update_dr7(vcpu); |
| break; |
| } |
| |
| return 0; |
| } |
| |
/*
 * Guest-facing debug register write.  Returns 0 on success; on failure a
 * #GP(0) is injected and 1 is returned.
 */
int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
	if (!__kvm_set_dr(vcpu, dr, val))
		return 0;

	kvm_inject_gp(vcpu, 0);
	return 1;
}
EXPORT_SYMBOL_GPL(kvm_set_dr);
| |
| int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) |
| { |
| size_t size = ARRAY_SIZE(vcpu->arch.db); |
| |
| switch (dr) { |
| case 0 ... 3: |
| *val = vcpu->arch.db[array_index_nospec(dr, size)]; |
| break; |
| case 4: |
| case 6: |
| *val = vcpu->arch.dr6; |
| break; |
| case 5: |
| default: /* 7 */ |
| *val = vcpu->arch.dr7; |
| break; |
| } |
| return 0; |
| } |
| EXPORT_SYMBOL_GPL(kvm_get_dr); |
| |
| bool kvm_rdpmc(struct kvm_vcpu *vcpu) |
| { |
| u32 ecx = kvm_rcx_read(vcpu); |
| u64 data; |
| int err; |
| |
| err = kvm_pmu_rdpmc(vcpu, ecx, &data); |
| if (err) |
| return err; |
| kvm_rax_write(vcpu, (u32)data); |
| kvm_rdx_write(vcpu, data >> 32); |
| return err; |
| } |
| EXPORT_SYMBOL_GPL(kvm_rdpmc); |
| |
/*
 * List of MSR numbers which we expose to userspace through KVM_GET_MSRS,
 * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 *
 * The three MSR lists (msrs_to_save, emulated_msrs, msr_based_features)
 * extract the supported MSRs from the related const lists.
 * msrs_to_save is selected from msrs_to_save_all to reflect the
 * capabilities of the host CPU. This capability test skips MSRs that are
 * KVM-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
 * may depend on host virtualization features rather than host CPU features.
 */
| |
| static const u32 msrs_to_save_all[] = { |
| MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, |
| MSR_STAR, |
| #ifdef CONFIG_X86_64 |
| MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, |
| #endif |
| MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, |
| MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, |
| MSR_IA32_SPEC_CTRL, |
| MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, |
| MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, |
| MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, |
| MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, |
| MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, |
| MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, |
| MSR_IA32_UMWAIT_CONTROL, |
| |
| MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, |
| MSR_ARCH_PERFMON_FIXED_CTR0 + 2, |
| MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, |
| MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, |
| MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, |
| MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, |
| MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, |
| MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, |
| MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9, |
| MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11, |
| MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, |
| MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, |
| MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, |
| MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, |
| MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, |
| MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, |
| MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, |
| MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9, |
| MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11, |
| MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, |
| MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, |
| MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, |
| |
| MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, |
| MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, |
| MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, |
| MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, |
| MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, |
| MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, |
| }; |
| |
| static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; |
| static unsigned num_msrs_to_save; |
| |
| static const u32 emulated_msrs_all[] = { |
| MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, |
| MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, |
| HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, |
| HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, |
| HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY, |
| HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2, |
| HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL, |
| HV_X64_MSR_RESET, |
| HV_X64_MSR_VP_INDEX, |
| HV_X64_MSR_VP_RUNTIME, |
| HV_X64_MSR_SCONTROL, |
| HV_X64_MSR_STIMER0_CONFIG, |
| HV_X64_MSR_VP_ASSIST_PAGE, |
| HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, |
| HV_X64_MSR_TSC_EMULATION_STATUS, |
| HV_X64_MSR_SYNDBG_OPTIONS, |
| HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, |
| HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, |
| HV_X64_MSR_SYNDBG_PENDING_BUFFER, |
| |
| MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, |
| MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, |
| |
| MSR_IA32_TSC_ADJUST, |
| MSR_IA32_TSCDEADLINE, |
| MSR_IA32_ARCH_CAPABILITIES, |
| MSR_IA32_PERF_CAPABILITIES, |
| MSR_IA32_MISC_ENABLE, |
| MSR_IA32_MCG_STATUS, |
| MSR_IA32_MCG_CTL, |
| MSR_IA32_MCG_EXT_CTL, |
| MSR_IA32_SMBASE, |
| MSR_SMI_COUNT, |
| MSR_PLATFORM_INFO, |
| MSR_MISC_FEATURES_ENABLES, |
| MSR_AMD64_VIRT_SPEC_CTRL, |
| MSR_IA32_POWER_CTL, |
| MSR_IA32_UCODE_REV, |
| |
| /* |
| * The following list leaves out MSRs whose values are determined |
| * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs. |
| * We always support the "true" VMX control MSRs, even if the host |
| * processor does not, so I am putting these registers here rather |
| * than in msrs_to_save_all. |
| */ |
| MSR_IA32_VMX_BASIC, |
| MSR_IA32_VMX_TRUE_PINBASED_CTLS, |
| MSR_IA32_VMX_TRUE_PROCBASED_CTLS, |
| MSR_IA32_VMX_TRUE_EXIT_CTLS, |
| MSR_IA32_VMX_TRUE_ENTRY_CTLS, |
| MSR_IA32_VMX_MISC, |
| MSR_IA32_VMX_CR0_FIXED0, |
| MSR_IA32_VMX_CR4_FIXED0, |
| MSR_IA32_VMX_VMCS_ENUM, |
| MSR_IA32_VMX_PROCBASED_CTLS2, |
| MSR_IA32_VMX_EPT_VPID_CAP, |
| MSR_IA32_VMX_VMFUNC, |
| |
| MSR_K7_HWCR, |
| MSR_KVM_POLL_CONTROL, |
| }; |
| |
| static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; |
| static unsigned num_emulated_msrs; |
| |
| /* |
| * List of msr numbers which are used to expose MSR-based features that |
| * can be used by a hypervisor to validate requested CPU features. |
| */ |
| static const u32 msr_based_features_all[] = { |
| MSR_IA32_VMX_BASIC, |
| MSR_IA32_VMX_TRUE_PINBASED_CTLS, |
| MSR_IA32_VMX_PINBASED_CTLS, |
| MSR_IA32_VMX_TRUE_PROCBASED_CTLS, |
| MSR_IA32_VMX_PROCBASED_CTLS, |
| MSR_IA32_VMX_TRUE_EXIT_CTLS, |
| MSR_IA32_VMX_EXIT_CTLS, |
| MSR_IA32_VMX_TRUE_ENTRY_CTLS, |
| MSR_IA32_VMX_ENTRY_CTLS, |
| MSR_IA32_VMX_MISC, |
| MSR_IA32_VMX_CR0_FIXED0, |
| MSR_IA32_VMX_CR0_FIXED1, |
| MSR_IA32_VMX_CR4_FIXED0, |
| MSR_IA32_VMX_CR4_FIXED1, |
| MSR_IA32_VMX_VMCS_ENUM, |
| MSR_IA32_VMX_PROCBASED_CTLS2, |
| MSR_IA32_VMX_EPT_VPID_CAP, |
| MSR_IA32_VMX_VMFUNC, |
| |
| MSR_F10H_DECFG, |
| MSR_IA32_UCODE_REV, |
| MSR_IA32_ARCH_CAPABILITIES, |
| MSR_IA32_PERF_CAPABILITIES, |
| }; |
| |
| static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)]; |
| static unsigned int num_msr_based_features; |
| |
| static u64 kvm_get_arch_capabilities(void) |
| { |
| u64 data = 0; |
| |
| if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) |
| rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data); |
| |
| /* |
| * If nx_huge_pages is enabled, KVM's shadow paging will ensure that |
| * the nested hypervisor runs with NX huge pages. If it is not, |
| * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other |
| * L1 guests, so it need not worry about its own (L2) guests. |
| */ |
| data |= ARCH_CAP_PSCHANGE_MC_NO; |
| |
| /* |
| * If we're doing cache flushes (either "always" or "cond") |
| * we will do one whenever the guest does a vmlaunch/vmresume. |
| * If an outer hypervisor is doing the cache flush for us |
| * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that |
| * capability to the guest too, and if EPT is disabled we're not |
| * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will |
| * require a nested hypervisor to do a flush of its own. |
| */ |
| if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER) |
| data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; |
| |
| if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) |
| data |= ARCH_CAP_RDCL_NO; |
| if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) |
| data |= ARCH_CAP_SSB_NO; |
| if (!boot_cpu_has_bug(X86_BUG_MDS)) |
| data |= ARCH_CAP_MDS_NO; |
| |
| if (!boot_cpu_has(X86_FEATURE_RTM)) { |
| /* |
| * If RTM=0 because the kernel has disabled TSX, the host might |
| * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0 |
| * and therefore knows that there cannot be TAA) but keep |
| * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts, |
| * and we want to allow migrating those guests to tsx=off hosts. |
| */ |
| data &= ~ARCH_CAP_TAA_NO; |
| } else if (!boot_cpu_has_bug(X86_BUG_TAA)) { |
| data |= ARCH_CAP_TAA_NO; |
| } else { |
| /* |
| * Nothing to do here; we emulate TSX_CTRL if present on the |
| * host so the guest can choose between disabling TSX or |
| * using VERW to clear CPU buffers. |
| */ |
| } |
| |
| /* Guests don't need to know "Fill buffer clear control" exists */ |
| data &= ~ARCH_CAP_FB_CLEAR_CTRL; |
| |
| return data; |
| } |
| |
| static int kvm_get_msr_feature(struct kvm_msr_entry *msr) |
| { |
| switch (msr->index) { |
| case MSR_IA32_ARCH_CAPABILITIES: |
| msr->data = kvm_get_arch_capabilities(); |
| break; |
| case MSR_IA32_UCODE_REV: |
| rdmsrl_safe(msr->index, &msr->data); |
| break; |
| default: |
| return kvm_x86_ops.get_msr_feature(msr); |
| } |
| return 0; |
| } |
| |
| static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) |
| { |
| struct kvm_msr_entry msr; |
| int r; |
| |
| msr.index = index; |
| r = kvm_get_msr_feature(&msr); |
| |
| if (r == KVM_MSR_RET_INVALID) { |
| /* Unconditionally clear the output for simplicity */ |
| *data = 0; |
| if (kvm_msr_ignored_check(vcpu, index, 0, false)) |
| r = 0; |
| } |
| |
| if (r) |
| return r; |
| |
| *data = msr.data; |
| |
| return 0; |
| } |
| |
| static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) |
| { |
| if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT)) |
| return false; |
| |
| if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM)) |
| return false; |
| |
| if (efer & (EFER_LME | EFER_LMA) && |
| !guest_cpuid_has(vcpu, X86_FEATURE_LM)) |
| return false; |
| |
| if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX)) |
| return false; |
| |
| return true; |
| |
| } |
| bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) |
| { |
| if (efer & efer_reserved_bits) |
| return false; |
| |
| return __kvm_valid_efer(vcpu, efer); |
| } |
| EXPORT_SYMBOL_GPL(kvm_valid_efer); |
| |
| static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) |
| { |
| u64 old_efer = vcpu->arch.efer; |
| u64 efer = msr_info->data; |
| int r; |
| |
| if (efer & efer_reserved_bits) |
| return 1; |
| |
| if (!msr_info->host_initiated) { |
| if (!__kvm_valid_efer(vcpu, efer)) |
| return 1; |
| |
| if (is_paging(vcpu) && |
| (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) |
| return 1; |
| } |
| |
| efer &= ~EFER_LMA; |
| efer |= vcpu->arch.efer & EFER_LMA; |
| |
| r = kvm_x86_ops.set_efer(vcpu, efer); |
| if (r) { |
| WARN_ON(r > 0); |
| return r; |
| } |
| |
| /* Update reserved bits */ |
| if ((efer ^ old_efer) & EFER_NX) |
| kvm_mmu_reset_context(vcpu); |
| |
| return 0; |
| } |
| |
| void kvm_enable_efer_bits(u64 mask) |
| { |
| efer_reserved_bits &= ~mask; |
| } |
| EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); |
| |
| bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) |
| { |
| struct kvm_x86_msr_filter *msr_filter; |
| struct msr_bitmap_range *ranges; |
| struct kvm *kvm = vcpu->kvm; |
| bool allowed; |
| int idx; |
| u32 i; |
| |
| /* x2APIC MSRs do not support filtering. */ |
| if (index >= 0x800 && index <= 0x8ff) |
| return true; |
| |
| idx = srcu_read_lock(&kvm->srcu); |
| |
| msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu); |
| if (!msr_filter) { |
| allowed = true; |
| goto out; |
| } |
| |
| allowed = msr_filter->default_allow; |
| ranges = msr_filter->ranges; |
| |
| for (i = 0; i < msr_filter->count; i++) { |
| u32 start = ranges[i].base; |
| u32 end = start + ranges[i].nmsrs; |
| u32 flags = ranges[i].flags; |
| unsigned long *bitmap = ranges[i].bitmap; |
| |
| if ((index >= start) && (index < end) && (flags & type)) { |
| allowed = !!test_bit(index - start, bitmap); |
| break; |
| } |
| } |
| |
| out: |
| srcu_read_unlock(&kvm->srcu, idx); |
| |
| return allowed; |
| } |
| EXPORT_SYMBOL_GPL(kvm_msr_allowed); |
| |
| /* |
| * Write @data into the MSR specified by @index. Select MSR specific fault |
| * checks are bypassed if @host_initiated is %true. |
| * Returns 0 on success, non-0 otherwise. |
| * Assumes vcpu_load() was already called. |
| */ |
| static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, |
| bool host_initiated) |
| { |
| struct msr_data msr; |
| |
| if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) |
| return KVM_MSR_RET_FILTERED; |
| |
| switch (index) { |
| case MSR_FS_BASE: |
| case MSR_GS_BASE: |
| case MSR_KERNEL_GS_BASE: |
| case MSR_CSTAR: |
| case MSR_LSTAR: |
| if (is_noncanonical_address(data, vcpu)) |
| return 1; |
| break; |
| case MSR_IA32_SYSENTER_EIP: |
| case MSR_IA32_SYSENTER_ESP: |
| /* |
| * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if |
| * non-canonical address is written on Intel but not on |
| * AMD (which ignores the top 32-bits, because it does |
| * not implement 64-bit SYSENTER). |
| * |
| * 64-bit code should hence be able to write a non-canonical |
| * value on AMD. Making the address canonical ensures that |
| * vmentry does not fail on Intel after writing a non-canonical |
| * value, and that something deterministic happens if the guest |
| * invokes 64-bit SYSENTER. |
| */ |
| data = get_canonical(data, vcpu_virt_addr_bits(vcpu)); |
| } |
| |
| msr.data = data; |
| msr.index = index; |
| msr.host_initiated = host_initiated; |
| |
| return kvm_x86_ops.set_msr(vcpu, &msr); |
| } |
| |
| static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, |
| u32 index, u64 data, bool host_initiated) |
| { |
| int ret = __kvm_set_msr(vcpu, index, data, host_initiated); |
| |
| if (ret == KVM_MSR_RET_INVALID) |
| if (kvm_msr_ignored_check(vcpu, index, data, true)) |
| ret = 0; |
| |
| return ret; |
| } |
| |
| /* |
| * Read the MSR specified by @index into @data. Select MSR specific fault |
| * checks are bypassed if @host_initiated is %true. |
| * Returns 0 on success, non-0 otherwise. |
| * Assumes vcpu_load() was already called. |
| */ |
| int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, |
| bool host_initiated) |
| { |
| struct msr_data msr; |
| int ret; |
| |
| if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) |
| return KVM_MSR_RET_FILTERED; |
| |
| msr.index = index; |
| msr.host_initiated = host_initiated; |
| |
| ret = kvm_x86_ops.get_msr(vcpu, &msr); |
| if (!ret) |
| *data = msr.data; |
| return ret; |
| } |
| |
| static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, |
| u32 index, u64 *data, bool host_initiated) |
| { |
| int ret = __kvm_get_msr(vcpu, index, data, host_initiated); |
| |
| if (ret == KVM_MSR_RET_INVALID) { |
| /* Unconditionally clear *data for simplicity */ |
| *data = 0; |
| if (kvm_msr_ignored_check(vcpu, index, 0, false)) |
| ret = 0; |
| } |
| |
| return ret; |
| } |
| |
| int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) |
| { |
| return kvm_get_msr_ignored_check(vcpu, index, data, false); |
| } |
| EXPORT_SYMBOL_GPL(kvm_get_msr); |
| |
| int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) |
| { |
| return kvm_set_msr_ignored_check(vcpu, index, data, false); |
| } |
| EXPORT_SYMBOL_GPL(kvm_set_msr); |
| |
| static int complete_emulated_msr(struct kvm_vcpu *vcpu, bool is_read) |
| { |
| if (vcpu->run->msr.error) { |
| kvm_inject_gp(vcpu, 0); |
| return 1; |
| } else if (is_read) { |
| kvm_rax_write(vcpu, (u32)vcpu->run->msr.data); |
| kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32); |
| } |
| |
| return kvm_skip_emulated_instruction(vcpu); |
| } |
| |
| static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) |
| { |
| return complete_emulated_msr(vcpu, true); |
| } |
| |
| static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu) |
| { |
| return complete_emulated_msr(vcpu, false); |
| } |
| |
| static u64 kvm_msr_reason(int r) |
| { |
| switch (r) { |
| case KVM_MSR_RET_INVALID: |
| return KVM_MSR_EXIT_REASON_UNKNOWN; |
| case KVM_MSR_RET_FILTERED: |
| return KVM_MSR_EXIT_REASON_FILTER; |
| default: |
| return KVM_MSR_EXIT_REASON_INVAL; |
| } |
| } |
| |
| static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, |
| u32 exit_reason, u64 data, |
| int (*completion)(struct kvm_vcpu *vcpu), |
| int r) |
| { |
| u64 msr_reason = kvm_msr_reason(r); |
| |
| /* Check if the user wanted to know about this MSR fault */ |
| if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason)) |
| return 0; |
| |
| vcpu->run->exit_reason = exit_reason; |
| vcpu->run->msr.error = 0; |
| memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad)); |
| vcpu->run->msr.reason = msr_reason; |
| vcpu->run->msr.index = index; |
| vcpu->run->msr.data = data; |
| vcpu->arch.complete_userspace_io = completion; |
| |
| return 1; |
| } |
| |
| static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r) |
| { |
| return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0, |
| complete_emulated_rdmsr, r); |
| } |
| |
| static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r) |
| { |
| return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data, |
| complete_emulated_wrmsr, r); |
| } |
| |
| int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) |
| { |
| u32 ecx = kvm_rcx_read(vcpu); |
| u64 data; |
| int r; |
| |
| vcpu->stat.msr_rd_exits++; |
| r = kvm_get_msr(vcpu, ecx, &data); |
| |
| /* MSR read failed? See if we should ask user space */ |
| if (r && kvm_get_msr_user_space(vcpu, ecx, r)) { |
| /* Bounce to user space */ |
| return 0; |
| } |
| |
| /* MSR read failed? Inject a #GP */ |
| if (r) { |
| trace_kvm_msr_read_ex(ecx); |
| kvm_inject_gp(vcpu, 0); |
| return 1; |
| } |
| |
| trace_kvm_msr_read(ecx, data); |
| |
| kvm_rax_write(vcpu, data & -1u); |
| kvm_rdx_write(vcpu, (data >> 32) & -1u); |
| return kvm_skip_emulated_instruction(vcpu); |
| } |
| EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); |
| |
| int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) |
| { |
| u32 ecx = kvm_rcx_read(vcpu); |
| u64 data = kvm_read_edx_eax(vcpu); |
| int r; |
| |
| vcpu->stat.msr_wr_exits++; |
| r = kvm_set_msr(vcpu, ecx, data); |
| |
| /* MSR write failed? See if we should ask user space */ |
| if (r && kvm_set_msr_user_space(vcpu, ecx, data, r)) |
| /* Bounce to user space */ |
| return 0; |
| |
| /* Signal all other negative errors to userspace */ |
| if (r < 0) |
| return r; |
| |
| /* MSR write failed? Inject a #GP */ |
| if (r > 0) { |
| trace_kvm_msr_write_ex(ecx, data); |
| kvm_inject_gp(vcpu, 0); |
| return 1; |
| } |
| |
| trace_kvm_msr_write(ecx, data); |
| return kvm_skip_emulated_instruction(vcpu); |
| } |
| EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); |
| |
| bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) |
| { |
| return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || |
| xfer_to_guest_mode_work_pending(); |
| } |
| EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request); |
| |
| /* |
| * The fast path for frequent and performance sensitive wrmsr emulation, |
| * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces |
| * the latency of virtual IPI by avoiding the expensive bits of transitioning |
| * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the |
| * other cases which must be called after interrupts are enabled on the host. |
| */ |
| static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data) |
| { |
| if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic)) |
| return 1; |
| |
| if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) && |
| ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && |
| ((data & APIC_MODE_MASK) == APIC_DM_FIXED) && |
| ((u32)(data >> 32) != X2APIC_BROADCAST)) { |
| |
| data &= ~(1 << 12); |
| kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32)); |
| kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32)); |
| kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data); |
| trace_kvm_apic_write(APIC_ICR, (u32)data); |
| return 0; |
| } |
| |
| return 1; |
| } |
| |
| static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data) |
| { |
| if (!kvm_can_use_hv_timer(vcpu)) |
| return 1; |
| |
| kvm_set_lapic_tscdeadline_msr(vcpu, data); |
| return 0; |
| } |
| |
| fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) |
| { |
| u32 msr = kvm_rcx_read(vcpu); |
| u64 data; |
| fastpath_t ret = EXIT_FASTPATH_NONE; |
| |
| switch (msr) { |
| case APIC_BASE_MSR + (APIC_ICR >> 4): |
| data = kvm_read_edx_eax(vcpu); |
| if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) { |
| kvm_skip_emulated_instruction(vcpu); |
| ret = EXIT_FASTPATH_EXIT_HANDLED; |
| } |
| break; |
| case MSR_IA32_TSCDEADLINE: |
| data = kvm_read_edx_eax(vcpu); |
| if (!handle_fastpath_set_tscdeadline(vcpu, data)) { |
| kvm_skip_emulated_instruction(vcpu); |
| ret = EXIT_FASTPATH_REENTER_GUEST; |
| } |
| break; |
| default: |
| break; |
| } |
| |
| if (ret != EXIT_FASTPATH_NONE) |
| trace_kvm_msr_write(msr, data); |
| |
| return ret; |
| } |
| EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); |
| |
| /* |
| * Adapt set_msr() to msr_io()'s calling convention |
| */ |
| static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) |
| { |
| return kvm_get_msr_ignored_check(vcpu, index, data, true); |
| } |
| |
| static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) |
| { |
| return kvm_set_msr_ignored_check(vcpu, index, *data, true); |
| } |
| |
| #ifdef CONFIG_X86_64 |
| struct pvclock_clock { |
| int vclock_mode; |
| u64 cycle_last; |
| u64 mask; |
| u32 mult; |
| u32 shift; |
| u64 base_cycles; |
| u64 offset; |
| }; |
| |
| struct pvclock_gtod_data { |
| seqcount_t seq; |
| |
| struct pvclock_clock clock; /* extract of a clocksource struct */ |
| struct pvclock_clock raw_clock; /* extract of a clocksource struct */ |
| |
| ktime_t offs_boot; |
| u64 wall_time_sec; |
| }; |
| |
| static struct pvclock_gtod_data pvclock_gtod_data; |
| |
| static void update_pvclock_gtod(struct timekeeper *tk) |
| { |
| struct pvclock_gtod_data *vdata = &pvclock_gtod_data; |
| |
| write_seqcount_begin(&vdata->seq); |
| |
| /* copy pvclock gtod data */ |
| vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode; |
| vdata->clock.cycle_last = tk->tkr_mono.cycle_last; |
| vdata->clock.mask = tk->tkr_mono.mask; |
| vdata->clock.mult = tk->tkr_mono.mult; |
| vdata->clock.shift = tk->tkr_mono.shift; |
| vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec; |
| vdata->clock.offset = tk->tkr_mono.base; |
| |
| vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode; |
| vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last; |
| vdata->raw_clock.mask = tk->tkr_raw.mask; |
| vdata->raw_clock.mult = tk->tkr_raw.mult; |
| vdata->raw_clock.shift = tk->tkr_raw.shift; |
| vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec; |
| vdata->raw_clock.offset = tk->tkr_raw.base; |
| |
| vdata->wall_time_sec = tk->xtime_sec; |
| |
| vdata->offs_boot = tk->offs_boot; |
| |
| write_seqcount_end(&vdata->seq); |
| } |
| |
| static s64 get_kvmclock_base_ns(void) |
| { |
| /* Count up from boot time, but with the frequency of the raw clock. */ |
| return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot)); |
| } |
| #else |
| static s64 get_kvmclock_base_ns(void) |
| { |
| /* Master clock not used, so we can just use CLOCK_BOOTTIME. */ |
| return ktime_get_boottime_ns(); |
| } |
| #endif |
| |
| static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) |
| { |
| int version; |
| int r; |
| struct pvclock_wall_clock wc; |
| u64 wall_nsec; |
| |
| kvm->arch.wall_clock = wall_clock; |
| |
| if (!wall_clock) |
| return; |
| |
| r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version)); |
| if (r) |
| return; |
| |
| if (version & 1) |
| ++version; /* first time write, random junk */ |
| |
| ++version; |
| |
| if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version))) |
| return; |
| |
| /* |
| * The guest calculates current wall clock time by adding |
| * system time (updated by kvm_guest_time_update below) to the |
| * wall clock specified here. We do the reverse here. |
| */ |
| wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm); |
| |
| wc.nsec = do_div(wall_nsec, 1000000000); |
| wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */ |
| wc.version = version; |
| |
| kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); |
| |
| version++; |
| kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); |
| } |
| |
| static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, |
| bool old_msr, bool host_initiated) |
| { |
| struct kvm_arch *ka = &vcpu->kvm->arch; |
| |
| if (vcpu->vcpu_id == 0 && !host_initiated) { |
| if (ka->boot_vcpu_runs_old_kvmclock != old_msr) |
| kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); |
| |
| ka->boot_vcpu_runs_old_kvmclock = old_msr; |
| } |
| |
| vcpu->arch.time = system_time; |
| kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); |
| |
| /* we verify if the enable bit is set... */ |
| vcpu->arch.pv_time_enabled = false; |
| if (!(system_time & 1)) |
| return; |
| |
| if (!kvm_gfn_to_hva_cache_init(vcpu->kvm, |
| &vcpu->arch.pv_time, system_time & ~1ULL, |
| sizeof(struct pvclock_vcpu_time_info))) |
| vcpu->arch.pv_time_enabled = true; |
| |
| return; |
| } |
| |
| static uint32_t div_frac(uint32_t dividend, uint32_t divisor) |
| { |
| do_shl32_div32(dividend, divisor); |
| return dividend; |
| } |
| |
| static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz, |
| s8 *pshift, u32 *pmultiplier) |
| { |
| uint64_t scaled64; |
| int32_t shift = 0; |
| uint64_t tps64; |
| uint32_t tps32; |
| |
| tps64 = base_hz; |
| scaled64 = scaled_hz; |
| while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { |
| tps64 >>= 1; |
| shift--; |
| } |
| |
| tps32 = (uint32_t)tps64; |
| while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) { |
| if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000) |
| scaled64 >>= 1; |
| else |
| tps32 <<= 1; |
| shift++; |
| } |
| |
| *pshift = shift; |
| *pmultiplier = div_frac(scaled64, tps32); |
| } |
| |
| #ifdef CONFIG_X86_64 |
| static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); |
| #endif |
| |
| static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); |
| static unsigned long max_tsc_khz; |
| |
| static u32 adjust_tsc_khz(u32 khz, s32 ppm) |
| { |
| u64 v = (u64)khz * (1000000 + ppm); |
| do_div(v, 1000000); |
| return v; |
| } |
| |
| static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) |
| { |
| u64 ratio; |
| |
| /* Guest TSC same frequency as host TSC? */ |
| if (!scale) { |
| vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; |
| return 0; |
| } |
| |
| /* TSC scaling supported? */ |
| if (!kvm_has_tsc_control) { |
| if (user_tsc_khz > tsc_khz) { |
| vcpu->arch.tsc_catchup = 1; |
| vcpu->arch.tsc_always_catchup = 1; |
| return 0; |
| } else { |
| pr_warn_ratelimited("user requested TSC rate below hardware speed\n"); |
| return -1; |
| } |
| } |
| |
| /* TSC scaling required - calculate ratio */ |
| ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits, |
| user_tsc_khz, tsc_khz); |
| |
| if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) { |
| pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", |
| user_tsc_khz); |
| return -1; |
| } |
| |
| vcpu->arch.tsc_scaling_ratio = ratio; |
| return 0; |
| } |
| |
| static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) |
| { |
| u32 thresh_lo, thresh_hi; |
| int use_scaling = 0; |
| |
| /* tsc_khz can be zero if TSC calibration fails */ |
| if (user_tsc_khz == 0) { |
| /* set tsc_scaling_ratio to a safe value */ |
| vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; |
| return -1; |
| } |
| |
| /* Compute a scale to convert nanoseconds in TSC cycles */ |
| kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC, |
| &vcpu->arch.virtual_tsc_shift, |
| &vcpu->arch.virtual_tsc_mult); |
| vcpu->arch.virtual_tsc_khz = user_tsc_khz; |
| |
| /* |
| * Compute the variation in TSC rate which is acceptable |
| * within the range of tolerance and decide if the |
| * rate being applied is within that bounds of the hardware |
| * rate. If so, no scaling or compensation need be done. |
| */ |
| thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); |
| thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); |
| if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) { |
| pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi); |
| use_scaling = 1; |
| } |
| return set_tsc_khz(vcpu, user_tsc_khz, use_scaling); |
| } |
| |
| static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) |
| { |
| u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec, |
| vcpu->arch.virtual_tsc_mult, |
| vcpu->arch.virtual_tsc_shift); |
| tsc += vcpu->arch.this_tsc_write; |
| return tsc; |
| } |
| |
| static inline int gtod_is_based_on_tsc(int mode) |
| { |
| return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK; |
| } |
| |
/*
 * Request a masterclock update on @vcpu when the masterclock is already in
 * use, or when it could now be enabled.  Compiled out on non-x86_64 builds.
 */
static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
	struct kvm *kvm = vcpu->kvm;
	struct kvm_arch *ka = &kvm->arch;
	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
	bool all_matched, want_update;

	all_matched = (ka->nr_vcpus_matched_tsc + 1 ==
		       atomic_read(&kvm->online_vcpus));

	/*
	 * While the masterclock is enabled every call must refresh it.
	 *
	 * Enabling it requires a TSC-based host clocksource and matched
	 * vCPU TSCs; once both hold, request the update that turns it on.
	 */
	want_update = ka->use_master_clock ||
		      (gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
		       all_matched);
	if (want_update)
		kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);

	trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
			    atomic_read(&kvm->online_vcpus),
			    ka->use_master_clock, gtod->clock.vclock_mode);
#endif
}
| |
| /* |
| * Multiply tsc by a fixed point number represented by ratio. |
| * |
| * The most significant 64-N bits (mult) of ratio represent the |
| * integral part of the fixed point number; the remaining N bits |
| * (frac) represent the fractional part, ie. ratio represents a fixed |
| * point number (mult + frac * 2^(-N)). |
| * |
| * N equals to kvm_tsc_scaling_ratio_frac_bits. |
| */ |
| static inline u64 __scale_tsc(u64 ratio, u64 tsc) |
| { |
| return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); |
| } |
| |
| u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) |
| { |
| u64 _tsc = tsc; |
| u64 ratio = vcpu->arch.tsc_scaling_ratio; |
| |
| if (ratio != kvm_default_tsc_scaling_ratio) |
| _tsc = __scale_tsc(ratio, tsc); |
| |
| return _tsc; |
| } |
| EXPORT_SYMBOL_GPL(kvm_scale_tsc); |
| |
| static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) |
| { |
| u64 tsc; |
| |
| tsc = kvm_scale_tsc(vcpu, rdtsc()); |
| |
| return target_tsc - tsc; |
| } |
| |
| u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) |
| { |
| return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc); |
| } |
| EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); |
| |
| static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) |
| { |
| vcpu->arch.l1_tsc_offset = offset; |
| vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset); |
| } |
| |
| static inline bool kvm_check_tsc_unstable(void) |
| { |
| #ifdef CONFIG_X86_64 |
| /* |
| * TSC is marked unstable when we're running on Hyper-V, |
| * 'TSC page' clocksource is good. |
| */ |
| if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK) |
| return false; |
| #endif |
| return check_tsc_unstable(); |
| } |
| |
| static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) |
| { |
| struct kvm *kvm = vcpu->kvm; |
| u64 offset, ns, elapsed; |
| unsigned long flags; |
| bool matched; |
| bool already_matched; |
| bool synchronizing = false; |
| |
| raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); |
| offset = kvm_compute_tsc_offset(vcpu, data); |
| ns = get_kvmclock_base_ns(); |
| elapsed = ns - kvm->arch.last_tsc_nsec; |
| |
| if (vcpu->arch.virtual_tsc_khz) { |
| if (data == 0) { |
| /* |
| * detection of vcpu initialization -- need to sync |
| * with other vCPUs. This particularly helps to keep |
| * kvm_clock stable after CPU hotplug |
| */ |
| synchronizing = true; |
| } else { |
| u64 tsc_exp = kvm->arch.last_tsc_write + |
| nsec_to_cycles(vcpu, elapsed); |
| u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL; |
| /* |
| * Special case: TSC write with a small delta (1 second) |
| * of virtual cycle time against real time is |
| * interpreted as an attempt to synchronize the CPU. |
| */ |
| synchronizing = data < tsc_exp + tsc_hz && |
| data + tsc_hz > tsc_exp; |
| } |
| } |
| |
| /* |
| * For a reliable TSC, we can match TSC offsets, and for an unstable |
| * TSC, we add elapsed time in this computation. We could let the |
| * compensation code attempt to catch up if we fall behind, but |
| * it's better to try to match offsets from the beginning. |
| */ |
| if (synchronizing && |
| vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { |
| if (!kvm_check_tsc_unstable()) { |
| offset = kvm->arch.cur_tsc_offset; |
| } else { |
| u64 delta = nsec_to_cycles(vcpu, elapsed); |
| data += delta; |
| offset = kvm_compute_tsc_offset(vcpu, data); |
| } |
| matched = true; |
| already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation); |
| } else { |
| /* |
| * We split periods of matched TSC writes into generations. |
| * For each generation, we track the original measured |
| * nanosecond time, offset, and write, so if TSCs are in |
| * sync, we can match exact offset, and if not, we can match |
| * exact software computation in compute_guest_tsc() |
| * |
| * These values are tracked in kvm->arch.cur_xxx variables. |
| */ |
| kvm->arch.cur_tsc_generation++; |
| kvm->arch.cur_tsc_nsec = ns; |
| kvm->arch.cur_tsc_write = data; |
| kvm->arch.cur_tsc_offset = offset; |
| matched = false; |
| } |
| |
| /* |
| * We also track th most recent recorded KHZ, write and time to |
| * allow the matching interval to be extended at each write. |
| */ |
| kvm->arch.last_tsc_nsec = ns; |
| kvm->arch.last_tsc_write = data; |
| kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; |
| |
| vcpu->arch.last_guest_tsc = data; |
| |
| /* Keep track of which generation this VCPU has synchronized to */ |
| vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; |
| vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; |
| vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; |
| |
| kvm_vcpu_write_tsc_offset(vcpu, offset); |
| raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); |
| |
| spin_lock(&kvm->arch.pvclock_gtod_sync_lock); |
| if (!matched) { |
| kvm->arch.nr_vcpus_matched_tsc = 0; |
| } else if (!already_matched) { |
| kvm->arch.nr_vcpus_matched_tsc++; |
| } |
| |
| kvm_track_tsc_matching(vcpu); |
| spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); |
| } |
| |
| static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, |
| s64 adjustment) |
| { |
| u64 tsc_offset = vcpu->arch.l1_tsc_offset; |
| kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment); |
| } |
| |
| static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) |
| { |
| if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) |
| WARN_ON(adjustment < 0); |
| adjustment = kvm_scale_tsc(vcpu, (u64) adjustment); |
| adjust_tsc_offset_guest(vcpu, adjustment); |
| } |
| |
| #ifdef CONFIG_X86_64 |
| |
| static u64 read_tsc(void) |
| { |
| u64 ret = (u64)rdtsc_ordered(); |
| u64 last = pvclock_gtod_data.clock.cycle_last; |
| |
| if (likely(ret >= last)) |
| return ret; |
| |
| /* |
| * GCC likes to generate cmov here, but this branch is extremely |
| * predictable (it's just a function of time and the likely is |
| * very likely) and there's a data dependence, so force GCC |
| * to generate a branch instead. I don't barrier() because |
| * we don't actually need a barrier, and if this function |
| * ever gets inlined it will generate worse code. |
| */ |
| asm volatile (""); |
| return last; |
| } |
| |
| static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp, |
| int *mode) |
| { |
| long v; |
| u64 tsc_pg_val; |
| |
| switch (clock->vclock_mode) { |
| case VDSO_CLOCKMODE_HVCLOCK: |
| tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(), |
| tsc_timestamp); |
| if (tsc_pg_val != U64_MAX) { |
| /* TSC page valid */ |
| *mode = VDSO_CLOCKMODE_HVCLOCK; |
| v = (tsc_pg_val - clock->cycle_last) & |
| clock->mask; |
| } else { |
| /* TSC page invalid */ |
| *mode = VDSO_CLOCKMODE_NONE; |
| } |
| break; |
| case VDSO_CLOCKMODE_TSC: |
| *mode = VDSO_CLOCKMODE_TSC; |
| *tsc_timestamp = read_tsc(); |
| v = (*tsc_timestamp - clock->cycle_last) & |
| clock->mask; |
| break; |
| default: |
| *mode = VDSO_CLOCKMODE_NONE; |
| } |
| |
| if (*mode == VDSO_CLOCKMODE_NONE) |
| *tsc_timestamp = v = 0; |
| |
| return v * clock->mult; |
| } |
| |
| static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp) |
| { |
| struct pvclock_gtod_data *gtod = &pvclock_gtod_data; |
| unsigned long seq; |
| int mode; |
| u64 ns; |
| |
| do { |
| seq = read_seqcount_begin(>od->seq); |
| ns = gtod->raw_clock.base_cycles; |
| ns += vgettsc(>od->raw_clock, tsc_timestamp, &mode); |
| ns >>= gtod->raw_clock.shift; |
| ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot)); |
| } while (unlikely(read_seqcount_retry(>od->seq, seq))); |
| *t = ns; |
| |
| return mode; |
| } |
| |
| static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp) |
| { |
| struct pvclock_gtod_data *gtod = &pvclock_gtod_data; |
| unsigned long seq; |
| int mode; |
| u64 ns; |
| |
| do { |
| seq = read_seqcount_begin(>od->seq); |
| ts->tv_sec = gtod->wall_time_sec; |
| ns = gtod->clock.base_cycles; |
| ns += vgettsc(>od->clock, tsc_timestamp, &mode); |
| ns >>= gtod->clock.shift; |
| } while (unlikely(read_seqcount_retry(>od->seq, seq))); |
| |
| ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); |
| ts->tv_nsec = ns; |
| |
| return mode; |
| } |
| |
| /* returns true if host is using TSC based clocksource */ |
| static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp) |
| { |
| /* checked again under seqlock below */ |
| if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) |
| return false; |
| |
| return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns, |
| tsc_timestamp)); |
| } |
| |
| /* returns true if host is using TSC based clocksource */ |
| static bool kvm_get_walltime_and_clockread(struct timespec64 *ts, |
| u64 *tsc_timestamp) |
| { |
| /* checked again under seqlock below */ |
| if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) |
| return false; |
| |
| return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp)); |
| } |
| #endif |
| |
| /* |
| * |
| * Assuming a stable TSC across physical CPUS, and a stable TSC |
| * across virtual CPUs, the following condition is possible. |
| * Each numbered line represents an event visible to both |
| * CPUs at the next numbered event. |
| * |
| * "timespecX" represents host monotonic time. "tscX" represents |
| * RDTSC value. |
| * |
| * VCPU0 on CPU0 | VCPU1 on CPU1 |
| * |
| * 1. read timespec0,tsc0 |
| * 2. | timespec1 = timespec0 + N |
| * | tsc1 = tsc0 + M |
| * 3. transition to guest | transition to guest |
| * 4. ret0 = timespec0 + (rdtsc - tsc0) | |
| * 5. | ret1 = timespec1 + (rdtsc - tsc1) |
| * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M)) |
| * |
| * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity: |
| * |
| * - ret0 < ret1 |
| * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M)) |
| * ... |
| * - 0 < N - M => M < N |
| * |
| * That is, when timespec0 != timespec1, M < N. Unfortunately that is not |
| * always the case (the difference between two distinct xtime instances |
| * might be smaller than the difference between corresponding TSC reads, |
| * when updating guest vcpus pvclock areas). |
| * |
| * To avoid that problem, do not allow visibility of distinct |
| * system_timestamp/tsc_timestamp values simultaneously: use a master |
| * copy of host monotonic time values. Update that master copy |
| * in lockstep. |
| * |
| * Rely on synchronization of host TSCs and guest TSCs for monotonicity. |
| * |
| */ |
| |
/*
 * Refresh the VM's master copy of host time (master_kernel_ns and
 * master_cycle_now) and recompute whether the master clock may be used.
 * Compiled out on non-x86_64 builds.
 */
static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
{
#ifdef CONFIG_X86_64
	struct kvm_arch *ka = &kvm->arch;
	bool tsc_clocksource, all_matched;
	int vclock_mode;

	all_matched = (ka->nr_vcpus_matched_tsc + 1 ==
		       atomic_read(&kvm->online_vcpus));

	/*
	 * If the host uses TSC clock, then passthrough TSC as stable
	 * to the guest.
	 */
	tsc_clocksource = kvm_get_time_and_clockread(&ka->master_kernel_ns,
						     &ka->master_cycle_now);

	ka->use_master_clock = tsc_clocksource &&
			       all_matched &&
			       !ka->backwards_tsc_observed &&
			       !ka->boot_vcpu_runs_old_kvmclock;

	if (ka->use_master_clock)
		atomic_set(&kvm_guest_has_master_clock, 1);

	vclock_mode = pvclock_gtod_data.clock.vclock_mode;
	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
				      all_matched);
#endif
}
| |
| void kvm_make_mclock_inprogress_request(struct kvm *kvm) |
| { |
| kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); |
| } |
| |
/*
 * Recompute the VM-wide master clock copy and make every vCPU refresh its
 * kvmclock.  The sequence runs under pvclock_gtod_sync_lock, with
 * KVM_REQ_MCLOCK_INPROGRESS raised on all vCPUs while the copy is updated.
 * Compiled out on non-x86_64 builds.
 */
static void kvm_gen_update_masterclock(struct kvm *kvm)
{
#ifdef CONFIG_X86_64
	struct kvm_arch *ka = &kvm->arch;
	struct kvm_vcpu *vcpu;
	int idx;

	spin_lock(&ka->pvclock_gtod_sync_lock);

	/* no guest entries from this point */
	kvm_make_mclock_inprogress_request(kvm);
	pvclock_update_vm_gtod_copy(kvm);

	kvm_for_each_vcpu(idx, vcpu, kvm)
		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);

	/* guest entries allowed */
	kvm_for_each_vcpu(idx, vcpu, kvm)
		kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);

	spin_unlock(&ka->pvclock_gtod_sync_lock);
#endif
}
| |
| u64 get_kvmclock_ns(struct kvm *kvm) |
| { |
| struct kvm_arch *ka = &kvm->arch; |
| struct pvclock_vcpu_time_info hv_clock; |
| u64 ret; |
| |
| spin_lock(&ka->pvclock_gtod_sync_lock); |
| if (!ka->use_master_clock) { |
| spin_unlock(&ka->pvclock_gtod_sync_lock); |
| return get_kvmclock_base_ns() + ka->kvmclock_offset; |
| } |
| |
| hv_clock.tsc_timestamp = ka->master_cycle_now; |
| hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; |
| spin_unlock(&ka->pvclock_gtod_sync_lock); |
| |
| /* both __this_cpu_read() and rdtsc() should be on the same cpu */ |
| get_cpu(); |
| |
| if (__this_cpu_read(cpu_tsc_khz)) { |
| kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, |
| &hv_clock.tsc_shift, |
| &hv_clock.tsc_to_system_mul); |
| ret = __pvclock_read_cycles(&hv_clock, rdtsc()); |
| } else |
| ret = get_kvmclock_base_ns() + ka->kvmclock_offset; |
| |
| put_cpu(); |
| |
| return ret; |
| } |
| |
| static void kvm_setup_pvclock_page(struct kvm_vcpu *v) |
| { |
| struct kvm_vcpu_arch *vcpu = &v->arch; |
| struct pvclock_vcpu_time_info guest_hv_clock; |
| |
| if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, |
| &guest_hv_clock, sizeof(guest_hv_clock)))) |
| return; |
| |
| /* This VCPU is paused, but it's legal for a guest to read another |
| * VCPU's kvmclock, so we really have to follow the specification where |
| * it says that version is odd if data is being modified, and even after |
| * it is consistent. |
| * |
| * Version field updates must be kept separate. This is because |
| * kvm_write_guest_cached might use a "rep movs" instruction, and |
| * writes within a string instruction are weakly ordered. So there |
| * are three writes overall. |
| * |
| * As a small optimization, only write the version field in the first |
| * and third write. The vcpu->pv_time cache is still valid, because the |
| * version field is the first in the struct. |
| */ |
| BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); |
| |
| if (guest_hv_clock.version & 1) |
| ++guest_hv_clock.version; /* first time write, random junk */ |
| |
| vcpu->hv_clock.version = guest_hv_clock.version + 1; |
| kvm_write_guest_cached(v->kvm, &vcpu->pv_time, |
| &vcpu->hv_clock, |
| sizeof(vcpu->hv_clock.version)); |
| |
| smp_wmb(); |
| |
| /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ |
| vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); |
| |
| if (vcpu->pvclock_set_guest_stopped_request) { |
| vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; |
| vcpu->pvclock_set_guest_stopped_request = false; |
| } |
| |
| trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); |
| |
| kvm_write_guest_cached(v->kvm, &vcpu->pv_time, |
| &vcpu->hv_clock, |
| sizeof(vcpu->hv_clock)); |
| |
| smp_wmb(); |
| |
| vcpu->hv_clock.version++; |
| kvm_write_guest_cached(v->kvm, &vcpu->pv_time, |
| &vcpu->hv_clock, |
| sizeof(vcpu->hv_clock.version)); |
| } |
| |
| static int kvm_guest_time_update(struct kvm_vcpu *v) |
| { |
| unsigned long flags, tgt_tsc_khz; |
| struct kvm_vcpu_arch *vcpu = &v->arch; |
| struct kvm_arch *ka = &v->kvm->arch; |
| s64 kernel_ns; |
| u64 tsc_timestamp, host_tsc; |
| u8 pvclock_flags; |
| bool use_master_clock; |
| |
| kernel_ns = 0; |
| host_tsc = 0; |
| |
| /* |
| * If the host uses TSC clock, then passthrough TSC as stable |
| * to the guest. |
| */ |
| spin_lock(&ka->pvclock_gtod_sync_lock); |
| use_master_clock = ka->use_master_clock; |
| if (use_master_clock) { |
| host_tsc = ka->master_cycle_now; |
| kernel_ns = ka->master_kernel_ns; |
| } |
| spin_unlock(&ka->pvclock_gtod_sync_lock); |
| |
| /* Keep irq disabled to prevent changes to the clock */ |
| local_irq_save(flags); |
| tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz); |
| if (unlikely(tgt_tsc_khz == 0)) { |
| local_irq_restore(flags); |
| kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); |
| return 1; |
| } |
| if (!use_master_clock) { |
| host_tsc = rdtsc(); |
| kernel_ns = get_kvmclock_base_ns(); |
| } |
| |
| tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); |
| |
| /* |
| * We may have to catch up the TSC to match elapsed wall clock |
| * time for two reasons, even if kvmclock is used. |
| * 1) CPU could have been running below the maximum TSC rate |
| * 2) Broken TSC compensation resets the base at each VCPU |
| * entry to avoid unknown leaps of TSC even when running |
| * again on the same CPU. This may cause apparent elapsed |
| * time to disappear, and the guest to stand still or run |
| * very slowly. |
| */ |
| if (vcpu->tsc_catchup) { |
| u64 tsc = compute_guest_tsc(v, kernel_ns); |
| if (tsc > tsc_timestamp) { |
| adjust_tsc_offset_guest(v, tsc - tsc_timestamp); |
| tsc_timestamp = tsc; |
| } |
| } |
| |
| local_irq_restore(flags); |
| |
| /* With all the info we got, fill in the values */ |
| |
| if (kvm_has_tsc_control) |
| tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz); |
| |
| if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { |
| kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL, |
| &vcpu->hv_clock.tsc_shift, |
| &vcpu->hv_clock.tsc_to_system_mul); |
| vcpu->hw_tsc_khz = tgt_tsc_khz; |
| } |
| |
| vcpu->hv_clock.tsc_timestamp = tsc_timestamp; |
| vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; |
| vcpu->last_guest_tsc = tsc_timestamp; |
| |
| /* If the host uses TSC clocksource, then it is stable */ |
| pvclock_flags = 0; |
| if (use_master_clock) |
| pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; |
| |
| vcpu->hv_clock.flags = pvclock_flags; |
| |
| if (vcpu->pv_time_enabled) |
| kvm_setup_pvclock_page(v); |
| if (v == kvm_get_vcpu(v->kvm, 0)) |
| kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock); |
| return 0; |
| } |
| |
| /* |
| * kvmclock updates which are isolated to a given vcpu, such as |
| * vcpu->cpu migration, should not allow system_timestamp from |
| * the rest of the vcpus to remain static. Otherwise ntp frequency |
| * correction applies to one vcpu's system_timestamp but not |
| * the others. |
| * |
| * So in those cases, request a kvmclock update for all vcpus. |
| * We need to rate-limit these requests though, as they can |
| * considerably slow guests that have a large number of vcpus. |
| * The time for a remote vcpu to update its kvmclock is bound |
| * by the delay we use to rate-limit the updates. |
| */ |
| |
| #define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100) |
| |
| static void kvmclock_update_fn(struct work_struct *work) |
| { |
| int i; |
| struct delayed_work *dwork = to_delayed_work(work); |
| struct kvm_arch *ka = container_of(dwork, struct kvm_arch, |
| kvmclock_update_work); |
| struct kvm *kvm = container_of(ka, struct kvm, arch); |
| struct kvm_vcpu *vcpu; |
| |
| kvm_for_each_vcpu(i, vcpu, kvm) { |
| kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); |
| kvm_vcpu_kick(vcpu); |
| } |
| } |
| |
| static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) |
| { |
| struct kvm *kvm = v->kvm; |
| |
| kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); |
| schedule_delayed_work(&kvm->arch.kvmclock_update_work, |
| KVMCLOCK_UPDATE_DELAY); |
| } |
| |
| #define KVMCLOCK_SYNC_PERIOD (300 * HZ) |
| |
| static void kvmclock_sync_fn(struct work_struct *work) |
| { |
| struct delayed_work *dwork = to_delayed_work(work); |
| struct kvm_arch *ka = container_of(dwork, struct kvm_arch, |
| kvmclock_sync_work); |
| struct kvm *kvm = container_of(ka, struct kvm, arch); |
| |
| if (!kvmclock_periodic_sync) |
| return; |
| |
| schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); |
| schedule_delayed_work(&kvm->arch.kvmclock_sync_work, |
| KVMCLOCK_SYNC_PERIOD); |
| } |
| |
| /* |
| * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP. |
| */ |
| static bool can_set_mci_status(struct kvm_vcpu *vcpu) |
| { |
| /* McStatusWrEn enabled? */ |
| if (guest_cpuid_is_amd_or_hygon(vcpu)) |
| return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); |
| |
| return false; |
| } |
| |
| static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) |
| { |
| u64 mcg_cap = vcpu->arch.mcg_cap; |
| unsigned bank_num = mcg_cap & 0xff; |
| u32 msr = msr_info->index; |
| u64 data = msr_info->data; |
| |
| switch (msr) { |
| case MSR_IA32_MCG_STATUS: |
| vcpu->arch.mcg_status = data; |
| break; |
| case MSR_IA32_MCG_CTL: |
| if (!(mcg_cap & MCG_CTL_P) && |
| (data || !msr_info->host_initiated)) |
| return 1; |
| if (data != 0 && data != ~(u64)0) |
| return 1; |
| vcpu->arch.mcg_ctl = data; |
| break; |
| default: |
| if (msr >= MSR_IA32_MC0_CTL && |
| msr < MSR_IA32_MCx_CTL(bank_num)) { |
| u32 offset = array_index_nospec( |
| msr - MSR_IA32_MC0_CTL, |
| MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); |
| |
| /* only 0 or all 1s can be written to IA32_MCi_CTL |
| * some Linux kernels though clear bit 10 in bank 4 to |
| * workaround a BIOS/GART TBL issue on AMD K8s, ignore |
| * this to avoid an uncatched #GP in the guest |
| */ |
| if ((offset & 0x3) == 0 && |
| data != 0 && (data | (1 << 10)) != ~(u64)0) |
| return -1; |
| |
| /* MCi_STATUS */ |
| if (!msr_info->host_initiated && |
| (offset & 0x3) == 1 && data != 0) { |
| if (!can_set_mci_status(vcpu)) |
| return -1; |
| } |
| |
| vcpu->arch.mce_banks[offset] = data; |
| break; |
| } |
| return 1; |
| } |
| return 0; |
| } |
| |
| static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) |
| { |
| struct kvm *kvm = vcpu->kvm; |
| int lm = is_long_mode(vcpu); |
| u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64 |
| : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32; |
| u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 |
| : kvm->arch.xen_hvm_config.blob_size_32; |
| u32 page_num = data & ~PAGE_MASK; |
| u64 page_addr = data & PAGE_MASK; |
| u8 *page; |
| |
| if (page_num >= blob_size) |
| return 1; |
| |
| page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE); |
| if (IS_ERR(page)) |
| return PTR_ERR(page); |
| |
| if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) { |
| kfree(page); |
| return 1; |
| } |
| return 0; |
| } |
| |
| static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) |
| { |
| u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; |
| |
| return (vcpu->arch.apf.msr_en_val & mask) == mask; |
| } |
| |
| static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) |
| { |
| gpa_t gpa = data & ~0x3f; |
| |
| /* Bits 4:5 are reserved, Should be zero */ |
| if (data & 0x30) |
| return 1; |
| |
| if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) && |
| (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT)) |
| return 1; |
| |
| if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) && |
| (data & KVM_ASYNC_PF_DELIVERY_AS_INT)) |
| return 1; |
| |
| if (!lapic_in_kernel(vcpu)) |
| return data ? 1 : 0; |
| |
| vcpu->arch.apf.msr_en_val = data; |
| |
| if (!kvm_pv_async_pf_enabled(vcpu)) { |
| kvm_clear_async_pf_completion_queue(vcpu); |
| kvm_async_pf_hash_reset(vcpu); |
| return 0; |
| } |
| |
| if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, |
| sizeof(u64))) |
| return 1; |
| |
| vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); |
| vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; |
| |
| kvm_async_pf_wakeup_all(vcpu); |
| |
| return 0; |
| } |
| |
| static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data) |
| { |
| /* Bits 8-63 are reserved */ |
| if (data >> 8) |
| return 1; |
| |
| if (!lapic_in_kernel(vcpu)) |
| return 1; |
| |
| vcpu->arch.apf.msr_int_val = data; |
| |
| vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK; |
| |
| return 0; |
| } |
| |
| static void kvmclock_reset(struct kvm_vcpu *vcpu) |
| { |
| vcpu->arch.pv_time_enabled = false; |
| vcpu->arch.time = 0; |
| } |
| |
| static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) |
| { |
| ++vcpu->stat.tlb_flush; |
| kvm_x86_ops.tlb_flush_all(vcpu); |
| } |
| |
| static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) |
| { |
| ++vcpu->stat.tlb_flush; |
| kvm_x86_ops.tlb_flush_guest(vcpu); |
| } |
| |
| static u64 accumulate_stat_steal_time(u64 *last_steal) |
| { |
| u64 delta; |
| |
| if (*last_steal == 0) |
| delta = 0; |
| else |
| delta = current->sched_info.run_delay - *last_steal; |
| |
| *last_steal = current->sched_info.run_delay; |
| return delta; |
| } |
| |
| static void update_stat_steal_time(struct kvm_vcpu *vcpu) |
| { |
| u64 delta; |
| |
| delta = accumulate_stat_steal_time(&vcpu->stat.steal); |
| vcpu->stat.st_max = max(vcpu->stat.st_max, delta); |
| } |
| |
| static void record_steal_time(struct kvm_vcpu *vcpu) |
| { |
| struct kvm_host_map map; |
| struct kvm_steal_time *st; |
| |
| update_stat_steal_time(vcpu); |
| if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) |
| return; |
| |
| /* -EAGAIN is returned in atomic context so we can just return. */ |
| if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, |
| &map, &vcpu->arch.st.cache, false)) |
| return; |
| |
| st = map.hva + |
| offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); |
| |
| /* |
| * Doing a TLB flush here, on the guest's behalf, can avoid |
| * expensive IPIs. |
| */ |
| if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { |
| trace_kvm_pv_tlb_flush(vcpu->vcpu_id, |
| st->preempted & KVM_VCPU_FLUSH_TLB); |
| if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) |
| kvm_vcpu_flush_tlb_guest(vcpu); |
| } else { |
| st->preempted = 0; |
| } |
| |
| vcpu->arch.st.preempted = 0; |
| |
| if (st->version & 1) |
| st->version += 1; /* first time write, random junk */ |
| |
| |