/*
* Kernel-based Virtual Machine driver for Linux
*
* This module enables machines with Intel VT-x extensions to run virtual
* machines without emulation or binary translation.
*
* Copyright (C) 2006 Qumranet, Inc.
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
*
*/
#include <linux/frame.h>
#include <linux/highmem.h>
#include <linux/hrtimer.h>
#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/smt.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/trace_events.h>
#include <asm/apic.h>
#include <asm/asm.h>
#include <asm/cpu.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/fpu/internal.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
#include <asm/kexec.h>
#include <asm/perf_event.h>
#include <asm/mce.h>
#include <asm/mmu_context.h>
#include <asm/mshyperv.h>
#include <asm/spec-ctrl.h>
#include <asm/virtext.h>
#include <asm/vmx.h>
#include "capabilities.h"
#include "cpuid.h"
#include "evmcs.h"
#include "irq.h"
#include "kvm_cache_regs.h"
#include "lapic.h"
#include "mmu.h"
#include "nested.h"
#include "ops.h"
#include "pmu.h"
#include "trace.h"
#include "vmcs.h"
#include "vmcs12.h"
#include "vmx.h"
#include "x86.h"
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
static const struct x86_cpu_id vmx_cpu_id[] = {
X86_FEATURE_MATCH(X86_FEATURE_VMX),
{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);
static bool __read_mostly enable_vnmi = 1;
module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, S_IRUGO);
bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
enable_unrestricted_guest, bool, S_IRUGO);
bool __read_mostly enable_ept_ad_bits = 1;
module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);
static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);
static bool __read_mostly enable_apicv = 1;
module_param(enable_apicv, bool, S_IRUGO);
/*
* If nested=1, nested virtualization is supported, i.e., guests may use
* VMX and act as hypervisors for their own guests. If nested=0, guests may not
* use VMX instructions.
*/
static bool __read_mostly nested = 1;
module_param(nested, bool, S_IRUGO);
static u64 __read_mostly host_xss;
bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);
#define MSR_BITMAP_MODE_X2APIC 1
#define MSR_BITMAP_MODE_X2APIC_APICV 2
#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
/* Guest_tsc -> host_tsc conversion requires 64-bit division. */
static int __read_mostly cpu_preemption_timer_multi;
static bool __read_mostly enable_preemption_timer = 1;
#ifdef CONFIG_X86_64
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif
#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON \
(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \
X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
#define KVM_CR4_GUEST_OWNED_BITS \
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
| X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
RTIT_STATUS_BYTECNT))
#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \
(~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f)
/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* ple_gap: upper bound on the amount of time between two successive
* executions of PAUSE in a loop (a value of zero disables PLE).
* Tests indicate this time is usually smaller than 128 cycles.
* ple_window: upper bound on the amount of time a guest is allowed to execute
* in a PAUSE loop. Tests indicate that most spinlocks are held for
* less than 2^12 cycles
* Time is measured based on a counter that runs at the same rate as the TSC,
* refer to SDM volume 3B, sections 21.6.13 & 22.1.3.
*/
static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
module_param(ple_gap, uint, 0444);
static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, uint, 0444);
/* Default doubles per-vcpu window every exit. */
static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, uint, 0444);
/* Default resets per-vcpu window every exit to ple_window. */
static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, uint, 0444);
/* Default is to compute the maximum so we can never overflow. */
static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, uint, 0444);
/* Default is SYSTEM mode, 1 for host-guest mode */
int __read_mostly pt_mode = PT_MODE_SYSTEM;
module_param(pt_mode, int, S_IRUGO);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
static DEFINE_MUTEX(vmx_l1d_flush_mutex);
/* Storage for pre module init parameter parsing */
static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
static const struct {
const char *option;
bool for_parse;
} vmentry_l1d_param[] = {
[VMENTER_L1D_FLUSH_AUTO] = {"auto", true},
[VMENTER_L1D_FLUSH_NEVER] = {"never", true},
[VMENTER_L1D_FLUSH_COND] = {"cond", true},
[VMENTER_L1D_FLUSH_ALWAYS] = {"always", true},
[VMENTER_L1D_FLUSH_EPT_DISABLED] = {"EPT disabled", false},
[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {"not required", false},
};
#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;
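/*
* Resolve the requested L1D flush mode against the hardware and the global
* l1tf mitigation setting, allocate the software flush pages when they are
* needed, and flip the static keys that gate the flush on VM entry.
*/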
static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
struct page *page;
unsigned int i;
if (!enable_ept) {
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
return 0;
}
if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
u64 msr;
rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
if (msr & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
return 0;
}
}
/* If set to auto use the default l1tf mitigation method */
if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
switch (l1tf_mitigation) {
case L1TF_MITIGATION_OFF:
l1tf = VMENTER_L1D_FLUSH_NEVER;
break;
case L1TF_MITIGATION_FLUSH_NOWARN:
case L1TF_MITIGATION_FLUSH:
case L1TF_MITIGATION_FLUSH_NOSMT:
l1tf = VMENTER_L1D_FLUSH_COND;
break;
case L1TF_MITIGATION_FULL:
case L1TF_MITIGATION_FULL_FORCE:
l1tf = VMENTER_L1D_FLUSH_ALWAYS;
break;
}
} else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
l1tf = VMENTER_L1D_FLUSH_ALWAYS;
}
if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
!boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
/*
* This allocation for vmx_l1d_flush_pages is not tied to a VM
* lifetime and so should not be charged to a memcg.
*/
page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
if (!page)
return -ENOMEM;
vmx_l1d_flush_pages = page_address(page);
/*
* Initialize each page with a different pattern in
* order to protect against KSM in the nested
* virtualization case.
*/
for (i = 0; i < 1u << L1D_CACHE_ORDER; ++i) {
memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + 1,
PAGE_SIZE);
}
}
l1tf_vmx_mitigation = l1tf;
if (l1tf != VMENTER_L1D_FLUSH_NEVER)
static_branch_enable(&vmx_l1d_should_flush);
else
static_branch_disable(&vmx_l1d_should_flush);
if (l1tf == VMENTER_L1D_FLUSH_COND)
static_branch_enable(&vmx_l1d_flush_cond);
else
static_branch_disable(&vmx_l1d_flush_cond);
return 0;
}
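/* Map a vmentry_l1d_flush= string to its vmx_l1d_flush_state index. */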
static int vmentry_l1d_flush_parse(const char *s)
{
unsigned int i;
if (s) {
for (i = 0; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
if (vmentry_l1d_param[i].for_parse &&
sysfs_streq(s, vmentry_l1d_param[i].option))
return i;
}
}
return -EINVAL;
}
static int vmentry_l1d_flush_set(const char *s, const struct kernel_param *kp)
{
int l1tf, ret;
l1tf = vmentry_l1d_flush_parse(s);
if (l1tf < 0)
return l1tf;
if (!boot_cpu_has(X86_BUG_L1TF))
return 0;
/*
* Has vmx_init() run already? If not then this is the pre init
* parameter parsing. In that case just store the value and let
* vmx_init() do the proper setup after enable_ept has been
* established.
*/
if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
vmentry_l1d_flush_param = l1tf;
return 0;
}
mutex_lock(&vmx_l1d_flush_mutex);
ret = vmx_setup_l1d_flush(l1tf);
mutex_unlock(&vmx_l1d_flush_mutex);
return ret;
}
static int vmentry_l1d_flush_get(char *s, const struct kernel_param *kp)
{
if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
return sprintf(s, "???\n");
return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}
static const struct kernel_param_ops vmentry_l1d_flush_ops = {
.set = vmentry_l1d_flush_set,
.get = vmentry_l1d_flush_get,
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
static bool guest_state_valid(struct kvm_vcpu *vcpu);
static u32 vmx_segment_access_rights(struct kvm_segment *var);
static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
u32 msr, int type);
void vmx_vmexit(void);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
* We maintain a per-CPU linked list of the VMCSs loaded on that CPU. This is needed
* when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
*/
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
/*
* We maintain a per-CPU linked list of vCPUs, so in wakeup_handler() we
* can find which vCPU should be woken up.
*/
static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
struct vmcs_config vmcs_config;
struct vmx_capability vmx_capability;
#define VMX_SEGMENT_FIELD(seg) \
[VCPU_SREG_##seg] = { \
.selector = GUEST_##seg##_SELECTOR, \
.base = GUEST_##seg##_BASE, \
.limit = GUEST_##seg##_LIMIT, \
.ar_bytes = GUEST_##seg##_AR_BYTES, \
}
static const struct kvm_vmx_segment_field {
unsigned selector;
unsigned base;
unsigned limit;
unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
VMX_SEGMENT_FIELD(CS),
VMX_SEGMENT_FIELD(DS),
VMX_SEGMENT_FIELD(ES),
VMX_SEGMENT_FIELD(FS),
VMX_SEGMENT_FIELD(GS),
VMX_SEGMENT_FIELD(SS),
VMX_SEGMENT_FIELD(TR),
VMX_SEGMENT_FIELD(LDTR),
};
u64 host_efer;
/*
* Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
* will emulate SYSCALL in legacy mode if the vendor string in guest
* CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
* support this emulation, IA32_STAR must always be included in
* vmx_msr_index[], even in i386 builds.
*/
const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};
#if IS_ENABLED(CONFIG_HYPERV)
static bool __read_mostly enlightened_vmcs = true;
module_param(enlightened_vmcs, bool, 0444);
/* check_ept_pointer_match() must be called under protection of ept_pointer_lock. */
static void check_ept_pointer_match(struct kvm *kvm)
{
struct kvm_vcpu *vcpu;
u64 tmp_eptp = INVALID_PAGE;
int i;
kvm_for_each_vcpu(i, vcpu, kvm) {
if (!VALID_PAGE(tmp_eptp)) {
tmp_eptp = to_vmx(vcpu)->ept_pointer;
} else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
to_kvm_vmx(kvm)->ept_pointers_match
= EPT_POINTERS_MISMATCH;
return;
}
}
to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
}
static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
void *data)
{
struct kvm_tlb_range *range = data;
return hyperv_fill_flush_guest_mapping_list(flush, range->start_gfn,
range->pages);
}
static inline int __hv_remote_flush_tlb_with_range(struct kvm *kvm,
struct kvm_vcpu *vcpu, struct kvm_tlb_range *range)
{
u64 ept_pointer = to_vmx(vcpu)->ept_pointer;
/*
* The FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE hypercall needs the address
* of the base of the EPT PML4 table, so strip off the EPT configuration
* information.
*/
if (range)
return hyperv_flush_guest_mapping_range(ept_pointer & PAGE_MASK,
kvm_fill_hv_flush_list_func, (void *)range);
else
return hyperv_flush_guest_mapping(ept_pointer & PAGE_MASK);
}
static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
struct kvm_tlb_range *range)
{
struct kvm_vcpu *vcpu;
int ret = 0, i;
spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
check_ept_pointer_match(kvm);
if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
kvm_for_each_vcpu(i, vcpu, kvm) {
/* If ept_pointer is invalid pointer, bypass flush request. */
if (VALID_PAGE(to_vmx(vcpu)->ept_pointer))
ret |= __hv_remote_flush_tlb_with_range(
kvm, vcpu, range);
}
} else {
ret = __hv_remote_flush_tlb_with_range(kvm,
kvm_get_vcpu(kvm, 0), range);
}
spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
return ret;
}
static int hv_remote_flush_tlb(struct kvm *kvm)
{
return hv_remote_flush_tlb_with_range(kvm, NULL);
}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
/*
* Comment format: document - errata name - stepping - processor name.
* Taken from
* https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
*/
static u32 vmx_preemption_cpu_tfms[] = {
/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
0x000206E6,
/* 323056.pdf - AAX65 - C2 - Xeon L3406 */
/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020652,
/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
0x00020655,
/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
/*
* 320767.pdf - AAP86 - B1 -
* i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
*/
0x000106E5,
/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
0x000106A0,
/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
0x000106A1,
/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
0x000106A4,
/* 321333.pdf - AAM126 - D0 - Xeon 3500 */
/* 321324.pdf - AAK139 - D0 - Xeon 5500 */
/* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
0x000106A5,
/* Xeon E3-1220 V2 */
0x000306A8,
};
static inline bool cpu_has_broken_vmx_preemption_timer(void)
{
u32 eax = cpuid_eax(0x00000001), i;
/* Clear the reserved bits */
eax &= ~(0x3U << 14 | 0xfU << 28);
for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
if (eax == vmx_preemption_cpu_tfms[i])
return true;
return false;
}
static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
{
return flexpriority_enabled && lapic_in_kernel(vcpu);
}
static inline bool report_flexpriority(void)
{
return flexpriority_enabled;
}
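/*
* Return the index into vmx->guest_msrs of the slot shadowing @msr,
* or -1 if the MSR is not in the shared-MSR list.
*/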
static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
int i;
for (i = 0; i < vmx->nmsrs; ++i)
if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
return i;
return -1;
}
struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
{
int i;
i = __find_msr_index(vmx, msr);
if (i >= 0)
return &vmx->guest_msrs[i];
return NULL;
}
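/*
* VMCLEAR the VMCS (and its shadow VMCS, if one was launched) and mark
* the loaded_vmcs as not resident on any CPU.
*/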
void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
{
vmcs_clear(loaded_vmcs->vmcs);
if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
vmcs_clear(loaded_vmcs->shadow_vmcs);
loaded_vmcs->cpu = -1;
loaded_vmcs->launched = 0;
}
#ifdef CONFIG_KEXEC_CORE
/*
* This bitmap indicates, per CPU, whether the crash-time vmclear
* operation is enabled. All CPUs are disabled by default.
*/
static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
static inline void crash_enable_local_vmclear(int cpu)
{
cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
}
static inline void crash_disable_local_vmclear(int cpu)
{
cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
}
static inline int crash_local_vmclear_enabled(int cpu)
{
return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
}
static void crash_vmclear_local_loaded_vmcss(void)
{
int cpu = raw_smp_processor_id();
struct loaded_vmcs *v;
if (!crash_local_vmclear_enabled(cpu))
return;
list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
loaded_vmcss_on_cpu_link)
vmcs_clear(v->vmcs);
}
#else
static inline void crash_enable_local_vmclear(int cpu) { }
static inline void crash_disable_local_vmclear(int cpu) { }
#endif /* CONFIG_KEXEC_CORE */
static void __loaded_vmcs_clear(void *arg)
{
struct loaded_vmcs *loaded_vmcs = arg;
int cpu = raw_smp_processor_id();
if (loaded_vmcs->cpu != cpu)
return; /* vcpu migration can race with cpu offline */
if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
per_cpu(current_vmcs, cpu) = NULL;
crash_disable_local_vmclear(cpu);
list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
/*
* Ensure that loaded_vmcs->loaded_vmcss_on_cpu_link is updated
* before loaded_vmcs->cpu is set to -1, which is done in
* loaded_vmcs_init(). Otherwise, another CPU could see cpu == -1
* first and add the VMCS to its per-CPU list before it is deleted here.
*/
smp_wmb();
loaded_vmcs_init(loaded_vmcs);
crash_enable_local_vmclear(cpu);
}
void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
int cpu = loaded_vmcs->cpu;
if (cpu != -1)
smp_call_function_single(cpu,
__loaded_vmcs_clear, loaded_vmcs, 1);
}
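/*
* Test-and-set a bit in the segment cache bitmask. Returns true if the
* field is already cached; false means the caller must (re)read it from
* the VMCS. The whole cache is reset whenever VCPU_EXREG_SEGMENTS is not
* marked available.
*/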
static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
unsigned field)
{
bool ret;
u32 mask = 1 << (seg * SEG_FIELD_NR + field);
if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
vmx->segment_cache.bitmask = 0;
}
ret = vmx->segment_cache.bitmask & mask;
vmx->segment_cache.bitmask |= mask;
return ret;
}
static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
{
u16 *p = &vmx->segment_cache.seg[seg].selector;
if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
*p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
return *p;
}
static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
{
ulong *p = &vmx->segment_cache.seg[seg].base;
if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
*p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
return *p;
}
static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
{
u32 *p = &vmx->segment_cache.seg[seg].limit;
if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
*p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
return *p;
}
static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
{
u32 *p = &vmx->segment_cache.seg[seg].ar;
if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
*p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
return *p;
}
void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
u32 eb;
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
(1u << DB_VECTOR) | (1u << AC_VECTOR);
/*
* Guest access to VMware backdoor ports could legitimately
* trigger #GP because of TSS I/O permission bitmap.
* We intercept those #GP and allow access to them anyway
* as VMware does.
*/
if (enable_vmware_backdoor)
eb |= (1u << GP_VECTOR);
if ((vcpu->guest_debug &
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
eb |= 1u << BP_VECTOR;
if (to_vmx(vcpu)->rmode.vm86_active)
eb = ~0;
if (enable_ept)
eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
/* When we are running a nested L2 guest and L1 specified for it a
* certain exception bitmap, we must trap the same exceptions and pass
* them to L1. When running L2, we will only handle the exceptions
* specified above if L1 did not want them.
*/
if (is_guest_mode(vcpu))
eb |= get_vmcs12(vcpu)->exception_bitmap;
vmcs_write32(EXCEPTION_BITMAP, eb);
}
/*
* Check whether a write to the MSR is intercepted by the currently loaded MSR bitmap.
*/
static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
{
unsigned long *msr_bitmap;
int f = sizeof(unsigned long);
if (!cpu_has_vmx_msr_bitmap())
return true;
msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
if (msr <= 0x1fff) {
return !!test_bit(msr, msr_bitmap + 0x800 / f);
} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
msr &= 0x1fff;
return !!test_bit(msr, msr_bitmap + 0xc00 / f);
}
return true;
}
static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
unsigned long entry, unsigned long exit)
{
vm_entry_controls_clearbit(vmx, entry);
vm_exit_controls_clearbit(vmx, exit);
}
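/* Return the index of @msr in the autoload/autostore array, or -ENOENT. */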
static int find_msr(struct vmx_msrs *m, unsigned int msr)
{
unsigned int i;
for (i = 0; i < m->nr; ++i) {
if (m->val[i].index == msr)
return i;
}
return -ENOENT;
}
static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
{
int i;
struct msr_autoload *m = &vmx->msr_autoload;
switch (msr) {
case MSR_EFER:
if (cpu_has_load_ia32_efer()) {
clear_atomic_switch_msr_special(vmx,
VM_ENTRY_LOAD_IA32_EFER,
VM_EXIT_LOAD_IA32_EFER);
return;
}
break;
case MSR_CORE_PERF_GLOBAL_CTRL:
if (cpu_has_load_perf_global_ctrl()) {
clear_atomic_switch_msr_special(vmx,
VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
return;
}
break;
}
i = find_msr(&m->guest, msr);
if (i < 0)
goto skip_guest;
--m->guest.nr;
m->guest.val[i] = m->guest.val[m->guest.nr];
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
skip_guest:
i = find_msr(&m->host, msr);
if (i < 0)
return;
--m->host.nr;
m->host.val[i] = m->host.val[m->host.nr];
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
}
static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
unsigned long entry, unsigned long exit,
unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
u64 guest_val, u64 host_val)
{
vmcs_write64(guest_val_vmcs, guest_val);
if (host_val_vmcs != HOST_IA32_EFER)
vmcs_write64(host_val_vmcs, host_val);
vm_entry_controls_setbit(vmx, entry);
vm_exit_controls_setbit(vmx, exit);
}
static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
u64 guest_val, u64 host_val, bool entry_only)
{
int i, j = 0;
struct msr_autoload *m = &vmx->msr_autoload;
switch (msr) {
case MSR_EFER:
if (cpu_has_load_ia32_efer()) {
add_atomic_switch_msr_special(vmx,
VM_ENTRY_LOAD_IA32_EFER,
VM_EXIT_LOAD_IA32_EFER,
GUEST_IA32_EFER,
HOST_IA32_EFER,
guest_val, host_val);
return;
}
break;
case MSR_CORE_PERF_GLOBAL_CTRL:
if (cpu_has_load_perf_global_ctrl()) {
add_atomic_switch_msr_special(vmx,
VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
GUEST_IA32_PERF_GLOBAL_CTRL,
HOST_IA32_PERF_GLOBAL_CTRL,
guest_val, host_val);
return;
}
break;
case MSR_IA32_PEBS_ENABLE:
/* PEBS needs a quiescent period after being disabled (to write
* a record). Disabling PEBS through VMX MSR swapping doesn't
* provide that period, so a CPU could write host's record into
* guest's memory.
*/
wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
}
i = find_msr(&m->guest, msr);
if (!entry_only)
j = find_msr(&m->host, msr);
if ((i < 0 && m->guest.nr == NR_AUTOLOAD_MSRS) ||
(j < 0 && m->host.nr == NR_AUTOLOAD_MSRS)) {
printk_once(KERN_WARNING "Not enough msr switch entries. "
"Can't add msr %x\n", msr);
return;
}
if (i < 0) {
i = m->guest.nr++;
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
}
m->guest.val[i].index = msr;
m->guest.val[i].value = guest_val;
if (entry_only)
return;
if (j < 0) {
j = m->host.nr++;
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
}
m->host.val[j].index = msr;
m->host.val[j].value = host_val;
}
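/*
* Decide how EFER is switched across VM entry/exit. Returns false if EFER
* is handled atomically (via the load-IA32_EFER VMCS controls or the MSR
* autoload lists), true if it must go through the shared guest_msrs path.
*/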
static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
{
u64 guest_efer = vmx->vcpu.arch.efer;
u64 ignore_bits = 0;
if (!enable_ept) {
/*
* NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing
* host CPUID is more efficient than testing guest CPUID
* or CR4. Host SMEP is anyway a requirement for guest SMEP.
*/
if (boot_cpu_has(X86_FEATURE_SMEP))
guest_efer |= EFER_NX;
else if (!(guest_efer & EFER_NX))
ignore_bits |= EFER_NX;
}
/*
* LMA and LME handled by hardware; SCE meaningless outside long mode.
*/
ignore_bits |= EFER_SCE;
#ifdef CONFIG_X86_64
ignore_bits |= EFER_LMA | EFER_LME;
/* SCE is meaningful only in long mode on Intel */
if (guest_efer & EFER_LMA)
ignore_bits &= ~(u64)EFER_SCE;
#endif
/*
* On EPT, we can't emulate NX, so we must switch EFER atomically.
* On CPUs that support "load IA32_EFER", always switch EFER
* atomically, since it's faster than switching it manually.
*/
if (cpu_has_load_ia32_efer() ||
(enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
if (!(guest_efer & EFER_LMA))
guest_efer &= ~EFER_LME;
if (guest_efer != host_efer)
add_atomic_switch_msr(vmx, MSR_EFER,
guest_efer, host_efer, false);
else
clear_atomic_switch_msr(vmx, MSR_EFER);
return false;
} else {
clear_atomic_switch_msr(vmx, MSR_EFER);
guest_efer &= ~ignore_bits;
guest_efer |= host_efer & ignore_bits;
vmx->guest_msrs[efer_offset].data = guest_efer;
vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
return true;
}
}
#ifdef CONFIG_X86_32
/*
* On 32-bit kernels, VM exits still load the FS and GS bases from the
* VMCS rather than the segment table. KVM uses this helper to figure
* out the current bases to poke them into the VMCS before entry.
*/
static unsigned long segment_base(u16 selector)
{
struct desc_struct *table;
unsigned long v;
if (!(selector & ~SEGMENT_RPL_MASK))
return 0;
table = get_current_gdt_ro();
if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
u16 ldt_selector = kvm_read_ldt();
if (!(ldt_selector & ~SEGMENT_RPL_MASK))
return 0;
table = (struct desc_struct *)segment_base(ldt_selector);
}
v = get_desc_base(&table[selector >> 3]);
return v;
}
#endif
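/*
* pt_load_msr()/pt_save_msr() write/read a Processor Trace context: the
* RTIT status, output and CR3-match MSRs plus @addr_range address-range
* pairs.
*/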
static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
{
u32 i;
wrmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
wrmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
for (i = 0; i < addr_range; i++) {
wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
}
}
static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
{
u32 i;
rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
for (i = 0; i < addr_range; i++) {
rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * 2, ctx->addr_a[i]);
rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * 2, ctx->addr_b[i]);
}
}
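/*
* Switch the Processor Trace context from host to guest before VM entry.
* This is a no-op in system mode, where the host owns PT.
*/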
static void pt_guest_enter(struct vcpu_vmx *vmx)
{
if (pt_mode == PT_MODE_SYSTEM)
return;
/*
* GUEST_IA32_RTIT_CTL is already set in the VMCS.
* Save host state before VM entry.
*/
rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
wrmsrl(MSR_IA32_RTIT_CTL, 0);
pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
}
}
static void pt_guest_exit(struct vcpu_vmx *vmx)
{
if (pt_mode == PT_MODE_SYSTEM)
return;
if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.addr_range);
pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.addr_range);
}
/* Reload host state (IA32_RTIT_CTL will be cleared on VM exit). */
wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
}
void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs_host_state *host_state;
#ifdef CONFIG_X86_64
int cpu = raw_smp_processor_id();
#endif
unsigned long fs_base, gs_base;
u16 fs_sel, gs_sel;
int i;
vmx->req_immediate_exit = false;
/*
* Note that guest MSRs to be saved/restored can also be changed
* when guest state is loaded. This happens when guest transitions
* to/from long-mode by setting MSR_EFER.LMA.
*/
if (!vmx->loaded_cpu_state || vmx->guest_msrs_dirty) {
vmx->guest_msrs_dirty = false;
for (i = 0; i < vmx->save_nmsrs; ++i)
kvm_set_shared_msr(vmx->guest_msrs[i].index,
vmx->guest_msrs[i].data,
vmx->guest_msrs[i].mask);
}
if (vmx->loaded_cpu_state)
return;
vmx->loaded_cpu_state = vmx->loaded_vmcs;
host_state = &vmx->loaded_cpu_state->host_state;
/*
* Set host fs and gs selectors. Unfortunately, 22.2.3 does not
* allow segment selectors with cpl > 0 or ti == 1.
*/
host_state->ldt_sel = kvm_read_ldt();
#ifdef CONFIG_X86_64
savesegment(ds, host_state->ds_sel);
savesegment(es, host_state->es_sel);
gs_base = cpu_kernelmode_gs_base(cpu);
if (likely(is_64bit_mm(current->mm))) {
save_fsgs_for_kvm();
fs_sel = current->thread.fsindex;
gs_sel = current->thread.gsindex;
fs_base = current->thread.fsbase;
vmx->msr_host_kernel_gs_base = current->thread.gsbase;
} else {
savesegment(fs, fs_sel);
savesegment(gs, gs_sel);
fs_base = read_msr(MSR_FS_BASE);
vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
}
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#else
savesegment(fs, fs_sel);
savesegment(gs, gs_sel);
fs_base = segment_base(fs_sel);
gs_base = segment_base(gs_sel);
#endif
if (unlikely(fs_sel != host_state->fs_sel)) {
if (!(fs_sel & 7))
vmcs_write16(HOST_FS_SELECTOR, fs_sel);
else
vmcs_write16(HOST_FS_SELECTOR, 0);
host_state->fs_sel = fs_sel;
}
if (unlikely(gs_sel != host_state->gs_sel)) {
if (!(gs_sel & 7))
vmcs_write16(HOST_GS_SELECTOR, gs_sel);
else
vmcs_write16(HOST_GS_SELECTOR, 0);
host_state->gs_sel = gs_sel;
}
if (unlikely(fs_base != host_state->fs_base)) {
vmcs_writel(HOST_FS_BASE, fs_base);
host_state->fs_base = fs_base;
}
if (unlikely(gs_base != host_state->gs_base)) {
vmcs_writel(HOST_GS_BASE, gs_base);
host_state->gs_base = gs_base;
}
}
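/*
* Undo vmx_prepare_switch_to_guest(): restore the host's LDT, FS/GS/DS/ES
* selectors, kernel GS base and fixmap GDT after running the guest.
*/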
static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
{
struct vmcs_host_state *host_state;
if (!vmx->loaded_cpu_state)
return;
WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs);
host_state = &vmx->loaded_cpu_state->host_state;
++vmx->vcpu.stat.host_state_reload;
vmx->loaded_cpu_state = NULL;
#ifdef CONFIG_X86_64
rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
kvm_load_ldt(host_state->ldt_sel);
#ifdef CONFIG_X86_64
load_gs_index(host_state->gs_sel);
#else
loadsegment(gs, host_state->gs_sel);
#endif
}
if (host_state->fs_sel & 7)
loadsegment(fs, host_state->fs_sel);
#ifdef CONFIG_X86_64
if (unlikely(host_state->ds_sel | host_state->es_sel)) {
loadsegment(ds, host_state->ds_sel);
loadsegment(es, host_state->es_sel);
}
#endif
invalidate_tss_limit();
#ifdef CONFIG_X86_64
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
load_fixmap_gdt(raw_smp_processor_id());
}
#ifdef CONFIG_X86_64
static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
{
preempt_disable();
if (vmx->loaded_cpu_state)
rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
preempt_enable();
return vmx->msr_guest_kernel_gs_base;
}
static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
{
preempt_disable();
if (vmx->loaded_cpu_state)
wrmsrl(MSR_KERNEL_GS_BASE, data);
preempt_enable();
vmx->msr_guest_kernel_gs_base = data;
}
#endif
static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
struct pi_desc old, new;
unsigned int dest;
/*
* In case of hot-plug or hot-unplug, we may have to undo
* vmx_vcpu_pi_put even if there is no assigned device. And we
* always keep PI.NDST up to date for simplicity: it makes the
* code easier, and CPU migration is not a fast path.
*/
if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
return;
/* The full case. */
do {
old.control = new.control = pi_desc->control;
dest = cpu_physical_id(cpu);
if (x2apic_enabled())
new.ndst = dest;
else
new.ndst = (dest << 8) & 0xFF00;
new.sn = 0;
} while (cmpxchg64(&pi_desc->control, old.control,
new.control) != old.control);
/*
* Clear SN before reading the bitmap. The VT-d firmware
* writes the bitmap and reads SN atomically (5.2.3 in the
* spec), so it doesn't really have a memory barrier that
* pairs with this, but we cannot do that and we need one.
*/
smp_mb__after_atomic();
if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS))
pi_set_on(pi_desc);
}
/*
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
*/
void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
if (!already_loaded) {
loaded_vmcs_clear(vmx->loaded_vmcs);
local_irq_disable();
crash_disable_local_vmclear(cpu);
/*
* Read loaded_vmcs->cpu should be before fetching
* loaded_vmcs->loaded_vmcss_on_cpu_link.
* See the comments in __loaded_vmcs_clear().
*/
smp_rmb();
list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
&per_cpu(loaded_vmcss_on_cpu, cpu));
crash_enable_local_vmclear(cpu);
local_irq_enable();
}
if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
vmcs_load(vmx->loaded_vmcs->vmcs);
indirect_branch_prediction_barrier();
}
if (!already_loaded) {
void *gdt = get_current_gdt_ro();
unsigned long sysenter_esp;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
/*
* Linux uses per-cpu TSS and GDT, so set these when switching
* processors. See 22.2.4.
*/
vmcs_writel(HOST_TR_BASE,
(unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
/*
* VM exits change the host TR limit to 0x67 after a VM
* exit. This is okay, since 0x67 covers everything except
* the IO bitmap, and we have code to handle the IO bitmap
* being lost after a VM exit.
*/
BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
vmx->loaded_vmcs->cpu = cpu;
}
/* Setup TSC multiplier */
if (kvm_has_tsc_control &&
vmx->current_tsc_ratio != vcpu->arch.tsc_scaling_ratio)
decache_tsc_multiplier(vmx);
vmx_vcpu_pi_load(vcpu, cpu);
vmx->host_pkru = read_pkru();
vmx->host_debugctlmsr = get_debugctlmsr();
}
static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
!irq_remapping_cap(IRQ_POSTING_CAP) ||
!kvm_vcpu_apicv_active(vcpu))
return;
/* Set SN when the vCPU is preempted */
if (vcpu->preempted)
pi_set_sn(pi_desc);
}
void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
vmx_vcpu_pi_put(vcpu);
vmx_prepare_switch_to_host(to_vmx(vcpu));
}
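/* True if the guest's segment state is invalid and must be emulated. */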
static bool emulation_required(struct kvm_vcpu *vcpu)
{
return emulate_invalid_guest_state && !guest_state_valid(vcpu);
}
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
unsigned long rflags, save_rflags;
if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
__set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
rflags = vmcs_readl(GUEST_RFLAGS);
if (to_vmx(vcpu)->rmode.vm86_active) {
rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
save_rflags = to_vmx(vcpu)->rmode.save_rflags;
rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
}
to_vmx(vcpu)->rflags = rflags;
}
return to_vmx(vcpu)->rflags;
}
void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
unsigned long old_rflags = vmx_get_rflags(vcpu);
__set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
to_vmx(vcpu)->rflags = rflags;
if (to_vmx(vcpu)->rmode.vm86_active) {
to_vmx(vcpu)->rmode.save_rflags = rflags;
rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
}
vmcs_writel(GUEST_RFLAGS, rflags);
if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
}
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
int ret = 0;
if (interruptibility & GUEST_INTR_STATE_STI)
ret |= KVM_X86_SHADOW_INT_STI;
if (interruptibility & GUEST_INTR_STATE_MOV_SS)
ret |= KVM_X86_SHADOW_INT_MOV_SS;
return ret;
}
void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
u32 interruptibility = interruptibility_old;
interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
if (mask & KVM_X86_SHADOW_INT_MOV_SS)
interruptibility |= GUEST_INTR_STATE_MOV_SS;
else if (mask & KVM_X86_SHADOW_INT_STI)
interruptibility |= GUEST_INTR_STATE_STI;
if ((interruptibility != interruptibility_old))
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
}
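/*
* Validate a guest WRMSR to IA32_RTIT_CTL against the Processor Trace
* capabilities exposed to the guest. Returns 1 if the write should #GP,
* 0 if it is legal.
*/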
static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long value;
/*
* Any MSR write that attempts to change bits marked reserved will
* cause a #GP fault.
*/
if (data & vmx->pt_desc.ctl_bitmask)
return 1;
/*
* Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
* result in a #GP unless the same write also clears TraceEn.
*/
if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
return 1;
/*
* A WRMSR to IA32_RTIT_CTL that sets TraceEn but clears ToPA
* and FabricEn causes a #GP if
* CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
*/
if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
!(data & RTIT_CTL_FABRIC_EN) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_single_range_output))
return 1;
/*
* MTCFreq, CycThresh and PSBFreq encoding checks: any MSR write that
* uses an encoding marked reserved will cause a #GP fault.
*/
value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
!test_bit((data & RTIT_CTL_MTC_RANGE) >>
RTIT_CTL_MTC_RANGE_OFFSET, &value))
return 1;
value = intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_cycle_thresholds);
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
!test_bit((data & RTIT_CTL_CYC_THRESH) >>
RTIT_CTL_CYC_THRESH_OFFSET, &value))
return 1;
value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_periods);
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_psb_cyc) &&
!test_bit((data & RTIT_CTL_PSB_FREQ) >>
RTIT_CTL_PSB_FREQ_OFFSET, &value))
return 1;
/*
* An ADDRx_CFG value that is reserved or greater than 2 will
* cause a #GP fault.
*/
value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
if ((value && (vmx->pt_desc.addr_range < 1)) || (value > 2))
return 1;
value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
if ((value && (vmx->pt_desc.addr_range < 2)) || (value > 2))
return 1;
value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
if ((value && (vmx->pt_desc.addr_range < 3)) || (value > 2))
return 1;
value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
if ((value && (vmx->pt_desc.addr_range < 4)) || (value > 2))
return 1;
return 0;
}
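/* Advance RIP past the exiting instruction and drop any interrupt shadow. */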
static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
unsigned long rip;
rip = kvm_rip_read(vcpu);
rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
kvm_rip_write(vcpu, rip);
/* skipping an emulated instruction also counts */
vmx_set_interrupt_shadow(vcpu, 0);
}
static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
{
/*
* Ensure that we clear the HLT state in the VMCS. We don't need to
* explicitly skip the instruction because if the HLT state is set,
* then the instruction is already executing and RIP has already been
* advanced.
*/
if (kvm_hlt_in_guest(vcpu->kvm) &&
vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}
static void vmx_queue_exception(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned nr = vcpu->arch.exception.nr;
bool has_error_code = vcpu->arch.exception.has_error_code;
u32 error_code = vcpu->arch.exception.error_code;
u32 intr_info = nr | INTR_INFO_VALID_MASK;
kvm_deliver_exception_payload(vcpu);
if (has_error_code) {
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
}
if (vmx->rmode.vm86_active) {
int inc_eip = 0;
if (kvm_exception_is_soft(nr))
inc_eip = vcpu->arch.event_exit_inst_len;
if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
WARN_ON_ONCE(vmx->emulation_required);
if (kvm_exception_is_soft(nr)) {
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
vmx->vcpu.arch.event_exit_inst_len);
intr_info |= INTR_TYPE_SOFT_EXCEPTION;
} else
intr_info |= INTR_TYPE_HARD_EXCEPTION;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
vmx_clear_hlt(vcpu);
}
static bool vmx_rdtscp_supported(void)
{
return cpu_has_vmx_rdtscp();
}
static bool vmx_invpcid_supported(void)
{
return cpu_has_vmx_invpcid();
}
/*
* Swap MSR entry in host/guest MSR entry array.
*/
static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
{
struct shared_msr_entry tmp;
tmp = vmx->guest_msrs[to];
vmx->guest_msrs[to] = vmx->guest_msrs[from];
vmx->guest_msrs[from] = tmp;
}
/*
* Set up the vmcs to automatically save and restore system
* msrs. Don't touch the 64-bit msrs if the guest is in legacy
* mode, as fiddling with msrs is very expensive.
*/
static void setup_msrs(struct vcpu_vmx *vmx)
{
int save_nmsrs, index;
save_nmsrs = 0;
#ifdef CONFIG_X86_64
/*
* The SYSCALL MSRs are only needed on long mode guests, and only
* when EFER.SCE is set.
*/
if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
index = __find_msr_index(vmx, MSR_STAR);
if (index >= 0)
move_msr_up(vmx, index, save_nmsrs++);
index = __find_msr_index(vmx, MSR_LSTAR);
if (index >= 0)
move_msr_up(vmx, index, save_nmsrs++);
index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
if (index >= 0)
move_msr_up(vmx, index, save_nmsrs++);
}
#endif
index = __find_msr_index(vmx, MSR_EFER);
if (index >= 0 && update_transition_efer(vmx, index))
move_msr_up(vmx, index, save_nmsrs++);
index = __find_msr_index(vmx, MSR_TSC_AUX);
if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
move_msr_up(vmx, index, save_nmsrs++);
vmx->save_nmsrs = save_nmsrs;
vmx->guest_msrs_dirty = true;
if (cpu_has_vmx_msr_bitmap())
vmx_update_msr_bitmap(&vmx->vcpu);
}
static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
if (is_guest_mode(vcpu) &&
(vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
return vcpu->arch.tsc_offset;
}
static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
u64 g_tsc_offset = 0;
/*
* We're here if L1 chose not to trap WRMSR to TSC. According
* to the spec, this should set L1's TSC; The offset that L1
* set for L2 remains unchanged, and still needs to be added
* to the newly set TSC to get L2's TSC.
*/
if (is_guest_mode(vcpu) &&
(vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
g_tsc_offset = vmcs12->tsc_offset;
trace_kvm_write_tsc_offset(vcpu->vcpu_id,
vcpu->arch.tsc_offset - g_tsc_offset,
offset);
vmcs_write64(TSC_OFFSET, offset + g_tsc_offset);
return offset + g_tsc_offset;
}
/*
* nested_vmx_allowed() checks whether a guest should be allowed to use VMX
* instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
* all guests if the "nested" module option is off, and can also be disabled
* for a single guest by disabling its VMX cpuid bit.
*/
bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
{
return nested && guest_cpuid_has(vcpu, X86_FEATURE_VMX);
}
static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
uint64_t val)
{
uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
return !(val & ~valid_bits);
}
static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
{
switch (msr->index) {
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!nested)
return 1;
return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
default:
return 1;
}
return 0;
}
/*
* Reads the requested MSR value into msr_info->data.
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct shared_msr_entry *msr;
u32 index;
switch (msr_info->index) {
#ifdef CONFIG_X86_64
case MSR_FS_BASE:
msr_info->data = vmcs_readl(GUEST_FS_BASE);
break;
case MSR_GS_BASE:
msr_info->data = vmcs_readl(GUEST_GS_BASE);
break;
case MSR_KERNEL_GS_BASE:
msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
break;
#endif
case MSR_EFER:
return kvm_get_msr_common(vcpu, msr_info);
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
return 1;
msr_info->data = to_vmx(vcpu)->spec_ctrl;
break;
case MSR_IA32_SYSENTER_CS:
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
break;
case MSR_IA32_SYSENTER_EIP:
msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
break;
case MSR_IA32_SYSENTER_ESP:
msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
break;
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
return 1;
msr_info->data = vmcs_read64(GUEST_BNDCFGS);
break;
case MSR_IA32_MCG_EXT_CTL:
if (!msr_info->host_initiated &&
!(vmx->msr_ia32_feature_control &
FEATURE_CONTROL_LMCE))
return 1;
msr_info->data = vcpu->arch.mcg_ext_ctl;
break;
case MSR_IA32_FEATURE_CONTROL:
msr_info->data = vmx->msr_ia32_feature_control;
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!nested_vmx_allowed(vcpu))
return 1;
return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
&msr_info->data);
case MSR_IA32_XSS:
if (!vmx_xsaves_supported())
return 1;
msr_info->data = vcpu->arch.ia32_xss;
break;
case MSR_IA32_RTIT_CTL:
if (pt_mode != PT_MODE_HOST_GUEST)
return 1;
msr_info->data = vmx->pt_desc.guest.ctl;
break;
case MSR_IA32_RTIT_STATUS:
if (pt_mode != PT_MODE_HOST_GUEST)
return 1;
msr_info->data = vmx->pt_desc.guest.status;
break;
case MSR_IA32_RTIT_CR3_MATCH:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_cr3_filtering))
return 1;
msr_info->data = vmx->pt_desc.guest.cr3_match;
break;
case MSR_IA32_RTIT_OUTPUT_BASE:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
(!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_topa_output) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_single_range_output)))
return 1;
msr_info->data = vmx->pt_desc.guest.output_base;
break;
case MSR_IA32_RTIT_OUTPUT_MASK:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
(!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_topa_output) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_single_range_output)))
return 1;
msr_info->data = vmx->pt_desc.guest.output_mask;
break;
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
if ((pt_mode != PT_MODE_HOST_GUEST) ||
(index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_num_address_ranges)))
return 1;
if (index % 2)
msr_info->data = vmx->pt_desc.guest.addr_b[index / 2];
else
msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
break;
case MSR_TSC_AUX:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
/* Else, falls through */
default:
msr = find_msr_entry(vmx, msr_info->index);
if (msr) {
msr_info->data = msr->data;
break;
}
return kvm_get_msr_common(vcpu, msr_info);
}
return 0;
}
/*
* Writes the MSR value into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct shared_msr_entry *msr;
int ret = 0;
u32 msr_index = msr_info->index;
u64 data = msr_info->data;
u32 index;
switch (msr_index) {
case MSR_EFER:
ret = kvm_set_msr_common(vcpu, msr_info);
break;
#ifdef CONFIG_X86_64
case MSR_FS_BASE:
vmx_segment_cache_clear(vmx);
vmcs_writel(GUEST_FS_BASE, data);
break;
case MSR_GS_BASE:
vmx_segment_cache_clear(vmx);
vmcs_writel(GUEST_GS_BASE, data);
break;
case MSR_KERNEL_GS_BASE:
vmx_write_guest_kernel_gs_base(vmx, data);
break;
#endif
case MSR_IA32_SYSENTER_CS:
vmcs_write32(GUEST_SYSENTER_CS, data);
break;
case MSR_IA32_SYSENTER_EIP:
vmcs_writel(GUEST_SYSENTER_EIP, data);
break;
case MSR_IA32_SYSENTER_ESP:
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported() ||
(!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
return 1;
if (is_noncanonical_address(data & PAGE_MASK, vcpu) ||
(data & MSR_IA32_BNDCFGS_RSVD))
return 1;
vmcs_write64(GUEST_BNDCFGS, data);
break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
return 1;
/* The STIBP bit doesn't fault even if it's not advertised */
if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
return 1;
vmx->spec_ctrl = data;
if (!data)
break;
/*
* For non-nested:
* When it's written (to non-zero) for the first time, pass
* it through.
*
* For nested:
* The handling of the MSR bitmap for L2 guests is done in
* nested_vmx_merge_msr_bitmap. We should not touch the
* vmcs02.msr_bitmap here since it gets completely overwritten
* in the merging. We update the vmcs01 here for L1 as well
* since it will end up touching the MSR anyway now.
*/
vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
MSR_IA32_SPEC_CTRL,
MSR_TYPE_RW);
break;
case MSR_IA32_PRED_CMD:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
return 1;
if (data & ~PRED_CMD_IBPB)
return 1;
if (!data)
break;
wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
/*
* For non-nested:
* When it's written (to non-zero) for the first time, pass
* it through.
*
* For nested:
* The handling of the MSR bitmap for L2 guests is done in
* nested_vmx_merge_msr_bitmap. We should not touch the
* vmcs02.msr_bitmap here since it gets completely overwritten
* in the merging.
*/
vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
MSR_TYPE_W);
break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
return 1;
vmcs_write64(GUEST_IA32_PAT, data);
vcpu->arch.pat = data;
break;
}
ret = kvm_set_msr_common(vcpu, msr_info);
break;
case MSR_IA32_TSC_ADJUST:
ret = kvm_set_msr_common(vcpu, msr_info);
break;
case MSR_IA32_MCG_EXT_CTL:
if ((!msr_info->host_initiated &&
!(to_vmx(vcpu)->msr_ia32_feature_control &
FEATURE_CONTROL_LMCE)) ||
(data & ~MCG_EXT_CTL_LMCE_EN))
return 1;
vcpu->arch.mcg_ext_ctl = data;
break;
case MSR_IA32_FEATURE_CONTROL:
if (!vmx_feature_control_msr_valid(vcpu, data) ||
(to_vmx(vcpu)->msr_ia32_feature_control &
FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
return 1;
vmx->msr_ia32_feature_control = data;
if (msr_info->host_initiated && data == 0)
vmx_leave_nested(vcpu);
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!msr_info->host_initiated)
return 1; /* they are read-only */
if (!nested_vmx_allowed(vcpu))
return 1;
return vmx_set_vmx_msr(vcpu, msr_index, data);
case MSR_IA32_XSS:
if (!vmx_xsaves_supported())
return 1;
/*
* The only supported bit as of Skylake is bit 8, but
* it is not supported in KVM.
*/
if (data != 0)
return 1;
vcpu->arch.ia32_xss = data;
if (vcpu->arch.ia32_xss != host_xss)
add_atomic_switch_msr(vmx, MSR_IA32_XSS,
vcpu->arch.ia32_xss, host_xss, false);
else
clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
break;
case MSR_IA32_RTIT_CTL:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
vmx_rtit_ctl_check(vcpu, data) ||
vmx->nested.vmxon)
return 1;
vmcs_write64(GUEST_IA32_RTIT_CTL, data);
vmx->pt_desc.guest.ctl = data;
pt_update_intercept_for_msr(vmx);
break;
case MSR_IA32_RTIT_STATUS:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
(data & MSR_IA32_RTIT_STATUS_MASK))
return 1;
vmx->pt_desc.guest.status = data;
break;
case MSR_IA32_RTIT_CR3_MATCH:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_cr3_filtering))
return 1;
vmx->pt_desc.guest.cr3_match = data;
break;
case MSR_IA32_RTIT_OUTPUT_BASE:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
(!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_topa_output) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_single_range_output)) ||
(data & MSR_IA32_RTIT_OUTPUT_BASE_MASK))
return 1;
vmx->pt_desc.guest.output_base = data;
break;
case MSR_IA32_RTIT_OUTPUT_MASK:
if ((pt_mode != PT_MODE_HOST_GUEST) ||
(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
(!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_topa_output) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_single_range_output)))
return 1;
vmx->pt_desc.guest.output_mask = data;
break;
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
if ((pt_mode != PT_MODE_HOST_GUEST) ||
(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
(index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_num_address_ranges)))
return 1;
if (index % 2)
vmx->pt_desc.guest.addr_b[index / 2] = data;
else
vmx->pt_desc.guest.addr_a[index / 2] = data;
break;
case MSR_TSC_AUX:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
/* Check reserved bit, higher 32 bits should be zero */
if ((data >> 32) != 0)
return 1;
/* Else, falls through */
default:
msr = find_msr_entry(vmx, msr_index);
if (msr) {
u64 old_msr_data = msr->data;
msr->data = data;
if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
preempt_disable();
ret = kvm_set_shared_msr(msr->index, msr->data,
msr->mask);
preempt_enable();
if (ret)
msr->data = old_msr_data;
}
break;
}
ret = kvm_set_msr_common(vcpu, msr_info);
}
return ret;
}
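/* Read a register from the VMCS into the software register cache. */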
static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
switch (reg) {
case VCPU_REGS_RSP:
vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
break;
case VCPU_REGS_RIP:
vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
break;
case VCPU_EXREG_PDPTR:
if (enable_ept)
ept_save_pdptrs(vcpu);
break;
default:
break;
}
}
static __init int cpu_has_kvm_support(void)
{
return cpu_has_vmx();
}
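/*
* Returns 1 if IA32_FEATURE_CONTROL is locked with VMXON disabled for the
* current (TXT or non-TXT) launch environment, 0 otherwise.
*/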
static __init int vmx_disabled_by_bios(void)
{
u64 msr;
rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
if (msr & FEATURE_CONTROL_LOCKED) {
/* launched w/ TXT and VMX disabled */
if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
&& tboot_enabled())
return 1;
/* launched w/o TXT and VMX only enabled w/ TXT */
if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
&& (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
&& !tboot_enabled()) {
printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
"activate TXT before enabling KVM\n");
return 1;
}
/* launched w/o TXT and VMX disabled */
if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
&& !tboot_enabled())
return 1;
}
return 0;
}
static void kvm_cpu_vmxon(u64 addr)
{
cr4_set_bits(X86_CR4_VMXE);
intel_pt_handle_vmx(1);
asm volatile ("vmxon %0" : : "m"(addr));
}
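/*
* Per-CPU VMX enabling: initialize the per-CPU VMCS and vCPU lists, make
* sure IA32_FEATURE_CONTROL permits VMXON, execute VMXON on this CPU's
* vmxarea, and flush global EPT mappings.
*/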
static int hardware_enable(void)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
u64 old, test_bits;
if (cr4_read_shadow() & X86_CR4_VMXE)
return -EBUSY;
/*
* This can happen if we hot-added a CPU but failed to allocate
* VP assist page for it.
*/
if (static_branch_unlikely(&enable_evmcs) &&
!hv_get_vp_assist_page(cpu))
return -EFAULT;
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
/*
* Now we can enable the vmclear operation in kdump
* since the loaded_vmcss_on_cpu list on this cpu
* has been initialized.
*
* Even though the CPU is not in VMX operation yet, it is
* safe to enable the vmclear operation because the
* loaded_vmcss_on_cpu list is still empty.
*/
crash_enable_local_vmclear(cpu);
rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
test_bits = FEATURE_CONTROL_LOCKED;
test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
if (tboot_enabled())
test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
if ((old & test_bits) != test_bits) {
/* enable and lock */
wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
}
kvm_cpu_vmxon(phys_addr);
if (enable_ept)
ept_sync_global();
return 0;
}
static void vmclear_local_loaded_vmcss(void)
{
int cpu = raw_smp_processor_id();
struct loaded_vmcs *v, *n;
list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
loaded_vmcss_on_cpu_link)
__loaded_vmcs_clear(v);
}
/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
* tricks.
*/
static void kvm_cpu_vmxoff(void)
{
asm volatile (__ex("vmxoff"));
intel_pt_handle_vmx(0);
cr4_clear_bits(X86_CR4_VMXE);
}
static void hardware_disable(void)
{
vmclear_local_loaded_vmcss();
kvm_cpu_vmxoff();
}
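/*
* Combine the required (@ctl_min) and optional (@ctl_opt) control bits with
* what the hardware allows per the given capability MSR. Fails with -EIO
* if a required bit is not supported.
*/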
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
u32 msr, u32 *result)
{
u32 vmx_msr_low, vmx_msr_high;
u32 ctl = ctl_min | ctl_opt;
rdmsr(msr, vmx_msr_low, vmx_msr_high);
ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
/* Ensure minimum (required) set of control bits are supported. */
if (ctl_min & ~ctl)
return -EIO;
*result = ctl;
return 0;
}
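/*
 * Probe the VMX capability MSRs once at load time and record the pin-based,
 * primary and secondary processor-based, VM-exit and VM-entry control
 * settings that KVM will use, along with the VMCS size/revision and the
 * EPT/VPID capabilities.  Fails with -EIO if a required control is missing
 * or the VMCS does not use the write-back memory type.
 */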
static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
struct vmx_capability *vmx_cap)
{
u32 vmx_msr_low, vmx_msr_high;
u32 min, opt, min2, opt2;
u32 _pin_based_exec_control = 0;
u32 _cpu_based_exec_control = 0;
u32 _cpu_based_2nd_exec_control = 0;
u32 _vmexit_control = 0;
u32 _vmentry_control = 0;
memset(vmcs_conf, 0, sizeof(*vmcs_conf));
min = CPU_BASED_HLT_EXITING |
#ifdef CONFIG_X86_64
CPU_BASED_CR8_LOAD_EXITING |
CPU_BASED_CR8_STORE_EXITING |
#endif
CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_UNCOND_IO_EXITING |
CPU_BASED_MOV_DR_EXITING |
CPU_BASED_USE_TSC_OFFSETING |
CPU_BASED_MWAIT_EXITING |
CPU_BASED_MONITOR_EXITING |
CPU_BASED_INVLPG_EXITING |
CPU_BASED_RDPMC_EXITING;
opt = CPU_BASED_TPR_SHADOW |
CPU_BASED_USE_MSR_BITMAPS |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
&_cpu_based_exec_control) < 0)
return -EIO;
#ifdef CONFIG_X86_64
if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
_cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
~CPU_BASED_CR8_STORE_EXITING;
#endif
if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
min2 = 0;
opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_ENABLE_VPID |
SECONDARY_EXEC_ENABLE_EPT |
SECONDARY_EXEC_UNRESTRICTED_GUEST |
SECONDARY_EXEC_PAUSE_LOOP_EXITING |
SECONDARY_EXEC_DESC |
SECONDARY_EXEC_RDTSCP |
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_SHADOW_VMCS |
SECONDARY_EXEC_XSAVES |
SECONDARY_EXEC_RDSEED_EXITING |
SECONDARY_EXEC_RDRAND_EXITING |
SECONDARY_EXEC_ENABLE_PML |
SECONDARY_EXEC_TSC_SCALING |
SECONDARY_EXEC_PT_USE_GPA |
SECONDARY_EXEC_PT_CONCEAL_VMX |
SECONDARY_EXEC_ENABLE_VMFUNC |
SECONDARY_EXEC_ENCLS_EXITING;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
return -EIO;
}
#ifndef CONFIG_X86_64
if (!(_cpu_based_2nd_exec_control &
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
#endif
if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
_cpu_based_2nd_exec_control &= ~(
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
&vmx_cap->ept, &vmx_cap->vpid);
if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
is enabled */
_cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_INVLPG_EXITING);
} else if (vmx_cap->ept) {
vmx_cap->ept = 0;
pr_warn_once("EPT CAP should not exist if not support "
"1-setting enable EPT VM-execution control\n");
}
if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
vmx_cap->vpid) {
vmx_cap->vpid = 0;
pr_warn_once("VPID CAP should not exist if not support "
"1-setting enable VPID VM-execution control\n");
}
min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
#ifdef CONFIG_X86_64
min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
#endif
opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
VM_EXIT_SAVE_IA32_PAT |
VM_EXIT_LOAD_IA32_PAT |
VM_EXIT_LOAD_IA32_EFER |
VM_EXIT_CLEAR_BNDCFGS |
VM_EXIT_PT_CONCEAL_PIP |
VM_EXIT_CLEAR_IA32_RTIT_CTL;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
&_vmexit_control) < 0)
return -EIO;
min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
PIN_BASED_VMX_PREEMPTION_TIMER;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
&_pin_based_exec_control) < 0)
return -EIO;
if (cpu_has_broken_vmx_preemption_timer())
_pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
if (!(_cpu_based_2nd_exec_control &
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
VM_ENTRY_LOAD_IA32_PAT |
VM_ENTRY_LOAD_IA32_EFER |
VM_ENTRY_LOAD_BNDCFGS |
VM_ENTRY_PT_CONCEAL_PIP |
VM_ENTRY_LOAD_IA32_RTIT_CTL;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
&_vmentry_control) < 0)
return -EIO;
/*
* Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
* can't be used due to an erratum where a VM Exit may incorrectly clear
* IA32_PERF_GLOBAL_CTRL[34:32]. Work around the erratum by using the
* MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
*/
if (boot_cpu_data.x86 == 0x6) {
switch (boot_cpu_data.x86_model) {
case 26: /* AAK155 */
case 30: /* AAP115 */
case 37: /* AAT100 */
case 44: /* BC86,AAY89,BD102 */
case 46: /* BA97 */
_vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
_vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
"does not work properly. Using workaround\n");
break;
default:
break;
}
}
rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
return -EIO;
#ifdef CONFIG_X86_64
/* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
if (vmx_msr_high & (1u<<16))
return -EIO;
#endif
/* Require Write-Back (WB) memory type for VMCS accesses. */
if (((vmx_msr_high >> 18) & 15) != 6)
return -EIO;
vmcs_conf->size = vmx_msr_high & 0x1fff;
vmcs_conf->order = get_order(vmcs_conf->size);
vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
vmcs_conf->revision_id = vmx_msr_low;
vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
vmcs_conf->vmexit_ctrl = _vmexit_control;
vmcs_conf->vmentry_ctrl = _vmentry_control;
if (static_branch_unlikely(&enable_evmcs))
evmcs_sanitize_exec_ctrls(vmcs_conf);
return 0;
}
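/*
 * Allocate a VMCS from memory local to @cpu's NUMA node and stamp its
 * header with the revision id the hardware reports (or with the
 * enlightened VMCS version when eVMCS is active).
 */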
struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
{
int node = cpu_to_node(cpu);
struct page *pages;
struct vmcs *vmcs;
pages = __alloc_pages_node(node, flags, vmcs_config.order);
if (!pages)
return NULL;
vmcs = page_address(pages);
memset(vmcs, 0, vmcs_config.size);
/* KVM supports Enlightened VMCS v1 only */
if (static_branch_unlikely(&enable_evmcs))
vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
else
vmcs->hdr.revision_id = vmcs_config.revision_id;
if (shadow)
vmcs->hdr.shadow_vmcs = 1;
return vmcs;
}
void free_vmcs(struct vmcs *vmcs)
{
free_pages((unsigned long)vmcs, vmcs_config.order);
}
/*
* Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
*/
void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
{
if (!loaded_vmcs->vmcs)
return;
loaded_vmcs_clear(loaded_vmcs);
free_vmcs(loaded_vmcs->vmcs);
loaded_vmcs->vmcs = NULL;
if (loaded_vmcs->msr_bitmap)
free_page((unsigned long)loaded_vmcs->msr_bitmap);
WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
}
int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
{
loaded_vmcs->vmcs = alloc_vmcs(false);
if (!loaded_vmcs->vmcs)
return -ENOMEM;
loaded_vmcs->shadow_vmcs = NULL;
loaded_vmcs_init(loaded_vmcs);
if (cpu_has_vmx_msr_bitmap()) {
loaded_vmcs->msr_bitmap = (unsigned long *)
__get_free_page(GFP_KERNEL_ACCOUNT);
if (!loaded_vmcs->msr_bitmap)
goto out_vmcs;
memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
if (IS_ENABLED(CONFIG_HYPERV) &&
static_branch_unlikely(&enable_evmcs) &&
(ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
struct hv_enlightened_vmcs *evmcs =
(struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
evmcs->hv_enlightenments_control.msr_bitmap = 1;
}
}
memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
return 0;
out_vmcs:
free_loaded_vmcs(loaded_vmcs);
return -ENOMEM;
}
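/*
 * The per-cpu "vmxarea" pages managed below are the VMXON regions that
 * hardware_enable() hands to VMXON, one for each possible CPU.
 */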
static void free_kvm_area(void)
{
int cpu;
for_each_possible_cpu(cpu) {
free_vmcs(per_cpu(vmxarea, cpu));
per_cpu(vmxarea, cpu) = NULL;
}
}
static __init int alloc_kvm_area(void)
{
int cpu;
for_each_possible_cpu(cpu) {
struct vmcs *vmcs;
vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
if (!vmcs) {
free_kvm_area();
return -ENOMEM;
}
/*
* When eVMCS is enabled, alloc_vmcs_cpu() sets
* vmcs->revision_id to KVM_EVMCS_VERSION instead of the
* revision_id reported by MSR_IA32_VMX_BASIC.
*
* However, even though it is not explicitly documented by the
* TLFS, the VMXON region passed as the VMXON argument should
* still be marked with the revision_id reported by the
* physical CPU.
*/
if (static_branch_unlikely(&enable_evmcs))
vmcs->hdr.revision_id = vmcs_config.revision_id;
per_cpu(vmxarea, cpu) = vmcs;
}
return 0;
}
static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
struct kvm_segment *save)
{
if (!emulate_invalid_guest_state) {
/*
* CS and SS RPL should be equal during guest entry according
* to the VMX spec, but in reality that is not always the case. Since
* the vcpu is in the middle of the transition from real mode to
* protected mode, it is safe to assume that RPL 0 is a good
* default value.
*/
if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
save->selector &= ~SEGMENT_RPL_MASK;
save->dpl = save->selector & SEGMENT_RPL_MASK;
save->s = 1;
}
vmx_set_segment(vcpu, save, seg);
}
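/*
 * Undo the vm86-based real-mode emulation set up by enter_rmode(): restore
 * the segment registers and RFLAGS bits saved when vm86 mode was entered,
 * restore CR4.VME to the value in the guest's CR4 read shadow, and
 * recompute the exception bitmap.
 */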
static void enter_pmode(struct kvm_vcpu *vcpu)
{
unsigned long flags;
struct vcpu_vmx *vmx = to_vmx(vcpu);
/*
* Update the real-mode segment cache. It may not be up to date if a
* segment register was written while the vcpu was in guest mode.
*/
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
vmx->rmode.vm86_active = 0;
vmx_segment_cache_clear(vmx);
vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
flags = vmcs_readl(GUEST_RFLAGS);
flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
vmcs_writel(GUEST_RFLAGS, flags);
vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
update_exception_bitmap(vcpu);
fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
}
static void fix_rmode_seg(int seg, struct kvm_segment *save)
{
const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
struct kvm_segment var = *save;
var.dpl = 0x3;
if (seg == VCPU_SREG_CS)
var.type = 0x3;
if (!emulate_invalid_guest_state) {
var.selector = var.base >> 4;
var.base = var.base & 0xffff0;
var.limit = 0xffff;
var.g = 0;
var.db = 0;
var.present = 1;
var.s = 1;
var.l = 0;
var.unusable = 0;
var.type = 0x3;
var.avl = 0;
if (save->base & 0xf)
printk_once(KERN_WARNING "kvm: segment base is not "
"paragraph aligned when entering "
"protected mode (seg=%d)", seg);
}
vmcs_write16(sf->selector, var.selector);
vmcs_writel(sf->base, var.base);
vmcs_write32(sf->limit, var.limit);
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
}
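/*
 * Emulate real mode via vm86: save the protected-mode segment state and
 * RFLAGS, point TR at the TSS established with KVM_SET_TSS_ADDR, force the
 * VM and IOPL bits in RFLAGS, set CR4.VME, and rewrite every segment so it
 * is a valid vm86 segment.
 */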
static void enter_rmode(struct kvm_vcpu *vcpu)
{
unsigned long flags;
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_vmx *kvm_vmx = to_kvm_vmx(vcpu->kvm);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
vmx->rmode.vm86_active = 1;
/*
* Very old userspace does not call KVM_SET_TSS_ADDR before entering the
* vcpu. Warn the user that an update is overdue.
*/
if (!kvm_vmx->tss_addr)
printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
"called before entering vcpu\n");
vmx_segment_cache_clear(vmx);
vmcs_writel(GUEST_TR_BASE, kvm_vmx->tss_addr);
vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
flags = vmcs_readl(GUEST_RFLAGS);
vmx->rmode.save_rflags = flags;
flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
vmcs_writel(GUEST_RFLAGS, flags);
vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
update_exception_bitmap(vcpu);
fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
kvm_mmu_reset_context(vcpu);
}
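/*
 * Propagate a new guest EFER: toggle the "IA-32e mode guest" VM-entry
 * control to match EFER.LMA and refresh the shared-MSR copy of EFER, with
 * LME stripped while long mode is inactive.
 */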
void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
if (!msr)
return;
vcpu->arch.efer = efer;
if (efer & EFER_LMA) {
vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
msr->data = efer;
} else {
vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
msr->data = efer & ~EFER_LME;
}
setup_msrs(vmx);
}
#ifdef CONFIG_X86_64
static void enter_lmode(struct kvm_vcpu *vcpu)
{
u32 guest_tr_ar;
vmx_segment_cache_clear(to_vmx(vcpu));
guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
pr_debug_ratelimited("%s: tss fixup for long mode.\n",
__func__);
vmcs_write32(GUEST_TR_AR_BYTES,
(guest_tr_ar & ~VMX_AR_TYPE_MASK)
| VMX_AR_TYPE_BUSY_64_TSS);
}
vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
}
static void exit_lmode(struct kvm_vcpu *vcpu)
{
vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
}
#endif
static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{
int vpid = to_vmx(vcpu)->vpid;
if (!vpid_sync_vcpu_addr(vpid, addr))
vpid_sync_context(vpid);
/*
* If VPIDs are not supported or enabled, then the above is a no-op.
* But we don't really need a TLB flush in that case anyway, because
* each VM entry/exit includes an implicit flush when VPID is 0.
*/
}
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
{
ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
}
static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
{
if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
}
static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
}
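/*
 * With EPT and PAE paging, the four page-directory-pointer entries are
 * loaded from and saved to the GUEST_PDPTR fields of the VMCS, so they
 * must be copied between the VMCS and mmu->pdptrs whenever they are
 * dirtied or read.
 */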
static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
if (!test_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_dirty))
return;
if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
}
}
void ept_save_pdptrs(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
}
__set_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_avail);
__set_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_dirty);
}
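/*
 * With EPT enabled, a guest CR0.PG transition changes how CR3 is handled:
 * once the guest turns paging off, CR3 loads and stores must again cause
 * VM exits so that KVM can track the guest's CR3.
 */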
static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
unsigned long cr0,
struct kvm_vcpu *vcpu)
{
if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
vmx_decache_cr3(vcpu);
if (!(cr0 & X86_CR0_PG)) {
/* From paging/starting to nonpaging */
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
(CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING));
vcpu->arch.cr0 = cr0;
vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
} else if (!is_paging(vcpu)) {