blob: d48ec60ea421a8211271195db0193bc42a9a7532 [file] [log] [blame]
/*
* Kernel-based Virtual Machine driver for Linux
*
* This module enables machines with Intel VT-x extensions to run virtual
* machines without emulation or binary translation.
*
* Copyright (C) 2006 Qumranet, Inc.
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Avi Kivity <avi@qumranet.com>
* Yaniv Kamay <yaniv@qumranet.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
*
*/
#include "irq.h"
#include "mmu.h"
#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/moduleparam.h>
#include <linux/ftrace_event.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include "kvm_cache_regs.h"
#include "x86.h"
#include <asm/io.h>
#include <asm/desc.h>
#include <asm/vmx.h>
#include <asm/virtext.h>
#include <asm/mce.h>
#include <asm/i387.h>
#include <asm/xcr.h>
#include "trace.h"
#define __ex(x) __kvm_handle_fault_on_reboot(x)
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
static int __read_mostly bypass_guest_pf = 1;
module_param(bypass_guest_pf, bool, S_IRUGO);
static int __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);
static int __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
static int __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, S_IRUGO);
static int __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
enable_unrestricted_guest, bool, S_IRUGO);
static int __read_mostly emulate_invalid_guest_state = 0;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);
static int __read_mostly vmm_exclusive = 1;
module_param(vmm_exclusive, bool, S_IRUGO);
static int __read_mostly yield_on_hlt = 1;
module_param(yield_on_hlt, bool, S_IRUGO);
#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
#define KVM_GUEST_CR0_MASK \
(KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
(X86_CR0_WP | X86_CR0_NE)
#define KVM_VM_CR0_ALWAYS_ON \
(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
#define KVM_CR4_GUEST_OWNED_BITS \
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
| X86_CR4_OSXMMEXCPT)
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* ple_gap: upper bound on the amount of time between two successive
* executions of PAUSE in a loop. Also indicate if ple enabled.
* According to test, this time is usually smaller than 128 cycles.
* ple_window: upper bound on the amount of time a guest is allowed to execute
* in a PAUSE loop. Tests indicate that most spinlocks are held for
* less than 2^12 cycles
* Time is measured based on a counter that runs at the same rate as the TSC,
* refer SDM volume 3b section 21.6.13 & 22.1.3.
*/
#define KVM_VMX_DEFAULT_PLE_GAP 128
#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
module_param(ple_gap, int, S_IRUGO);
static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, int, S_IRUGO);
#define NR_AUTOLOAD_MSRS 1
struct vmcs {
u32 revision_id;
u32 abort;
char data[0];
};
struct shared_msr_entry {
unsigned index;
u64 data;
u64 mask;
};
struct vcpu_vmx {
struct kvm_vcpu vcpu;
struct list_head local_vcpus_link;
unsigned long host_rsp;
int launched;
u8 fail;
u8 cpl;
bool nmi_known_unmasked;
u32 exit_intr_info;
u32 idt_vectoring_info;
ulong rflags;
struct shared_msr_entry *guest_msrs;
int nmsrs;
int save_nmsrs;
#ifdef CONFIG_X86_64
u64 msr_host_kernel_gs_base;
u64 msr_guest_kernel_gs_base;
#endif
struct vmcs *vmcs;
struct msr_autoload {
unsigned nr;
struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
} msr_autoload;
struct {
int loaded;
u16 fs_sel, gs_sel, ldt_sel;
int gs_ldt_reload_needed;
int fs_reload_needed;
} host_state;
struct {
int vm86_active;
ulong save_rflags;
struct kvm_save_segment {
u16 selector;
unsigned long base;
u32 limit;
u32 ar;
} tr, es, ds, fs, gs;
} rmode;
struct {
u32 bitmask; /* 4 bits per segment (1 bit per field) */
struct kvm_save_segment seg[8];
} segment_cache;
int vpid;
bool emulation_required;
/* Support for vnmi-less CPUs */
int soft_vnmi_blocked;
ktime_t entry_time;
s64 vnmi_blocked_time;
u32 exit_reason;
bool rdtscp_enabled;
};
enum segment_cache_field {
SEG_FIELD_SEL = 0,
SEG_FIELD_BASE = 1,
SEG_FIELD_LIMIT = 2,
SEG_FIELD_AR = 3,
SEG_FIELD_NR = 4
};
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
{
return container_of(vcpu, struct vcpu_vmx, vcpu);
}
static u64 construct_eptp(unsigned long root_hpa);
static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
static unsigned long *vmx_io_bitmap_a;
static unsigned long *vmx_io_bitmap_b;
static unsigned long *vmx_msr_bitmap_legacy;
static unsigned long *vmx_msr_bitmap_longmode;
static bool cpu_has_load_ia32_efer;
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
static struct vmcs_config {
int size;
int order;
u32 revision_id;
u32 pin_based_exec_ctrl;
u32 cpu_based_exec_ctrl;
u32 cpu_based_2nd_exec_ctrl;
u32 vmexit_ctrl;
u32 vmentry_ctrl;
} vmcs_config;
static struct vmx_capability {
u32 ept;
u32 vpid;
} vmx_capability;
#define VMX_SEGMENT_FIELD(seg) \
[VCPU_SREG_##seg] = { \
.selector = GUEST_##seg##_SELECTOR, \
.base = GUEST_##seg##_BASE, \
.limit = GUEST_##seg##_LIMIT, \
.ar_bytes = GUEST_##seg##_AR_BYTES, \
}
static struct kvm_vmx_segment_field {
unsigned selector;
unsigned base;
unsigned limit;
unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
VMX_SEGMENT_FIELD(CS),
VMX_SEGMENT_FIELD(DS),
VMX_SEGMENT_FIELD(ES),
VMX_SEGMENT_FIELD(FS),
VMX_SEGMENT_FIELD(GS),
VMX_SEGMENT_FIELD(SS),
VMX_SEGMENT_FIELD(TR),
VMX_SEGMENT_FIELD(LDTR),
};
static u64 host_efer;
static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
/*
* Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
* away by decrementing the array size.
*/
static const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};
#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
static inline bool is_page_fault(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
INTR_INFO_VALID_MASK)) ==
(INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
}
static inline bool is_no_device(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
INTR_INFO_VALID_MASK)) ==
(INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
}
static inline bool is_invalid_opcode(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
INTR_INFO_VALID_MASK)) ==
(INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
}
static inline bool is_external_interrupt(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
}
static inline bool is_machine_check(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
INTR_INFO_VALID_MASK)) ==
(INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
}
static inline bool cpu_has_vmx_msr_bitmap(void)
{
return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
}
static inline bool cpu_has_vmx_tpr_shadow(void)
{
return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
}
static inline bool vm_need_tpr_shadow(struct kvm *kvm)
{
return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
}
static inline bool cpu_has_secondary_exec_ctrls(void)
{
return vmcs_config.cpu_based_exec_ctrl &
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
}
static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
}
static inline bool cpu_has_vmx_flexpriority(void)
{
return cpu_has_vmx_tpr_shadow() &&
cpu_has_vmx_virtualize_apic_accesses();
}
static inline bool cpu_has_vmx_ept_execute_only(void)
{
return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
}
static inline bool cpu_has_vmx_eptp_uncacheable(void)
{
return vmx_capability.ept & VMX_EPTP_UC_BIT;
}
static inline bool cpu_has_vmx_eptp_writeback(void)
{
return vmx_capability.ept & VMX_EPTP_WB_BIT;
}
static inline bool cpu_has_vmx_ept_2m_page(void)
{
return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
}
static inline bool cpu_has_vmx_ept_1g_page(void)
{
return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
}
static inline bool cpu_has_vmx_ept_4levels(void)
{
return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
}
static inline bool cpu_has_vmx_invept_individual_addr(void)
{
return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
}
static inline bool cpu_has_vmx_invept_context(void)
{
return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
}
static inline bool cpu_has_vmx_invept_global(void)
{
return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
}
static inline bool cpu_has_vmx_invvpid_single(void)
{
return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
}
static inline bool cpu_has_vmx_invvpid_global(void)
{
return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
}
static inline bool cpu_has_vmx_ept(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENABLE_EPT;
}
static inline bool cpu_has_vmx_unrestricted_guest(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_UNRESTRICTED_GUEST;
}
static inline bool cpu_has_vmx_ple(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_PAUSE_LOOP_EXITING;
}
static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
{
return flexpriority_enabled && irqchip_in_kernel(kvm);
}
static inline bool cpu_has_vmx_vpid(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENABLE_VPID;
}
static inline bool cpu_has_vmx_rdtscp(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_RDTSCP;
}
static inline bool cpu_has_virtual_nmis(void)
{
return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
}
static inline bool cpu_has_vmx_wbinvd_exit(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_WBINVD_EXITING;
}
static inline bool report_flexpriority(void)
{
return flexpriority_enabled;
}
static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
int i;
for (i = 0; i < vmx->nmsrs; ++i)
if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
return i;
return -1;
}
static inline void __invvpid(int ext, u16 vpid, gva_t gva)
{
struct {
u64 vpid : 16;
u64 rsvd : 48;
u64 gva;
} operand = { vpid, 0, gva };
asm volatile (__ex(ASM_VMX_INVVPID)
/* CF==1 or ZF==1 --> rc = -1 */
"; ja 1f ; ud2 ; 1:"
: : "a"(&operand), "c"(ext) : "cc", "memory");
}
static inline void __invept(int ext, u64 eptp, gpa_t gpa)
{
struct {
u64 eptp, gpa;
} operand = {eptp, gpa};
asm volatile (__ex(ASM_VMX_INVEPT)
/* CF==1 or ZF==1 --> rc = -1 */
"; ja 1f ; ud2 ; 1:\n"
: : "a" (&operand), "c" (ext) : "cc", "memory");
}
static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
{
int i;
i = __find_msr_index(vmx, msr);
if (i >= 0)
return &vmx->guest_msrs[i];
return NULL;
}
static void vmcs_clear(struct vmcs *vmcs)
{
u64 phys_addr = __pa(vmcs);
u8 error;
asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
: "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
: "cc", "memory");
if (error)
printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
vmcs, phys_addr);
}
static void vmcs_load(struct vmcs *vmcs)
{
u64 phys_addr = __pa(vmcs);
u8 error;
asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
: "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
: "cc", "memory");
if (error)
printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
vmcs, phys_addr);
}
static void __vcpu_clear(void *arg)
{
struct vcpu_vmx *vmx = arg;
int cpu = raw_smp_processor_id();
if (vmx->vcpu.cpu == cpu)
vmcs_clear(vmx->vmcs);
if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
per_cpu(current_vmcs, cpu) = NULL;
list_del(&vmx->local_vcpus_link);
vmx->vcpu.cpu = -1;
vmx->launched = 0;
}
static void vcpu_clear(struct vcpu_vmx *vmx)
{
if (vmx->vcpu.cpu == -1)
return;
smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
}
static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
{
if (vmx->vpid == 0)
return;
if (cpu_has_vmx_invvpid_single())
__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
}
static inline void vpid_sync_vcpu_global(void)
{
if (cpu_has_vmx_invvpid_global())
__invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
}
static inline void vpid_sync_context(struct vcpu_vmx *vmx)
{
if (cpu_has_vmx_invvpid_single())
vpid_sync_vcpu_single(vmx);
else
vpid_sync_vcpu_global();
}
static inline void ept_sync_global(void)
{
if (cpu_has_vmx_invept_global())
__invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
}
static inline void ept_sync_context(u64 eptp)
{
if (enable_ept) {
if (cpu_has_vmx_invept_context())
__invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
else
ept_sync_global();
}
}
static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
{
if (enable_ept) {
if (cpu_has_vmx_invept_individual_addr())
__invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
eptp, gpa);
else
ept_sync_context(eptp);
}
}
static unsigned long vmcs_readl(unsigned long field)
{
unsigned long value = 0;
asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
: "+a"(value) : "d"(field) : "cc");
return value;
}
static u16 vmcs_read16(unsigned long field)
{
return vmcs_readl(field);
}
static u32 vmcs_read32(unsigned long field)
{
return vmcs_readl(field);
}
static u64 vmcs_read64(unsigned long field)
{
#ifdef CONFIG_X86_64
return vmcs_readl(field);
#else
return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
#endif
}
static noinline void vmwrite_error(unsigned long field, unsigned long value)
{
printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
dump_stack();
}
static void vmcs_writel(unsigned long field, unsigned long value)
{
u8 error;
asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
: "=q"(error) : "a"(value), "d"(field) : "cc");
if (unlikely(error))
vmwrite_error(field, value);
}
static void vmcs_write16(unsigned long field, u16 value)
{
vmcs_writel(field, value);
}
static void vmcs_write32(unsigned long field, u32 value)
{
vmcs_writel(field, value);
}
static void vmcs_write64(unsigned long field, u64 value)
{
vmcs_writel(field, value);
#ifndef CONFIG_X86_64
asm volatile ("");
vmcs_writel(field+1, value >> 32);
#endif
}
static void vmcs_clear_bits(unsigned long field, u32 mask)
{
vmcs_writel(field, vmcs_readl(field) & ~mask);
}
static void vmcs_set_bits(unsigned long field, u32 mask)
{
vmcs_writel(field, vmcs_readl(field) | mask);
}
static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
{
vmx->segment_cache.bitmask = 0;
}
static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
unsigned field)
{
bool ret;
u32 mask = 1 << (seg * SEG_FIELD_NR + field);
if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
vmx->segment_cache.bitmask = 0;
}
ret = vmx->segment_cache.bitmask & mask;
vmx->segment_cache.bitmask |= mask;
return ret;
}
static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
{
u16 *p = &vmx->segment_cache.seg[seg].selector;
if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
*p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
return *p;
}
static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
{
ulong *p = &vmx->segment_cache.seg[seg].base;
if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
*p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
return *p;
}
static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
{
u32 *p = &vmx->segment_cache.seg[seg].limit;
if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
*p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
return *p;
}
static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
{
u32 *p = &vmx->segment_cache.seg[seg].ar;
if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
*p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
return *p;
}
static void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
u32 eb;
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
(1u << NM_VECTOR) | (1u << DB_VECTOR);
if ((vcpu->guest_debug &
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
eb |= 1u << BP_VECTOR;
if (to_vmx(vcpu)->rmode.vm86_active)
eb = ~0;
if (enable_ept)
eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
if (vcpu->fpu_active)
eb &= ~(1u << NM_VECTOR);
vmcs_write32(EXCEPTION_BITMAP, eb);
}
static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
{
unsigned i;
struct msr_autoload *m = &vmx->msr_autoload;
if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
return;
}
for (i = 0; i < m->nr; ++i)
if (m->guest[i].index == msr)
break;
if (i == m->nr)
return;
--m->nr;
m->guest[i] = m->guest[m->nr];
m->host[i] = m->host[m->nr];
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
}
static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
u64 guest_val, u64 host_val)
{
unsigned i;
struct msr_autoload *m = &vmx->msr_autoload;
if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
vmcs_write64(GUEST_IA32_EFER, guest_val);
vmcs_write64(HOST_IA32_EFER, host_val);
vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
return;
}
for (i = 0; i < m->nr; ++i)
if (m->guest[i].index == msr)
break;
if (i == m->nr) {
++m->nr;
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
}
m->guest[i].index = msr;
m->guest[i].value = guest_val;
m->host[i].index = msr;
m->host[i].value = host_val;
}
static void reload_tss(void)
{
/*
* VT restores TR but not its size. Useless.
*/
struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
struct desc_struct *descs;
descs = (void *)gdt->address;
descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
load_TR_desc();
}
static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
{
u64 guest_efer;
u64 ignore_bits;
guest_efer = vmx->vcpu.arch.efer;
/*
* NX is emulated; LMA and LME handled by hardware; SCE meaninless
* outside long mode
*/
ignore_bits = EFER_NX | EFER_SCE;
#ifdef CONFIG_X86_64
ignore_bits |= EFER_LMA | EFER_LME;
/* SCE is meaningful only in long mode on Intel */
if (guest_efer & EFER_LMA)
ignore_bits &= ~(u64)EFER_SCE;
#endif
guest_efer &= ~ignore_bits;
guest_efer |= host_efer & ignore_bits;
vmx->guest_msrs[efer_offset].data = guest_efer;
vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
clear_atomic_switch_msr(vmx, MSR_EFER);
/* On ept, can't emulate nx, and must switch nx atomically */
if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) {
guest_efer = vmx->vcpu.arch.efer;
if (!(guest_efer & EFER_LMA))
guest_efer &= ~EFER_LME;
add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
return false;
}
return true;
}
static unsigned long segment_base(u16 selector)
{
struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
struct desc_struct *d;
unsigned long table_base;
unsigned long v;
if (!(selector & ~3))
return 0;
table_base = gdt->address;
if (selector & 4) { /* from ldt */
u16 ldt_selector = kvm_read_ldt();
if (!(ldt_selector & ~3))
return 0;
table_base = segment_base(ldt_selector);
}
d = (struct desc_struct *)(table_base + (selector & ~7));
v = get_desc_base(d);
#ifdef CONFIG_X86_64
if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
#endif
return v;
}
static inline unsigned long kvm_read_tr_base(void)
{
u16 tr;
asm("str %0" : "=g"(tr));
return segment_base(tr);
}
static void vmx_save_host_state(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int i;
if (vmx->host_state.loaded)
return;
vmx->host_state.loaded = 1;
/*
* Set host fs and gs selectors. Unfortunately, 22.2.3 does not
* allow segment selectors with cpl > 0 or ti == 1.
*/
vmx->host_state.ldt_sel = kvm_read_ldt();
vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
savesegment(fs, vmx->host_state.fs_sel);
if (!(vmx->host_state.fs_sel & 7)) {
vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
vmx->host_state.fs_reload_needed = 0;
} else {
vmcs_write16(HOST_FS_SELECTOR, 0);
vmx->host_state.fs_reload_needed = 1;
}
savesegment(gs, vmx->host_state.gs_sel);
if (!(vmx->host_state.gs_sel & 7))
vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
else {
vmcs_write16(HOST_GS_SELECTOR, 0);
vmx->host_state.gs_ldt_reload_needed = 1;
}
#ifdef CONFIG_X86_64
vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
#else
vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
#endif
#ifdef CONFIG_X86_64
rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
if (is_long_mode(&vmx->vcpu))
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
for (i = 0; i < vmx->save_nmsrs; ++i)
kvm_set_shared_msr(vmx->guest_msrs[i].index,
vmx->guest_msrs[i].data,
vmx->guest_msrs[i].mask);
}
static void __vmx_load_host_state(struct vcpu_vmx *vmx)
{
if (!vmx->host_state.loaded)
return;
++vmx->vcpu.stat.host_state_reload;
vmx->host_state.loaded = 0;
#ifdef CONFIG_X86_64
if (is_long_mode(&vmx->vcpu))
rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
if (vmx->host_state.gs_ldt_reload_needed) {
kvm_load_ldt(vmx->host_state.ldt_sel);
#ifdef CONFIG_X86_64
load_gs_index(vmx->host_state.gs_sel);
#else
loadsegment(gs, vmx->host_state.gs_sel);
#endif
}
if (vmx->host_state.fs_reload_needed)
loadsegment(fs, vmx->host_state.fs_sel);
reload_tss();
#ifdef CONFIG_X86_64
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
if (current_thread_info()->status & TS_USEDFPU)
clts();
load_gdt(&__get_cpu_var(host_gdt));
}
static void vmx_load_host_state(struct vcpu_vmx *vmx)
{
preempt_disable();
__vmx_load_host_state(vmx);
preempt_enable();
}
/*
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
*/
static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
if (!vmm_exclusive)
kvm_cpu_vmxon(phys_addr);
else if (vcpu->cpu != cpu)
vcpu_clear(vmx);
if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
per_cpu(current_vmcs, cpu) = vmx->vmcs;
vmcs_load(vmx->vmcs);
}
if (vcpu->cpu != cpu) {
struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
unsigned long sysenter_esp;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
local_irq_disable();
list_add(&vmx->local_vcpus_link,
&per_cpu(vcpus_on_cpu, cpu));
local_irq_enable();
/*
* Linux uses per-cpu TSS and GDT, so set these when switching
* processors.
*/
vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */
rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
}
}
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
__vmx_load_host_state(to_vmx(vcpu));
if (!vmm_exclusive) {
__vcpu_clear(to_vmx(vcpu));
kvm_cpu_vmxoff();
}
}
static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
{
ulong cr0;
if (vcpu->fpu_active)
return;
vcpu->fpu_active = 1;
cr0 = vmcs_readl(GUEST_CR0);
cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
vmcs_writel(GUEST_CR0, cr0);
update_exception_bitmap(vcpu);
vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
}
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
{
vmx_decache_cr0_guest_bits(vcpu);
vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
update_exception_bitmap(vcpu);
vcpu->arch.cr0_guest_owned_bits = 0;
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
}
static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
unsigned long rflags, save_rflags;
if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
__set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
rflags = vmcs_readl(GUEST_RFLAGS);
if (to_vmx(vcpu)->rmode.vm86_active) {
rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
save_rflags = to_vmx(vcpu)->rmode.save_rflags;
rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
}
to_vmx(vcpu)->rflags = rflags;
}
return to_vmx(vcpu)->rflags;
}
static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
__set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
__clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
to_vmx(vcpu)->rflags = rflags;
if (to_vmx(vcpu)->rmode.vm86_active) {
to_vmx(vcpu)->rmode.save_rflags = rflags;
rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
}
vmcs_writel(GUEST_RFLAGS, rflags);
}
static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
int ret = 0;
if (interruptibility & GUEST_INTR_STATE_STI)
ret |= KVM_X86_SHADOW_INT_STI;
if (interruptibility & GUEST_INTR_STATE_MOV_SS)
ret |= KVM_X86_SHADOW_INT_MOV_SS;
return ret & mask;
}
static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
u32 interruptibility = interruptibility_old;
interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
if (mask & KVM_X86_SHADOW_INT_MOV_SS)
interruptibility |= GUEST_INTR_STATE_MOV_SS;
else if (mask & KVM_X86_SHADOW_INT_STI)
interruptibility |= GUEST_INTR_STATE_STI;
if ((interruptibility != interruptibility_old))
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
}
static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
unsigned long rip;
rip = kvm_rip_read(vcpu);
rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
kvm_rip_write(vcpu, rip);
/* skipping an emulated instruction also counts */
vmx_set_interrupt_shadow(vcpu, 0);
}
static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
{
/* Ensure that we clear the HLT state in the VMCS. We don't need to
* explicitly skip the instruction because if the HLT state is set, then
* the instruction is already executing and RIP has already been
* advanced. */
if (!yield_on_hlt &&
vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}
static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code,
bool reinject)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 intr_info = nr | INTR_INFO_VALID_MASK;
if (has_error_code) {
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
}
if (vmx->rmode.vm86_active) {
int inc_eip = 0;
if (kvm_exception_is_soft(nr))
inc_eip = vcpu->arch.event_exit_inst_len;
if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return;
}
if (kvm_exception_is_soft(nr)) {
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
vmx->vcpu.arch.event_exit_inst_len);
intr_info |= INTR_TYPE_SOFT_EXCEPTION;
} else
intr_info |= INTR_TYPE_HARD_EXCEPTION;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
vmx_clear_hlt(vcpu);
}
static bool vmx_rdtscp_supported(void)
{
return cpu_has_vmx_rdtscp();
}
/*
* Swap MSR entry in host/guest MSR entry array.
*/
static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
{
struct shared_msr_entry tmp;
tmp = vmx->guest_msrs[to];
vmx->guest_msrs[to] = vmx->guest_msrs[from];
vmx->guest_msrs[from] = tmp;
}
/*
* Set up the vmcs to automatically save and restore system
* msrs. Don't touch the 64-bit msrs if the guest is in legacy
* mode, as fiddling with msrs is very expensive.
*/
static void setup_msrs(struct vcpu_vmx *vmx)
{
int save_nmsrs, index;
unsigned long *msr_bitmap;
vmx_load_host_state(vmx);
save_nmsrs = 0;
#ifdef CONFIG_X86_64
if (is_long_mode(&vmx->vcpu)) {
index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
if (index >= 0)
move_msr_up(vmx, index, save_nmsrs++);
index = __find_msr_index(vmx, MSR_LSTAR);
if (index >= 0)
move_msr_up(vmx, index, save_nmsrs++);
index = __find_msr_index(vmx, MSR_CSTAR);
if (index >= 0)
move_msr_up(vmx, index, save_nmsrs++);
index = __find_msr_index(vmx, MSR_TSC_AUX);
if (index >= 0 && vmx->rdtscp_enabled)
move_msr_up(vmx, index, save_nmsrs++);
/*
* MSR_STAR is only needed on long mode guests, and only
* if efer.sce is enabled.
*/
index = __find_msr_index(vmx, MSR_STAR);
if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
move_msr_up(vmx, index, save_nmsrs++);
}
#endif
index = __find_msr_index(vmx, MSR_EFER);
if (index >= 0 && update_transition_efer(vmx, index))
move_msr_up(vmx, index, save_nmsrs++);
vmx->save_nmsrs = save_nmsrs;
if (cpu_has_vmx_msr_bitmap()) {
if (is_long_mode(&vmx->vcpu))
msr_bitmap = vmx_msr_bitmap_longmode;
else
msr_bitmap = vmx_msr_bitmap_legacy;
vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
}
}
/*
* reads and returns guest's timestamp counter "register"
* guest_tsc = host_tsc + tsc_offset -- 21.3
*/
static u64 guest_read_tsc(void)
{
u64 host_tsc, tsc_offset;
rdtscll(host_tsc);
tsc_offset = vmcs_read64(TSC_OFFSET);
return host_tsc + tsc_offset;
}
/*
* Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
* ioctl. In this case the call-back should update internal vmx state to make
* the changes effective.
*/
static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
{
/* Nothing to do here */
}
/*
* writes 'offset' into guest's timestamp counter offset register
*/
static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
vmcs_write64(TSC_OFFSET, offset);
}
static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
{
u64 offset = vmcs_read64(TSC_OFFSET);
vmcs_write64(TSC_OFFSET, offset + adjustment);
}
static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
{
return target_tsc - native_read_tsc();
}
/*
* Reads an msr value (of 'msr_index') into 'pdata'.
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
u64 data;
struct shared_msr_entry *msr;
if (!pdata) {
printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
return -EINVAL;
}
switch (msr_index) {
#ifdef CONFIG_X86_64
case MSR_FS_BASE:
data = vmcs_readl(GUEST_FS_BASE);
break;
case MSR_GS_BASE:
data = vmcs_readl(GUEST_GS_BASE);
break;
case MSR_KERNEL_GS_BASE:
vmx_load_host_state(to_vmx(vcpu));
data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
break;
#endif
case MSR_EFER:
return kvm_get_msr_common(vcpu, msr_index, pdata);
case MSR_IA32_TSC:
data = guest_read_tsc();
break;
case MSR_IA32_SYSENTER_CS:
data = vmcs_read32(GUEST_SYSENTER_CS);
break;
case MSR_IA32_SYSENTER_EIP:
data = vmcs_readl(GUEST_SYSENTER_EIP);
break;
case MSR_IA32_SYSENTER_ESP:
data = vmcs_readl(GUEST_SYSENTER_ESP);
break;
case MSR_TSC_AUX:
if (!to_vmx(vcpu)->rdtscp_enabled)
return 1;
/* Otherwise falls through */
default:
vmx_load_host_state(to_vmx(vcpu));
msr = find_msr_entry(to_vmx(vcpu), msr_index);
if (msr) {
vmx_load_host_state(to_vmx(vcpu));
data = msr->data;
break;
}
return kvm_get_msr_common(vcpu, msr_index, pdata);
}
*pdata = data;
return 0;
}
/*
* Writes msr value into into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct shared_msr_entry *msr;
int ret = 0;
switch (msr_index) {
case MSR_EFER:
vmx_load_host_state(vmx);
ret = kvm_set_msr_common(vcpu, msr_index, data);
break;
#ifdef CONFIG_X86_64
case MSR_FS_BASE:
vmx_segment_cache_clear(vmx);
vmcs_writel(GUEST_FS_BASE, data);
break;
case MSR_GS_BASE:
vmx_segment_cache_clear(vmx);
vmcs_writel(GUEST_GS_BASE, data);
break;
case MSR_KERNEL_GS_BASE:
vmx_load_host_state(vmx);
vmx->msr_guest_kernel_gs_base = data;
break;
#endif
case MSR_IA32_SYSENTER_CS:
vmcs_write32(GUEST_SYSENTER_CS, data);
break;
case MSR_IA32_SYSENTER_EIP:
vmcs_writel(GUEST_SYSENTER_EIP, data);
break;
case MSR_IA32_SYSENTER_ESP:
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, data);
break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
vmcs_write64(GUEST_IA32_PAT, data);
vcpu->arch.pat = data;
break;
}
ret = kvm_set_msr_common(vcpu, msr_index, data);
break;
case MSR_TSC_AUX:
if (!vmx->rdtscp_enabled)
return 1;
/* Check reserved bit, higher 32 bits should be zero */
if ((data >> 32) != 0)
return 1;
/* Otherwise falls through */
default:
msr = find_msr_entry(vmx, msr_index);
if (msr) {
vmx_load_host_state(vmx);
msr->data = data;
break;
}
ret = kvm_set_msr_common(vcpu, msr_index, data);
}
return ret;
}
static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
switch (reg) {
case VCPU_REGS_RSP:
vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
break;
case VCPU_REGS_RIP:
vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
break;
case VCPU_EXREG_PDPTR:
if (enable_ept)
ept_save_pdptrs(vcpu);
break;
default:
break;
}
}
static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
{
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
else
vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
update_exception_bitmap(vcpu);
}
static __init int cpu_has_kvm_support(void)
{
return cpu_has_vmx();
}
static __init int vmx_disabled_by_bios(void)
{
u64 msr;
rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
if (msr & FEATURE_CONTROL_LOCKED) {
/* launched w/ TXT and VMX disabled */
if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
&& tboot_enabled())
return 1;
/* launched w/o TXT and VMX only enabled w/ TXT */
if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
&& (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
&& !tboot_enabled()) {
printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
"activate TXT before enabling KVM\n");
return 1;
}
/* launched w/o TXT and VMX disabled */
if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
&& !tboot_enabled())
return 1;
}
return 0;
}
static void kvm_cpu_vmxon(u64 addr)
{
asm volatile (ASM_VMX_VMXON_RAX
: : "a"(&addr), "m"(addr)
: "memory", "cc");
}
static int hardware_enable(void *garbage)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
u64 old, test_bits;
if (read_cr4() & X86_CR4_VMXE)
return -EBUSY;
INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
test_bits = FEATURE_CONTROL_LOCKED;
test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
if (tboot_enabled())
test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
if ((old & test_bits) != test_bits) {
/* enable and lock */
wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
}
write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
if (vmm_exclusive) {
kvm_cpu_vmxon(phys_addr);
ept_sync_global();
}
store_gdt(&__get_cpu_var(host_gdt));
return 0;
}
static void vmclear_local_vcpus(void)
{
int cpu = raw_smp_processor_id();
struct vcpu_vmx *vmx, *n;
list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
local_vcpus_link)
__vcpu_clear(vmx);
}
/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
* tricks.
*/
static void kvm_cpu_vmxoff(void)
{
asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
}
static void hardware_disable(void *garbage)
{
if (vmm_exclusive) {
vmclear_local_vcpus();
kvm_cpu_vmxoff();
}
write_cr4(read_cr4() & ~X86_CR4_VMXE);
}
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
u32 msr, u32 *result)
{
u32 vmx_msr_low, vmx_msr_high;
u32 ctl = ctl_min | ctl_opt;
rdmsr(msr, vmx_msr_low, vmx_msr_high);
ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
/* Ensure minimum (required) set of control bits are supported. */
if (ctl_min & ~ctl)
return -EIO;
*result = ctl;
return 0;
}
static __init bool allow_1_setting(u32 msr, u32 ctl)
{
u32 vmx_msr_low, vmx_msr_high;
rdmsr(msr, vmx_msr_low, vmx_msr_high);
return vmx_msr_high & ctl;
}
static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
{
u32 vmx_msr_low, vmx_msr_high;
u32 min, opt, min2, opt2;
u32 _pin_based_exec_control = 0;
u32 _cpu_based_exec_control = 0;
u32 _cpu_based_2nd_exec_control = 0;
u32 _vmexit_control = 0;
u32 _vmentry_control = 0;
min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
opt = PIN_BASED_VIRTUAL_NMIS;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
&_pin_based_exec_control) < 0)
return -EIO;
min =
#ifdef CONFIG_X86_64
CPU_BASED_CR8_LOAD_EXITING |
CPU_BASED_CR8_STORE_EXITING |
#endif
CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_USE_IO_BITMAPS |
CPU_BASED_MOV_DR_EXITING |
CPU_BASED_USE_TSC_OFFSETING |
CPU_BASED_MWAIT_EXITING |
CPU_BASED_MONITOR_EXITING |
CPU_BASED_INVLPG_EXITING;
if (yield_on_hlt)
min |= CPU_BASED_HLT_EXITING;
opt = CPU_BASED_TPR_SHADOW |
CPU_BASED_USE_MSR_BITMAPS |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
&_cpu_based_exec_control) < 0)
return -EIO;
#ifdef CONFIG_X86_64
if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
_cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
~CPU_BASED_CR8_STORE_EXITING;
#endif
if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
min2 = 0;
opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_ENABLE_VPID |
SECONDARY_EXEC_ENABLE_EPT |
SECONDARY_EXEC_UNRESTRICTED_GUEST |
SECONDARY_EXEC_PAUSE_LOOP_EXITING |
SECONDARY_EXEC_RDTSCP;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
return -EIO;
}
#ifndef CONFIG_X86_64
if (!(_cpu_based_2nd_exec_control &
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
#endif
if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
enabled */
_cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_INVLPG_EXITING);
rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
vmx_capability.ept, vmx_capability.vpid);
}
min = 0;
#ifdef CONFIG_X86_64
min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
#endif
opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
&_vmexit_control) < 0)
return -EIO;
min = 0;
opt = VM_ENTRY_LOAD_IA32_PAT;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
&_vmentry_control) < 0)
return -EIO;
rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
return -EIO;
#ifdef CONFIG_X86_64
/* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
if (vmx_msr_high & (1u<<16))
return -EIO;
#endif
/* Require Write-Back (WB) memory type for VMCS accesses. */
if (((vmx_msr_high >> 18) & 15) != 6)
return -EIO;
vmcs_conf->size = vmx_msr_high & 0x1fff;
vmcs_conf->order = get_order(vmcs_config.size);
vmcs_conf->revision_id = vmx_msr_low;
vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
vmcs_conf->vmexit_ctrl = _vmexit_control;
vmcs_conf->vmentry_ctrl = _vmentry_control;
cpu_has_load_ia32_efer =
allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
VM_ENTRY_LOAD_IA32_EFER)
&& allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
VM_EXIT_LOAD_IA32_EFER);
return 0;
}
static struct vmcs *alloc_vmcs_cpu(int cpu)
{
int node = cpu_to_node(cpu);
struct page *pages;
struct vmcs *vmcs;
pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
if (!pages)
return NULL;
vmcs = page_address(pages);
memset(vmcs, 0, vmcs_config.size);
vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
return vmcs;
}
static struct vmcs *alloc_vmcs(void)
{
return alloc_vmcs_cpu(raw_smp_processor_id());
}
static void free_vmcs(struct vmcs *vmcs)
{
free_pages((unsigned long)vmcs, vmcs_config.order);
}
static void free_kvm_area(void)
{
int cpu;
for_each_possible_cpu(cpu) {
free_vmcs(per_cpu(vmxarea, cpu));
per_cpu(vmxarea, cpu) = NULL;
}
}
static __init int alloc_kvm_area(void)
{
int cpu;
for_each_possible_cpu(cpu) {
struct vmcs *vmcs;
vmcs = alloc_vmcs_cpu(cpu);
if (!vmcs) {
free_kvm_area();
return -ENOMEM;
}
per_cpu(vmxarea, cpu) = vmcs;
}
return 0;
}
static __init int hardware_setup(void)
{
if (setup_vmcs_config(&vmcs_config) < 0)
return -EIO;
if (boot_cpu_has(X86_FEATURE_NX))
kvm_enable_efer_bits(EFER_NX);
if (!cpu_has_vmx_vpid())
enable_vpid = 0;
if (!cpu_has_vmx_ept() ||
!cpu_has_vmx_ept_4levels()) {
enable_ept = 0;
enable_unrestricted_guest = 0;
}
if (!cpu_has_vmx_unrestricted_guest())
enable_unrestricted_guest = 0;
if (!cpu_has_vmx_flexpriority())
flexpriority_enabled = 0;
if (!cpu_has_vmx_tpr_shadow())
kvm_x86_ops->update_cr8_intercept = NULL;
if (enable_ept && !cpu_has_vmx_ept_2m_page())
kvm_disable_largepages();
if (!cpu_has_vmx_ple())
ple_gap = 0;
return alloc_kvm_area();
}
static __exit void hardware_unsetup(void)
{
free_kvm_area();
}
static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
{
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
vmcs_write16(sf->selector, save->selector);
vmcs_writel(sf->base, save->base);
vmcs_write32(sf->limit, save->limit);
vmcs_write32(sf->ar_bytes, save->ar);
} else {
u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
<< AR_DPL_SHIFT;
vmcs_write32(sf->ar_bytes, 0x93 | dpl);
}
}
static void enter_pmode(struct kvm_vcpu *vcpu)
{
unsigned long flags;
struct vcpu_vmx *vmx = to_vmx(vcpu);
vmx->emulation_required = 1;
vmx->rmode.vm86_active = 0;
vmx_segment_cache_clear(vmx);
vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector);
vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
flags = vmcs_readl(GUEST_RFLAGS);
flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
vmcs_writel(GUEST_RFLAGS, flags);
vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
update_exception_bitmap(vcpu);
if (emulate_invalid_guest_state)
return;
fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es);
fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds);
fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs);
fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs);
vmx_segment_cache_clear(vmx);
vmcs_write16(GUEST_SS_SELECTOR, 0);
vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
vmcs_write16(GUEST_CS_SELECTOR,
vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
}
static gva_t rmode_tss_base(struct kvm *kvm)
{
if (!kvm->arch.tss_addr) {
struct kvm_memslots *slots;
gfn_t base_gfn;
slots = kvm_memslots(kvm);
base_gfn = slots->memslots[0].base_gfn +
kvm->memslots->memslots[0].npages - 3;
return base_gfn << PAGE_SHIFT;
}
return kvm->arch.tss_addr;
}
static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
{
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
save->selector = vmcs_read16(sf->selector);
save->base = vmcs_readl(sf->base);
save->limit = vmcs_read32(sf->limit);
save->ar = vmcs_read32(sf->ar_bytes);
vmcs_write16(sf->selector, save->base >> 4);
vmcs_write32(sf->base, save->base & 0xffff0);
vmcs_write32(sf->limit, 0xffff);
vmcs_write32(sf->ar_bytes, 0xf3);
if (save->base & 0xf)
printk_once(KERN_WARNING "kvm: segment base is not paragraph"
" aligned when entering protected mode (seg=%d)",
seg);
}
static void enter_rmode(struct kvm_vcpu *vcpu)
{
unsigned long flags;
struct vcpu_vmx *vmx = to_vmx(vcpu);
if (enable_unrestricted_guest)
return;
vmx->emulation_required = 1;
vmx->rmode.vm86_active = 1;
/*
* Very old userspace does not call KVM_SET_TSS_ADDR before entering
* vcpu. Call it here with phys address pointing 16M below 4G.
*/
if (!vcpu->kvm->arch.tss_addr) {
printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
"called before entering vcpu\n");
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
vmx_set_tss_addr(vcpu->kvm, 0xfeffd000);
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
}
vmx_segment_cache_clear(vmx);
vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
flags = vmcs_readl(GUEST_RFLAGS);
vmx->rmode.save_rflags = flags;
flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
vmcs_writel(GUEST_RFLAGS, flags);
vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
update_exception_bitmap(vcpu);
if (emulate_invalid_guest_state)
goto continue_rmode;
vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
vmcs_write32(GUEST_SS_LIMIT, 0xffff);
vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
vmcs_write32(GUEST_CS_LIMIT, 0xffff);
if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
vmcs_writel(GUEST_CS_BASE, 0xf0000);
vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es);
fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds);
fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs);
fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs);
continue_rmode:
kvm_mmu_reset_context(vcpu);
}
static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
if (!msr)
return;
/*
* Force kernel_gs_base reloading before EFER changes, as control
* of this msr depends on is_long_mode().
*/
vmx_load_host_state(to_vmx(vcpu));
vcpu->arch.efer = efer;
if (efer & EFER_LMA) {
vmcs_write32(VM_ENTRY_CONTROLS,
vmcs_read32(VM_ENTRY_CONTROLS) |
VM_ENTRY_IA32E_MODE);
msr->data = efer;
} else {
vmcs_write32(VM_ENTRY_CONTROLS,
vmcs_read32(VM_ENTRY_CONTROLS) &
~VM_ENTRY_IA32E_MODE);
msr->data = efer & ~EFER_LME;
}
setup_msrs(vmx);
}
#ifdef CONFIG_X86_64
static void enter_lmode(struct kvm_vcpu *vcpu)
{
u32 guest_tr_ar;
vmx_segment_cache_clear(to_vmx(vcpu));
guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
__func__);
vmcs_write32(GUEST_TR_AR_BYTES,
(guest_tr_ar & ~AR_TYPE_MASK)
| AR_TYPE_BUSY_64_TSS);
}
vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
}
static void exit_lmode(struct kvm_vcpu *vcpu)
{
vmcs_write32(VM_ENTRY_CONTROLS,
vmcs_read32(VM_ENTRY_CONTROLS)
& ~VM_ENTRY_IA32E_MODE);
vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
}
#endif
static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
{
vpid_sync_context(to_vmx(vcpu));
if (enable_ept) {
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
}
}
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
{
ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
}
static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
{
if (enable_ept && is_paging(vcpu))
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
}
static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
}
static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
if (!test_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_dirty))
return;
if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]);
vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]);
vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]);
vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]);
}
}
static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
{
if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
}
__set_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_avail);
__set_bit(VCPU_EXREG_PDPTR,
(unsigned long *)&vcpu->arch.regs_dirty);
}
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
unsigned long cr0,
struct kvm_vcpu *vcpu)
{
if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
vmx_decache_cr3(vcpu);
if (!(cr0 & X86_CR0_PG)) {
/* From paging/starting to nonpaging */
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
(CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING));
vcpu->arch.cr0 = cr0;
vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
} else if (!is_paging(vcpu)) {
/* From nonpaging to paging */
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
~(CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING));
vcpu->arch.cr0 = cr0;
vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
}
if (!(cr0 & X86_CR0_WP))
*hw_cr0 &= ~X86_CR0_WP;
}
static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long hw_cr0;
if (enable_unrestricted_guest)
hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST)
| KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
else
hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
enter_pmode(vcpu);
if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
enter_rmode(vcpu);
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
enter_lmode(vcpu);
if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
exit_lmode(vcpu);
}
#endif
if (enable_ept)
ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
if (!vcpu->fpu_active)
hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
vmcs_writel(CR0_READ_SHADOW, cr0);
vmcs_writel(GUEST_CR0, hw_cr0);
vcpu->arch.cr0 = cr0;
__clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
}
static u64 construct_eptp(unsigned long root_hpa)
{
u64 eptp;
/* TODO write the value reading from MSR */
eptp = VMX_EPT_DEFAULT_MT |
VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
eptp |= (root_hpa & PAGE_MASK);
return eptp;
}
static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
unsigned long guest_cr3;
u64 eptp;
guest_cr3 = cr3;
if (enable_ept) {
eptp = construct_eptp(cr3);
vmcs_write64(EPT_POINTER, eptp);
guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
vcpu->kvm->arch.ept_identity_map_addr;
ept_load_pdptrs(vcpu);
}
vmx_flush_tlb(vcpu);
vmcs_writel(GUEST_CR3, guest_cr3);
}
static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
vcpu->arch.cr4 = cr4;
if (enable_ept) {
if (!is_paging(vcpu)) {
hw_cr4 &= ~X86_CR4_PAE;
hw_cr4 |= X86_CR4_PSE;
} else if (!(cr4 & X86_CR4_PAE)) {
hw_cr4 &= ~X86_CR4_PAE;
}
}
vmcs_writel(CR4_READ_SHADOW, cr4);
vmcs_writel(GUEST_CR4, hw_cr4);
}
static void vmx_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_save_segment *save;
u32 ar;
if (vmx->rmode.vm86_active
&& (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
|| seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
|| seg == VCPU_SREG_GS)
&& !emulate_invalid_guest_state) {
switch (seg) {
case VCPU_SREG_TR: save = &vmx->rmode.tr; break;
case VCPU_SREG_ES: save = &vmx->rmode.es; break;
case VCPU_SREG_DS: save = &vmx->rmode.ds; break;
case VCPU_SREG_FS: save = &vmx->rmode.fs; break;
case VCPU_SREG_GS: save = &vmx->rmode.gs; break;
default: BUG();
}
var->selector = save->selector;
var->base = save->base;
var->limit = save->limit;
ar = save->ar;
if (seg == VCPU_SREG_TR
|| var->selector == vmx_read_guest_seg_selector(vmx, seg))
goto use_saved_rmode_seg;
}
var->base = vmx_read_guest_seg_base(vmx, seg);
var->limit = vmx_read_guest_seg_limit(vmx, seg);
var->selector = vmx_read_guest_seg_selector(vmx, seg);
ar = vmx_read_guest_seg_ar(vmx, seg);
use_saved_rmode_seg:
if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
ar = 0;
var->type = ar & 15;
var->s = (ar >> 4) & 1;
var->dpl = (ar >> 5) & 3;
var->present = (ar >> 7) & 1;
var->avl = (ar >> 12) & 1;
var->l = (ar >> 13) & 1;
var->db = (ar >> 14) & 1;
var->g = (ar >> 15) & 1;
var->unusable = (ar >> 16) & 1;
}
static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
struct kvm_segment s;
if (to_vmx(vcpu)->rmode.vm86_active) {
vmx_get_segment(vcpu, &s, seg);
return s.base;
}
return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
}
static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
{
if (!is_protmode(vcpu))
return 0;
if (!is_long_mode(vcpu)
&& (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */
return 3;
return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3;
}
static int vmx_get_cpl(struct kvm_vcpu *vcpu)
{
if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
__set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu);
}
return to_vmx(vcpu)->cpl;
}
static u32 vmx_segment_access_rights(struct kvm_segment *var)
{
u32 ar;
if (var->unusable)
ar = 1 << 16;
else {
ar = var->type & 15;
ar |= (var->s & 1) << 4;
ar |= (var->dpl & 3) << 5;
ar |= (var->present & 1) << 7;
ar |= (var->avl & 1) << 12;
ar |= (var->l & 1) << 13;
ar |= (var->db & 1) << 14;
ar |= (var->g & 1) << 15;
}
if (ar == 0) /* a 0 value means unusable */
ar = AR_UNUSABLE_MASK;
return ar;
}
static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
u32 ar;
vmx_segment_cache_clear(vmx);
if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
vmcs_write16(sf->selector, var->selector);
vmx->rmode.tr.selector = var->selector;
vmx->rmode.tr.base = var->base;
vmx->rmode.tr.limit = var->limit;
vmx->rmode.tr.ar = vmx_segment_access_rights(var);
return;
}
vmcs_writel(sf->base, var->base);
vmcs_write32(sf->limit, var->limit);
vmcs_write16(sf->selector, var->selector);
if (vmx->rmode.vm86_active && var->s) {
/*
* Hack real-mode segments into vm86 compatibility.
*/
if (var->base == 0xffff0000 && var->selector == 0xf000)
vmcs_writel(sf->base, 0xf0000);
ar = 0xf3;
} else
ar = vmx_segment_access_rights(var);
/*
* Fix the "Accessed" bit in AR field of segment registers for older
* qemu binaries.
* IA32 arch specifies that at the time of processor reset the
* "Accessed" bit in the AR field of segment registers is 1. And qemu
* is setting it to 0 in the usedland code. This causes invalid guest
* state vmexit when "unrestricted guest" mode is turned on.
* Fix for this setup issue in cpu_reset is being pushed in the qemu
* tree. Newer qemu binaries with that qemu fix would not need this
* kvm hack.
*/
if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
ar |= 0x1; /* Accessed */
vmcs_write32(sf->ar_bytes, ar);
__clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
}
static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
{
u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
*db = (ar >> 14) & 1;
*l = (ar >> 13) & 1;
}
static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
dt->address = vmcs_readl(GUEST_IDTR_BASE);
}
static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
vmcs_writel(GUEST_IDTR_BASE, dt->address);
}
static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
dt->address = vmcs_readl(GUEST_GDTR_BASE);
}
static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
vmcs_writel(GUEST_GDTR_BASE, dt->address);
}
static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
{
struct kvm_segment var;
u32 ar;
vmx_get_segment(vcpu, &var, seg);
ar = vmx_segment_access_rights(&var);
if (var.base != (var.selector << 4))
return false;
if (var.limit != 0xffff)
return false;
if (ar != 0xf3)
return false;
return true;
}
static bool code_segment_valid(struct kvm_vcpu *vcpu)
{
struct kvm_segment cs;
unsigned int cs_rpl;
vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
cs_rpl = cs.selector & SELECTOR_RPL_MASK;
if (cs.unusable)
return false;
if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
return false;
if (!cs.s)
return false;
if (cs.type & AR_TYPE_WRITEABLE_MASK) {
if (cs.dpl > cs_rpl)
return false;
} else {
if (cs.dpl != cs_rpl)
return false;
}
if (!cs.present)
return false;
/* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
return true;
}
static bool stack_segment_valid(struct kvm_vcpu *vcpu)
{
struct kvm_segment ss;
unsigned int ss_rpl;
vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
ss_rpl = ss.selector & SELECTOR_RPL_MASK;
if (ss.unusable)
return true;
if (ss.type != 3 && ss.type != 7)
return false;
if (!ss.s)
return false;
if (ss.dpl != ss_rpl) /* DPL != RPL */
return false;
if (!ss.present)
return false;
return true;
}
static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
{
struct kvm_segment var;
unsigned int rpl;
vmx_get_segment(vcpu, &var, seg);
rpl = var.selector & SELECTOR_RPL_MASK;
if (var.unusable)
return true;
if (!var.s)
return false;
if (!var.present)
return false;
if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
if (var.dpl < rpl) /* DPL < RPL */
return false;
}
/* TODO: Add other members to kvm_segment_field to allow checking for other access
* rights flags
*/
return true;
}
static bool tr_valid(struct kvm_vcpu *vcpu)
{
struct kvm_segment tr;
vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
if (tr.unusable)
return false;
if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */
return false;
if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
return false;
if (!tr.present)
return false;
return true;
}
static bool ldtr_valid(struct kvm_vcpu *vcpu)
{
struct kvm_segment ldtr;
vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
if (ldtr.unusable)
return true;
if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */
return false;
if (ldtr.type != 2)
return false;
if (!ldtr.present)
return false;
return true;
}
static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
{
struct kvm_segment cs, ss;
vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
return ((cs.selector & SELECTOR_RPL_MASK) ==
(ss.selector & SELECTOR_RPL_MASK));
}
/*
* Check if guest state is valid. Returns true if valid, false if
* not.
* We assume that registers are always usable
*/
static bool guest_state_valid(struct kvm_vcpu *vcpu)
{
/* real mode guest state checks */
if (!is_protmode(vcpu)) {
if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
return false;
if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
return false;
if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
return false;
if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
return false;
if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
return false;
if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
return false;
} else {
/* protected mode guest state checks */
if (!cs_ss_rpl_check(vcpu))
return false;
if (!code_segment_valid(vcpu))
return false;
if (!stack_segment_valid(vcpu))
return false;
if (!data_segment_valid(vcpu, VCPU_SREG_DS))
return false;
if (!data_segment_valid(vcpu, VCPU_SREG_ES))
return false;
if (!data_segment_valid(vcpu, VCPU_SREG_FS))
return false;
if (!data_segment_valid(vcpu, VCPU_SREG_GS))
return false;
if (!tr_valid(vcpu))
return false;
if (!ldtr_valid(vcpu))
return false;
}
/* TODO:
* - Add checks on RIP
* - Add checks on RFLAGS
*/
return true;
}
static int init_rmode_tss(struct kvm *kvm)
{
gfn_t fn;
u16 data = 0;
int r, idx, ret = 0;
idx = srcu_read_lock(&kvm->srcu);
fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
if (r < 0)
goto out;
data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
r = kvm_write_guest_page(kvm, fn++, &data,
TSS_IOPB_BASE_OFFSET, sizeof(u16));
if (r < 0)
goto out;
r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
if (r < 0)
goto out;
r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
if (r < 0)
goto out;
data = ~0;
r = kvm_write_guest_page(kvm, fn, &data,
RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
sizeof(u8));
if (r < 0)
goto out;
ret = 1;
out:
srcu_read_unlock(&kvm->srcu, idx);
return ret;
}
static int init_rmode_identity_map(struct kvm *kvm)
{
int i, idx, r, ret;
pfn_t identity_map_pfn;
u32 tmp;
if (!enable_ept)
return 1;
if (unlikely(!kvm->arch.ept_identity_pagetable)) {
printk(KERN_ERR "EPT: identity-mapping pagetable "
"haven't been allocated!\n");
return 0;
}
if (likely(kvm->arch.ept_identity_pagetable_done))
return 1;
ret = 0;
identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
idx = srcu_read_lock(&kvm->srcu);
r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
if (r < 0)
goto out;
/* Set up identity-mapping pagetable for EPT in real mode */
for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
r = kvm_write_guest_page(kvm, identity_map_pfn,
&tmp, i * sizeof(tmp), sizeof(tmp));
if (r < 0)
goto out;
}
kvm->arch.ept_identity_pagetable_done = true;
ret = 1;
out:
srcu_read_unlock(&kvm->srcu, idx);
return ret;
}
static void seg_setup(int seg)
{
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
unsigned int ar;
vmcs_write16(sf->selector, 0);
vmcs_writel(sf->base, 0);
vmcs_write32(sf->limit, 0xffff);
if (enable_unrestricted_guest) {
ar = 0x93;
if (seg == VCPU_SREG_CS)
ar |= 0x08; /* code segment */
} else
ar = 0xf3;
vmcs_write32(sf->ar_bytes, ar);
}
static int alloc_apic_access_page(struct kvm *kvm)
{
struct kvm_userspace_memory_region kvm_userspace_mem;
int r = 0;
mutex_lock(&kvm->slots_lock);
if (kvm->arch.apic_access_page)
goto out;
kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
kvm_userspace_mem.flags = 0;
kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
kvm_userspace_mem.memory_size = PAGE_SIZE;
r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
if (r)
goto out;
kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
out:
mutex_unlock(&kvm->slots_lock);
return r;
}
static int alloc_identity_pagetable(struct kvm *kvm)
{
struct kvm_userspace_memory_region kvm_userspace_mem;
int r = 0;
mutex_lock(&kvm->slots_lock);
if (kvm->arch.ept_identity_pagetable)
goto out;
kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
kvm_userspace_mem.flags = 0;
kvm_userspace_mem.guest_phys_addr =
kvm->arch.ept_identity_map_addr;
kvm_userspace_mem.memory_size = PAGE_SIZE;
r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
if (r)
goto out;
kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
out:
mutex_unlock(&kvm->slots_lock);
return r;
}
static void allocate_vpid(struct vcpu_vmx *vmx)
{
int vpid;
vmx->vpid = 0;
if (!enable_vpid)
return;
spin_lock(&vmx_vpid_lock);
vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
if (vpid < VMX_NR_VPIDS) {
vmx->vpid = vpid;
__set_bit(vpid, vmx_vpid_bitmap);
}
spin_unlock(&vmx_vpid_lock);
}
static void free_vpid(struct vcpu_vmx *vmx)
{
if (!enable_vpid)
return;
spin_lock(&vmx_vpid_lock);
if (vmx->vpid != 0)
__clear_bit(vmx->vpid, vmx_vpid_bitmap);
spin_unlock(&vmx_vpid_lock);
}
static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
{
int f = sizeof(unsigned long);
if (!cpu_has_vmx_msr_bitmap())
return;
/*
* See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
* have the write-low and read-high bitmap offsets the wrong way round.
* We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
*/
if (msr <= 0x1fff) {
__clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
__clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
msr &= 0x1fff;
__clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
__clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
}
}
static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
{
if (!longmode_only)
__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
}
/*
* Sets up the vmcs for emulated real mode.
*/
static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
{
u32 host_sysenter_cs, msr_low, msr_high;
u32 junk;
u64 host_pat;
unsigned long a;
struct desc_ptr dt;
int i;
unsigned long kvm_vmx_return;
u32 exec_control;
/* I/O */
vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
if (cpu_has_vmx_msr_bitmap())
vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
/* Control */
vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
vmcs_config.pin_based_exec_ctrl);
exec_control = vmcs_config.cpu_based_exec_ctrl;
<