blob: 8e0c0844c6b9681e31e64bdeba0f1822108bec3e [file] [log] [blame]
/*
* Kernel-based Virtual Machine driver for Linux
*
* AMD SVM support
*
* Copyright (C) 2006 Qumranet, Inc.
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Yaniv Kamay <yaniv@qumranet.com>
* Avi Kivity <avi@qumranet.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
*
*/
#include <linux/kvm_host.h>
#include "irq.h"
#include "mmu.h"
#include "kvm_cache_regs.h"
#include "x86.h"
#include "cpuid.h"
#include "pmu.h"
#include <linux/module.h>
#include <linux/mod_devicetable.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/trace_events.h>
#include <linux/slab.h>
#include <asm/perf_event.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/debugreg.h>
#include <asm/kvm_para.h>
#include <asm/virtext.h>
#include "trace.h"
#define __ex(x) __kvm_handle_fault_on_reboot(x)
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
static const struct x86_cpu_id svm_cpu_id[] = {
X86_FEATURE_MATCH(X86_FEATURE_SVM),
{}
};
MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#define IOPM_ALLOC_ORDER 2
#define MSRPM_ALLOC_ORDER 1
#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3
#define SVM_FEATURE_NPT (1 << 0)
#define SVM_FEATURE_LBRV (1 << 1)
#define SVM_FEATURE_SVML (1 << 2)
#define SVM_FEATURE_NRIP (1 << 3)
#define SVM_FEATURE_TSC_RATE (1 << 4)
#define SVM_FEATURE_VMCB_CLEAN (1 << 5)
#define SVM_FEATURE_FLUSH_ASID (1 << 6)
#define SVM_FEATURE_DECODE_ASSIST (1 << 7)
#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
#define TSC_RATIO_RSVD 0xffffff0000000000ULL
#define TSC_RATIO_MIN 0x0000000000000001ULL
#define TSC_RATIO_MAX 0x000000ffffffffffULL
static bool erratum_383_found __read_mostly;
static const u32 host_save_user_msrs[] = {
#ifdef CONFIG_X86_64
MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
MSR_FS_BASE,
#endif
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
};
#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
struct kvm_vcpu;
struct nested_state {
struct vmcb *hsave;
u64 hsave_msr;
u64 vm_cr_msr;
u64 vmcb;
/* These are the merged vectors */
u32 *msrpm;
/* gpa pointers to the real vectors */
u64 vmcb_msrpm;
u64 vmcb_iopm;
/* A VMEXIT is required but not yet emulated */
bool exit_required;
/* cache for intercepts of the guest */
u32 intercept_cr;
u32 intercept_dr;
u32 intercept_exceptions;
u64 intercept;
/* Nested Paging related state */
u64 nested_cr3;
};
#define MSRPM_OFFSETS 16
static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
/*
* Set osvw_len to higher value when updated Revision Guides
* are published and we know what the new status bits are
*/
static uint64_t osvw_len = 4, osvw_status;
struct vcpu_svm {
struct kvm_vcpu vcpu;
struct vmcb *vmcb;
unsigned long vmcb_pa;
struct svm_cpu_data *svm_data;
uint64_t asid_generation;
uint64_t sysenter_esp;
uint64_t sysenter_eip;
u64 next_rip;
u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
struct {
u16 fs;
u16 gs;
u16 ldt;
u64 gs_base;
} host;
u32 *msrpm;
ulong nmi_iret_rip;
struct nested_state nested;
bool nmi_singlestep;
unsigned int3_injected;
unsigned long int3_rip;
u32 apf_reason;
u64 tsc_ratio;
};
static DEFINE_PER_CPU(u64, current_tsc_ratio);
#define TSC_RATIO_DEFAULT 0x0100000000ULL
#define MSR_INVALID 0xffffffffU
static const struct svm_direct_access_msrs {
u32 index; /* Index of the MSR */
bool always; /* True if intercept is always on */
} direct_access_msrs[] = {
{ .index = MSR_STAR, .always = true },
{ .index = MSR_IA32_SYSENTER_CS, .always = true },
#ifdef CONFIG_X86_64
{ .index = MSR_GS_BASE, .always = true },
{ .index = MSR_FS_BASE, .always = true },
{ .index = MSR_KERNEL_GS_BASE, .always = true },
{ .index = MSR_LSTAR, .always = true },
{ .index = MSR_CSTAR, .always = true },
{ .index = MSR_SYSCALL_MASK, .always = true },
#endif
{ .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
{ .index = MSR_IA32_LASTINTFROMIP, .always = false },
{ .index = MSR_IA32_LASTINTTOIP, .always = false },
{ .index = MSR_INVALID, .always = false },
};
/* enable NPT for AMD64 and X86 with PAE */
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
static bool npt_enabled = true;
#else
static bool npt_enabled;
#endif
/* allow nested paging (virtualized MMU) for all guests */
static int npt = true;
module_param(npt, int, S_IRUGO);
/* allow nested virtualization in KVM/SVM */
static int nested = true;
module_param(nested, int, S_IRUGO);
static void svm_flush_tlb(struct kvm_vcpu *vcpu);
static void svm_complete_interrupts(struct vcpu_svm *svm);
static int nested_svm_exit_handled(struct vcpu_svm *svm);
static int nested_svm_intercept(struct vcpu_svm *svm);
static int nested_svm_vmexit(struct vcpu_svm *svm);
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
static u64 __scale_tsc(u64 ratio, u64 tsc);
enum {
VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
pause filter count */
VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */
VMCB_ASID, /* ASID */
VMCB_INTR, /* int_ctl, int_vector */
VMCB_NPT, /* npt_en, nCR3, gPAT */
VMCB_CR, /* CR0, CR3, CR4, EFER */
VMCB_DR, /* DR6, DR7 */
VMCB_DT, /* GDT, IDT */
VMCB_SEG, /* CS, DS, SS, ES, CPL */
VMCB_CR2, /* CR2 only */
VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
VMCB_DIRTY_MAX,
};
/* TPR and CR2 are always written before VMRUN */
#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
static inline void mark_all_dirty(struct vmcb *vmcb)
{
vmcb->control.clean = 0;
}
static inline void mark_all_clean(struct vmcb *vmcb)
{
vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
& ~VMCB_ALWAYS_DIRTY_MASK;
}
static inline void mark_dirty(struct vmcb *vmcb, int bit)
{
vmcb->control.clean &= ~(1 << bit);
}
static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
{
return container_of(vcpu, struct vcpu_svm, vcpu);
}
static void recalc_intercepts(struct vcpu_svm *svm)
{
struct vmcb_control_area *c, *h;
struct nested_state *g;
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
if (!is_guest_mode(&svm->vcpu))
return;
c = &svm->vmcb->control;
h = &svm->nested.hsave->control;
g = &svm->nested;
c->intercept_cr = h->intercept_cr | g->intercept_cr;
c->intercept_dr = h->intercept_dr | g->intercept_dr;
c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
c->intercept = h->intercept | g->intercept;
}
static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
{
if (is_guest_mode(&svm->vcpu))
return svm->nested.hsave;
else
return svm->vmcb;
}
static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
{
struct vmcb *vmcb = get_host_vmcb(svm);
vmcb->control.intercept_cr |= (1U << bit);
recalc_intercepts(svm);
}
static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
{
struct vmcb *vmcb = get_host_vmcb(svm);
vmcb->control.intercept_cr &= ~(1U << bit);
recalc_intercepts(svm);
}
static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
{
struct vmcb *vmcb = get_host_vmcb(svm);
return vmcb->control.intercept_cr & (1U << bit);
}
static inline void set_dr_intercepts(struct vcpu_svm *svm)
{
struct vmcb *vmcb = get_host_vmcb(svm);
vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
| (1 << INTERCEPT_DR1_READ)
| (1 << INTERCEPT_DR2_READ)
| (1 << INTERCEPT_DR3_READ)
| (1 << INTERCEPT_DR4_READ)
| (1 << INTERCEPT_DR5_READ)
| (1 << INTERCEPT_DR6_READ)
| (1 << INTERCEPT_DR7_READ)
| (1 << INTERCEPT_DR0_WRITE)
| (1 << INTERCEPT_DR1_WRITE)
| (1 << INTERCEPT_DR2_WRITE)
| (1 << INTERCEPT_DR3_WRITE)
| (1 << INTERCEPT_DR4_WRITE)
| (1 << INTERCEPT_DR5_WRITE)
| (1 << INTERCEPT_DR6_WRITE)
| (1 << INTERCEPT_DR7_WRITE);
recalc_intercepts(svm);
}
static inline void clr_dr_intercepts(struct vcpu_svm *svm)
{
struct vmcb *vmcb = get_host_vmcb(svm);
vmcb->control.intercept_dr = 0;
recalc_intercepts(svm);
}
static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
{
struct vmcb *vmcb = get_host_vmcb(svm);
vmcb->control.intercept_exceptions |= (1U << bit);
recalc_intercepts(svm);
}
static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
{
struct vmcb *vmcb = get_host_vmcb(svm);
vmcb->control.intercept_exceptions &= ~(1U << bit);
recalc_intercepts(svm);
}
static inline void set_intercept(struct vcpu_svm *svm, int bit)
{
struct vmcb *vmcb = get_host_vmcb(svm);
vmcb->control.intercept |= (1ULL << bit);
recalc_intercepts(svm);
}
static inline void clr_intercept(struct vcpu_svm *svm, int bit)
{
struct vmcb *vmcb = get_host_vmcb(svm);
vmcb->control.intercept &= ~(1ULL << bit);
recalc_intercepts(svm);
}
static inline void enable_gif(struct vcpu_svm *svm)
{
svm->vcpu.arch.hflags |= HF_GIF_MASK;
}
static inline void disable_gif(struct vcpu_svm *svm)
{
svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
}
static inline bool gif_set(struct vcpu_svm *svm)
{
return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
}
static unsigned long iopm_base;
struct kvm_ldttss_desc {
u16 limit0;
u16 base0;
unsigned base1:8, type:5, dpl:2, p:1;
unsigned limit1:4, zero0:3, g:1, base2:8;
u32 base3;
u32 zero1;
} __attribute__((packed));
struct svm_cpu_data {
int cpu;
u64 asid_generation;
u32 max_asid;
u32 next_asid;
struct kvm_ldttss_desc *tss_desc;
struct page *save_area;
};
static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
struct svm_init_data {
int cpu;
int r;
};
static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
#define MSRS_RANGE_SIZE 2048
#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
static u32 svm_msrpm_offset(u32 msr)
{
u32 offset;
int i;
for (i = 0; i < NUM_MSR_MAPS; i++) {
if (msr < msrpm_ranges[i] ||
msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
continue;
offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
offset += (i * MSRS_RANGE_SIZE); /* add range offset */
/* Now we have the u8 offset - but need the u32 offset */
return offset / 4;
}
/* MSR not in any range */
return MSR_INVALID;
}
#define MAX_INST_SIZE 15
static inline void clgi(void)
{
asm volatile (__ex(SVM_CLGI));
}
static inline void stgi(void)
{
asm volatile (__ex(SVM_STGI));
}
static inline void invlpga(unsigned long addr, u32 asid)
{
asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
}
static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
return PT64_ROOT_LEVEL;
#else
return PT32E_ROOT_LEVEL;
#endif
}
static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
vcpu->arch.efer = efer;
if (!npt_enabled && !(efer & EFER_LMA))
efer &= ~EFER_LME;
to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
}
static int is_external_interrupt(u32 info)
{
info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
}
static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
u32 ret = 0;
if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
return ret;
}
static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
struct vcpu_svm *svm = to_svm(vcpu);
if (mask == 0)
svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
else
svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
}
static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
if (svm->vmcb->control.next_rip != 0) {
WARN_ON(!static_cpu_has(X86_FEATURE_NRIPS));
svm->next_rip = svm->vmcb->control.next_rip;
}
if (!svm->next_rip) {
if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
EMULATE_DONE)
printk(KERN_DEBUG "%s: NOP\n", __func__);
return;
}
if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
__func__, kvm_rip_read(vcpu), svm->next_rip);
kvm_rip_write(vcpu, svm->next_rip);
svm_set_interrupt_shadow(vcpu, 0);
}
static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code,
bool reinject)
{
struct vcpu_svm *svm = to_svm(vcpu);
/*
* If we are within a nested VM we'd better #VMEXIT and let the guest
* handle the exception
*/
if (!reinject &&
nested_svm_check_exception(svm, nr, has_error_code, error_code))
return;
if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
/*
* For guest debugging where we have to reinject #BP if some
* INT3 is guest-owned:
* Emulate nRIP by moving RIP forward. Will fail if injection
* raises a fault that is not intercepted. Still better than
* failing in all cases.
*/
skip_emulated_instruction(&svm->vcpu);
rip = kvm_rip_read(&svm->vcpu);
svm->int3_rip = rip + svm->vmcb->save.cs.base;
svm->int3_injected = rip - old_rip;
}
svm->vmcb->control.event_inj = nr
| SVM_EVTINJ_VALID
| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
| SVM_EVTINJ_TYPE_EXEPT;
svm->vmcb->control.event_inj_err = error_code;
}
static void svm_init_erratum_383(void)
{
u32 low, high;
int err;
u64 val;
if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
return;
/* Use _safe variants to not break nested virtualization */
val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
if (err)
return;
val |= (1ULL << 47);
low = lower_32_bits(val);
high = upper_32_bits(val);
native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
erratum_383_found = true;
}
static void svm_init_osvw(struct kvm_vcpu *vcpu)
{
/*
* Guests should see errata 400 and 415 as fixed (assuming that
* HLT and IO instructions are intercepted).
*/
vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
vcpu->arch.osvw.status = osvw_status & ~(6ULL);
/*
* By increasing VCPU's osvw.length to 3 we are telling the guest that
* all osvw.status bits inside that length, including bit 0 (which is
* reserved for erratum 298), are valid. However, if host processor's
* osvw_len is 0 then osvw_status[0] carries no information. We need to
* be conservative here and therefore we tell the guest that erratum 298
* is present (because we really don't know).
*/
if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
vcpu->arch.osvw.status |= 1;
}
static int has_svm(void)
{
const char *msg;
if (!cpu_has_svm(&msg)) {
printk(KERN_INFO "has_svm: %s\n", msg);
return 0;
}
return 1;
}
static void svm_hardware_disable(void)
{
/* Make sure we clean up behind us */
if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
cpu_svm_disable();
amd_pmu_disable_virt();
}
static int svm_hardware_enable(void)
{
struct svm_cpu_data *sd;
uint64_t efer;
struct desc_ptr gdt_descr;
struct desc_struct *gdt;
int me = raw_smp_processor_id();
rdmsrl(MSR_EFER, efer);
if (efer & EFER_SVME)
return -EBUSY;
if (!has_svm()) {
pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
return -EINVAL;
}
sd = per_cpu(svm_data, me);
if (!sd) {
pr_err("%s: svm_data is NULL on %d\n", __func__, me);
return -EINVAL;
}
sd->asid_generation = 1;
sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
sd->next_asid = sd->max_asid + 1;
native_store_gdt(&gdt_descr);
gdt = (struct desc_struct *)gdt_descr.address;
sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
wrmsrl(MSR_EFER, efer | EFER_SVME);
wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
__this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
}
/*
* Get OSVW bits.
*
* Note that it is possible to have a system with mixed processor
* revisions and therefore different OSVW bits. If bits are not the same
* on different processors then choose the worst case (i.e. if erratum
* is present on one processor and not on another then assume that the
* erratum is present everywhere).
*/
if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
uint64_t len, status = 0;
int err;
len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
if (!err)
status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
&err);
if (err)
osvw_status = osvw_len = 0;
else {
if (len < osvw_len)
osvw_len = len;
osvw_status |= status;
osvw_status &= (1ULL << osvw_len) - 1;
}
} else
osvw_status = osvw_len = 0;
svm_init_erratum_383();
amd_pmu_enable_virt();
return 0;
}
static void svm_cpu_uninit(int cpu)
{
struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
if (!sd)
return;
per_cpu(svm_data, raw_smp_processor_id()) = NULL;
__free_page(sd->save_area);
kfree(sd);
}
static int svm_cpu_init(int cpu)
{
struct svm_cpu_data *sd;
int r;
sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
if (!sd)
return -ENOMEM;
sd->cpu = cpu;
sd->save_area = alloc_page(GFP_KERNEL);
r = -ENOMEM;
if (!sd->save_area)
goto err_1;
per_cpu(svm_data, cpu) = sd;
return 0;
err_1:
kfree(sd);
return r;
}
static bool valid_msr_intercept(u32 index)
{
int i;
for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
if (direct_access_msrs[i].index == index)
return true;
return false;
}
static void set_msr_interception(u32 *msrpm, unsigned msr,
int read, int write)
{
u8 bit_read, bit_write;
unsigned long tmp;
u32 offset;
/*
* If this warning triggers extend the direct_access_msrs list at the
* beginning of the file
*/
WARN_ON(!valid_msr_intercept(msr));
offset = svm_msrpm_offset(msr);
bit_read = 2 * (msr & 0x0f);
bit_write = 2 * (msr & 0x0f) + 1;
tmp = msrpm[offset];
BUG_ON(offset == MSR_INVALID);
read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
msrpm[offset] = tmp;
}
static void svm_vcpu_init_msrpm(u32 *msrpm)
{
int i;
memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
if (!direct_access_msrs[i].always)
continue;
set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
}
}
static void add_msr_offset(u32 offset)
{
int i;
for (i = 0; i < MSRPM_OFFSETS; ++i) {
/* Offset already in list? */
if (msrpm_offsets[i] == offset)
return;
/* Slot used by another offset? */
if (msrpm_offsets[i] != MSR_INVALID)
continue;
/* Add offset to list */
msrpm_offsets[i] = offset;
return;
}
/*
* If this BUG triggers the msrpm_offsets table has an overflow. Just
* increase MSRPM_OFFSETS in this case.
*/
BUG();
}
static void init_msrpm_offsets(void)
{
int i;
memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
u32 offset;
offset = svm_msrpm_offset(direct_access_msrs[i].index);
BUG_ON(offset == MSR_INVALID);
add_msr_offset(offset);
}
}
static void svm_enable_lbrv(struct vcpu_svm *svm)
{
u32 *msrpm = svm->msrpm;
svm->vmcb->control.lbr_ctl = 1;
set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
}
static void svm_disable_lbrv(struct vcpu_svm *svm)
{
u32 *msrpm = svm->msrpm;
svm->vmcb->control.lbr_ctl = 0;
set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
}
#define MTRR_TYPE_UC_MINUS 7
#define MTRR2PROTVAL_INVALID 0xff
static u8 mtrr2protval[8];
static u8 fallback_mtrr_type(int mtrr)
{
/*
* WT and WP aren't always available in the host PAT. Treat
* them as UC and UC- respectively. Everything else should be
* there.
*/
switch (mtrr)
{
case MTRR_TYPE_WRTHROUGH:
return MTRR_TYPE_UNCACHABLE;
case MTRR_TYPE_WRPROT:
return MTRR_TYPE_UC_MINUS;
default:
BUG();
}
}
static void build_mtrr2protval(void)
{
int i;
u64 pat;
for (i = 0; i < 8; i++)
mtrr2protval[i] = MTRR2PROTVAL_INVALID;
/* Ignore the invalid MTRR types. */
mtrr2protval[2] = 0;
mtrr2protval[3] = 0;
/*
* Use host PAT value to figure out the mapping from guest MTRR
* values to nested page table PAT/PCD/PWT values. We do not
* want to change the host PAT value every time we enter the
* guest.
*/
rdmsrl(MSR_IA32_CR_PAT, pat);
for (i = 0; i < 8; i++) {
u8 mtrr = pat >> (8 * i);
if (mtrr2protval[mtrr] == MTRR2PROTVAL_INVALID)
mtrr2protval[mtrr] = __cm_idx2pte(i);
}
for (i = 0; i < 8; i++) {
if (mtrr2protval[i] == MTRR2PROTVAL_INVALID) {
u8 fallback = fallback_mtrr_type(i);
mtrr2protval[i] = mtrr2protval[fallback];
BUG_ON(mtrr2protval[i] == MTRR2PROTVAL_INVALID);
}
}
}
static __init int svm_hardware_setup(void)
{
int cpu;
struct page *iopm_pages;
void *iopm_va;
int r;
iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
if (!iopm_pages)
return -ENOMEM;
iopm_va = page_address(iopm_pages);
memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
init_msrpm_offsets();
if (boot_cpu_has(X86_FEATURE_NX))
kvm_enable_efer_bits(EFER_NX);
if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
kvm_enable_efer_bits(EFER_FFXSR);
if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
u64 max;
kvm_has_tsc_control = true;
/*
* Make sure the user can only configure tsc_khz values that
* fit into a signed integer.
* A min value is not calculated needed because it will always
* be 1 on all machines and a value of 0 is used to disable
* tsc-scaling for the vcpu.
*/
max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX));
kvm_max_guest_tsc_khz = max;
}
if (nested) {
printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
}
for_each_possible_cpu(cpu) {
r = svm_cpu_init(cpu);
if (r)
goto err;
}
if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;
if (npt_enabled && !npt) {
printk(KERN_INFO "kvm: Nested Paging disabled\n");
npt_enabled = false;
}
if (npt_enabled) {
printk(KERN_INFO "kvm: Nested Paging enabled\n");
kvm_enable_tdp();
} else
kvm_disable_tdp();
build_mtrr2protval();
return 0;
err:
__free_pages(iopm_pages, IOPM_ALLOC_ORDER);
iopm_base = 0;
return r;
}
static __exit void svm_hardware_unsetup(void)
{
int cpu;
for_each_possible_cpu(cpu)
svm_cpu_uninit(cpu);
__free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
iopm_base = 0;
}
static void init_seg(struct vmcb_seg *seg)
{
seg->selector = 0;
seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
seg->limit = 0xffff;
seg->base = 0;
}
static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
{
seg->selector = 0;
seg->attrib = SVM_SELECTOR_P_MASK | type;
seg->limit = 0xffff;
seg->base = 0;
}
static u64 __scale_tsc(u64 ratio, u64 tsc)
{
u64 mult, frac, _tsc;
mult = ratio >> 32;
frac = ratio & ((1ULL << 32) - 1);
_tsc = tsc;
_tsc *= mult;
_tsc += (tsc >> 32) * frac;
_tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32;
return _tsc;
}
static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
{
struct vcpu_svm *svm = to_svm(vcpu);
u64 _tsc = tsc;
if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
_tsc = __scale_tsc(svm->tsc_ratio, tsc);
return _tsc;
}
static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
{
struct vcpu_svm *svm = to_svm(vcpu);
u64 ratio;
u64 khz;
/* Guest TSC same frequency as host TSC? */
if (!scale) {
svm->tsc_ratio = TSC_RATIO_DEFAULT;
return;
}
/* TSC scaling supported? */
if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
if (user_tsc_khz > tsc_khz) {
vcpu->arch.tsc_catchup = 1;
vcpu->arch.tsc_always_catchup = 1;
} else
WARN(1, "user requested TSC rate below hardware speed\n");
return;
}
khz = user_tsc_khz;
/* TSC scaling required - calculate ratio */
ratio = khz << 32;
do_div(ratio, tsc_khz);
if (ratio == 0 || ratio & TSC_RATIO_RSVD) {
WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n",
user_tsc_khz);
return;
}
svm->tsc_ratio = ratio;
}
static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
return svm->vmcb->control.tsc_offset;
}
static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
struct vcpu_svm *svm = to_svm(vcpu);
u64 g_tsc_offset = 0;
if (is_guest_mode(vcpu)) {
g_tsc_offset = svm->vmcb->control.tsc_offset -
svm->nested.hsave->control.tsc_offset;
svm->nested.hsave->control.tsc_offset = offset;
} else
trace_kvm_write_tsc_offset(vcpu->vcpu_id,
svm->vmcb->control.tsc_offset,
offset);
svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
{
struct vcpu_svm *svm = to_svm(vcpu);
if (host) {
if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
WARN_ON(adjustment < 0);
adjustment = svm_scale_tsc(vcpu, (u64)adjustment);
}
svm->vmcb->control.tsc_offset += adjustment;
if (is_guest_mode(vcpu))
svm->nested.hsave->control.tsc_offset += adjustment;
else
trace_kvm_write_tsc_offset(vcpu->vcpu_id,
svm->vmcb->control.tsc_offset - adjustment,
svm->vmcb->control.tsc_offset);
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
{
u64 tsc;
tsc = svm_scale_tsc(vcpu, native_read_tsc());
return target_tsc - tsc;
}
static void svm_set_guest_pat(struct vcpu_svm *svm, u64 *g_pat)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
/* Unlike Intel, AMD takes the guest's CR0.CD into account.
*
* AMD does not have IPAT. To emulate it for the case of guests
* with no assigned devices, just set everything to WB. If guests
* have assigned devices, however, we cannot force WB for RAM
* pages only, so use the guest PAT directly.
*/
if (!kvm_arch_has_assigned_device(vcpu->kvm))
*g_pat = 0x0606060606060606;
else
*g_pat = vcpu->arch.pat;
}
static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
u8 mtrr;
/*
* 1. MMIO: trust guest MTRR, so same as item 3.
* 2. No passthrough: always map as WB, and force guest PAT to WB as well
* 3. Passthrough: can't guarantee the result, try to trust guest.
*/
if (!is_mmio && !kvm_arch_has_assigned_device(vcpu->kvm))
return 0;
mtrr = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
return mtrr2protval[mtrr];
}
static void init_vmcb(struct vcpu_svm *svm, bool init_event)
{
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
svm->vcpu.fpu_active = 1;
svm->vcpu.arch.hflags = 0;
set_cr_intercept(svm, INTERCEPT_CR0_READ);
set_cr_intercept(svm, INTERCEPT_CR3_READ);
set_cr_intercept(svm, INTERCEPT_CR4_READ);
set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
set_dr_intercepts(svm);
set_exception_intercept(svm, PF_VECTOR);
set_exception_intercept(svm, UD_VECTOR);
set_exception_intercept(svm, MC_VECTOR);
set_intercept(svm, INTERCEPT_INTR);
set_intercept(svm, INTERCEPT_NMI);
set_intercept(svm, INTERCEPT_SMI);
set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
set_intercept(svm, INTERCEPT_RDPMC);
set_intercept(svm, INTERCEPT_CPUID);
set_intercept(svm, INTERCEPT_INVD);
set_intercept(svm, INTERCEPT_HLT);
set_intercept(svm, INTERCEPT_INVLPG);
set_intercept(svm, INTERCEPT_INVLPGA);
set_intercept(svm, INTERCEPT_IOIO_PROT);
set_intercept(svm, INTERCEPT_MSR_PROT);
set_intercept(svm, INTERCEPT_TASK_SWITCH);
set_intercept(svm, INTERCEPT_SHUTDOWN);
set_intercept(svm, INTERCEPT_VMRUN);
set_intercept(svm, INTERCEPT_VMMCALL);
set_intercept(svm, INTERCEPT_VMLOAD);
set_intercept(svm, INTERCEPT_VMSAVE);
set_intercept(svm, INTERCEPT_STGI);
set_intercept(svm, INTERCEPT_CLGI);
set_intercept(svm, INTERCEPT_SKINIT);
set_intercept(svm, INTERCEPT_WBINVD);
set_intercept(svm, INTERCEPT_MONITOR);
set_intercept(svm, INTERCEPT_MWAIT);
set_intercept(svm, INTERCEPT_XSETBV);
control->iopm_base_pa = iopm_base;
control->msrpm_base_pa = __pa(svm->msrpm);
control->int_ctl = V_INTR_MASKING_MASK;
init_seg(&save->es);
init_seg(&save->ss);
init_seg(&save->ds);
init_seg(&save->fs);
init_seg(&save->gs);
save->cs.selector = 0xf000;
save->cs.base = 0xffff0000;
/* Executable/Readable Code Segment */
save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
save->cs.limit = 0xffff;
save->gdtr.limit = 0xffff;
save->idtr.limit = 0xffff;
init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
if (!init_event)
svm_set_efer(&svm->vcpu, 0);
save->dr6 = 0xffff0ff0;
kvm_set_rflags(&svm->vcpu, 2);
save->rip = 0x0000fff0;
svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
/*
* svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
* It also updates the guest-visible cr0 value.
*/
(void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
save->cr4 = X86_CR4_PAE;
/* rdx = ?? */
if (npt_enabled) {
/* Setup VMCB for Nested Paging */
control->nested_ctl = 1;
clr_intercept(svm, INTERCEPT_INVLPG);
clr_exception_intercept(svm, PF_VECTOR);
clr_cr_intercept(svm, INTERCEPT_CR3_READ);
clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
save->g_pat = svm->vcpu.arch.pat;
svm_set_guest_pat(svm, &save->g_pat);
save->cr3 = 0;
save->cr4 = 0;
}
svm->asid_generation = 0;
svm->nested.vmcb = 0;
svm->vcpu.arch.hflags = 0;
if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
control->pause_filter_count = 3000;
set_intercept(svm, INTERCEPT_PAUSE);
}
mark_all_dirty(svm->vmcb);
enable_gif(svm);
}
static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_svm *svm = to_svm(vcpu);
u32 dummy;
u32 eax = 1;
if (!init_event) {
svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
MSR_IA32_APICBASE_ENABLE;
if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
}
init_vmcb(svm, init_event);
kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
}
static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
{
struct vcpu_svm *svm;
struct page *page;
struct page *msrpm_pages;
struct page *hsave_page;
struct page *nested_msrpm_pages;
int err;
svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
if (!svm) {
err = -ENOMEM;
goto out;
}
svm->tsc_ratio = TSC_RATIO_DEFAULT;
err = kvm_vcpu_init(&svm->vcpu, kvm, id);
if (err)
goto free_svm;
err = -ENOMEM;
page = alloc_page(GFP_KERNEL);
if (!page)
goto uninit;
msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
if (!msrpm_pages)
goto free_page1;
nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
if (!nested_msrpm_pages)
goto free_page2;
hsave_page = alloc_page(GFP_KERNEL);
if (!hsave_page)
goto free_page3;
svm->nested.hsave = page_address(hsave_page);
svm->msrpm = page_address(msrpm_pages);
svm_vcpu_init_msrpm(svm->msrpm);
svm->nested.msrpm = page_address(nested_msrpm_pages);
svm_vcpu_init_msrpm(svm->nested.msrpm);
svm->vmcb = page_address(page);
clear_page(svm->vmcb);
svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
svm->asid_generation = 0;
init_vmcb(svm, false);
svm_init_osvw(&svm->vcpu);
return &svm->vcpu;
free_page3:
__free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
free_page2:
__free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
free_page1:
__free_page(page);
uninit:
kvm_vcpu_uninit(&svm->vcpu);
free_svm:
kmem_cache_free(kvm_vcpu_cache, svm);
out:
return ERR_PTR(err);
}
static void svm_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
__free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
__free_page(virt_to_page(svm->nested.hsave));
__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, svm);
}
static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
int i;
if (unlikely(cpu != vcpu->cpu)) {
svm->asid_generation = 0;
mark_all_dirty(svm->vmcb);
}
#ifdef CONFIG_X86_64
rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
#endif
savesegment(fs, svm->host.fs);
savesegment(gs, svm->host.gs);
svm->host.ldt = kvm_read_ldt();
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
if (static_cpu_has(X86_FEATURE_TSCRATEMSR) &&
svm->tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
__this_cpu_write(current_tsc_ratio, svm->tsc_ratio);
wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio);
}
}
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
int i;
++vcpu->stat.host_state_reload;
kvm_load_ldt(svm->host.ldt);
#ifdef CONFIG_X86_64
loadsegment(fs, svm->host.fs);
wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
load_gs_index(svm->host.gs);
#else
#ifdef CONFIG_X86_32_LAZY_GS
loadsegment(gs, svm->host.gs);
#endif
#endif
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
}
static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
{
return to_svm(vcpu)->vmcb->save.rflags;
}
static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
/*
* Any change of EFLAGS.VM is accompained by a reload of SS
* (caused by either a task switch or an inter-privilege IRET),
* so we do not need to update the CPL here.
*/
to_svm(vcpu)->vmcb->save.rflags = rflags;
}
static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
switch (reg) {
case VCPU_EXREG_PDPTR:
BUG_ON(!npt_enabled);
load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
break;
default:
BUG();
}
}
static void svm_set_vintr(struct vcpu_svm *svm)
{
set_intercept(svm, INTERCEPT_VINTR);
}
static void svm_clear_vintr(struct vcpu_svm *svm)
{
clr_intercept(svm, INTERCEPT_VINTR);
}
static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
{
struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
switch (seg) {
case VCPU_SREG_CS: return &save->cs;
case VCPU_SREG_DS: return &save->ds;
case VCPU_SREG_ES: return &save->es;
case VCPU_SREG_FS: return &save->fs;
case VCPU_SREG_GS: return &save->gs;
case VCPU_SREG_SS: return &save->ss;
case VCPU_SREG_TR: return &save->tr;
case VCPU_SREG_LDTR: return &save->ldtr;
}
BUG();
return NULL;
}
static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
struct vmcb_seg *s = svm_seg(vcpu, seg);
return s->base;
}
static void svm_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
struct vmcb_seg *s = svm_seg(vcpu, seg);
var->base = s->base;
var->limit = s->limit;
var->selector = s->selector;
var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
/*
* AMD CPUs circa 2014 track the G bit for all segments except CS.
* However, the SVM spec states that the G bit is not observed by the
* CPU, and some VMware virtual CPUs drop the G bit for all segments.
* So let's synthesize a legal G bit for all segments, this helps
* running KVM nested. It also helps cross-vendor migration, because
* Intel's vmentry has a check on the 'G' bit.
*/
var->g = s->limit > 0xfffff;
/*
* AMD's VMCB does not have an explicit unusable field, so emulate it
* for cross vendor migration purposes by "not present"
*/
var->unusable = !var->present || (var->type == 0);
switch (seg) {
case VCPU_SREG_TR:
/*
* Work around a bug where the busy flag in the tr selector
* isn't exposed
*/
var->type |= 0x2;
break;
case VCPU_SREG_DS:
case VCPU_SREG_ES:
case VCPU_SREG_FS:
case VCPU_SREG_GS:
/*
* The accessed bit must always be set in the segment
* descriptor cache, although it can be cleared in the
* descriptor, the cached bit always remains at 1. Since
* Intel has a check on this, set it here to support
* cross-vendor migration.
*/
if (!var->unusable)
var->type |= 0x1;
break;
case VCPU_SREG_SS:
/*
* On AMD CPUs sometimes the DB bit in the segment
* descriptor is left as 1, although the whole segment has
* been made unusable. Clear it here to pass an Intel VMX
* entry check when cross vendor migrating.
*/
if (var->unusable)
var->db = 0;
var->dpl = to_svm(vcpu)->vmcb->save.cpl;
break;
}
}
static int svm_get_cpl(struct kvm_vcpu *vcpu)
{
struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
return save->cpl;
}
static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);
dt->size = svm->vmcb->save.idtr.limit;
dt->address = svm->vmcb->save.idtr.base;
}
static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->save.idtr.limit = dt->size;
svm->vmcb->save.idtr.base = dt->address ;
mark_dirty(svm->vmcb, VMCB_DT);
}
static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);
dt->size = svm->vmcb->save.gdtr.limit;
dt->address = svm->vmcb->save.gdtr.base;
}
static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->save.gdtr.limit = dt->size;
svm->vmcb->save.gdtr.base = dt->address ;
mark_dirty(svm->vmcb, VMCB_DT);
}
static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
{
}
static void svm_decache_cr3(struct kvm_vcpu *vcpu)
{
}
static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
}
static void update_cr0_intercept(struct vcpu_svm *svm)
{
ulong gcr0 = svm->vcpu.arch.cr0;
u64 *hcr0 = &svm->vmcb->save.cr0;
if (!svm->vcpu.fpu_active)
*hcr0 |= SVM_CR0_SELECTIVE_MASK;
else
*hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
| (gcr0 & SVM_CR0_SELECTIVE_MASK);
mark_dirty(svm->vmcb, VMCB_CR);
if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
clr_cr_intercept(svm, INTERCEPT_CR0_READ);
clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
} else {
set_cr_intercept(svm, INTERCEPT_CR0_READ);
set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
}
}
static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
#ifdef CONFIG_X86_64
if (vcpu->arch.efer & EFER_LME) {
if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
vcpu->arch.efer |= EFER_LMA;
svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
}
if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
vcpu->arch.efer &= ~EFER_LMA;
svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
}
}
#endif
vcpu->arch.cr0 = cr0;
if (!npt_enabled)
cr0 |= X86_CR0_PG | X86_CR0_WP;
if (!vcpu->fpu_active)
cr0 |= X86_CR0_TS;
/*
* re-enable caching here because the QEMU bios
* does not do it - this results in some delay at
* reboot
*/
if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
svm->vmcb->save.cr0 = cr0;
mark_dirty(svm->vmcb, VMCB_CR);
update_cr0_intercept(svm);
}
static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
if (cr4 & X86_CR4_VMXE)
return 1;
if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
svm_flush_tlb(vcpu);
vcpu->arch.cr4 = cr4;
if (!npt_enabled)
cr4 |= X86_CR4_PAE;
cr4 |= host_cr4_mce;
to_svm(vcpu)->vmcb->save.cr4 = cr4;
mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
return 0;
}
static void svm_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb_seg *s = svm_seg(vcpu, seg);
s->base = var->base;
s->limit = var->limit;
s->selector = var->selector;
if (var->unusable)
s->attrib = 0;
else {
s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
s->attrib |= (var->present & 1) << SVM_SELECTOR_P_SHIFT;
s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
}
/*
* This is always accurate, except if SYSRET returned to a segment
* with SS.DPL != 3. Intel does not have this quirk, and always
* forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
* would entail passing the CPL to userspace and back.
*/
if (seg == VCPU_SREG_SS)
svm->vmcb->save.cpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
mark_dirty(svm->vmcb, VMCB_SEG);
}
static void update_db_bp_intercept(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
clr_exception_intercept(svm, DB_VECTOR);
clr_exception_intercept(svm, BP_VECTOR);
if (svm->nmi_singlestep)
set_exception_intercept(svm, DB_VECTOR);
if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
if (vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
set_exception_intercept(svm, DB_VECTOR);
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
set_exception_intercept(svm, BP_VECTOR);
} else
vcpu->guest_debug = 0;
}
static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
{
if (sd->next_asid > sd->max_asid) {
++sd->asid_generation;
sd->next_asid = 1;
svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
}
svm->asid_generation = sd->asid_generation;
svm->vmcb->control.asid = sd->next_asid++;
mark_dirty(svm->vmcb, VMCB_ASID);
}
static u64 svm_get_dr6(struct kvm_vcpu *vcpu)
{
return to_svm(vcpu)->vmcb->save.dr6;
}
static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
{
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->save.dr6 = value;
mark_dirty(svm->vmcb, VMCB_DR);
}
static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
get_debugreg(vcpu->arch.db[0], 0);
get_debugreg(vcpu->arch.db[1], 1);
get_debugreg(vcpu->arch.db[2], 2);
get_debugreg(vcpu->arch.db[3], 3);
vcpu->arch.dr6 = svm_get_dr6(vcpu);
vcpu->arch.dr7 = svm->vmcb->save.dr7;
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
set_dr_intercepts(svm);
}
static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
{
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->save.dr7 = value;
mark_dirty(svm->vmcb, VMCB_DR);
}
static int pf_interception(struct vcpu_svm *svm)
{
u64 fault_address = svm->vmcb->control.exit_info_2;
u32 error_code;
int r = 1;
switch (svm->apf_reason) {
default:
error_code = svm->vmcb->control.exit_info_1;
trace_kvm_page_fault(fault_address, error_code);
if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
svm->vmcb->control.insn_bytes,
svm->vmcb->control.insn_len);
break;
case KVM_PV_REASON_PAGE_NOT_PRESENT:
svm->apf_reason = 0;
local_irq_disable();
kvm_async_pf_task_wait(fault_address);
local_irq_enable();
break;
case KVM_PV_REASON_PAGE_READY:
svm->apf_reason = 0;
local_irq_disable();
kvm_async_pf_task_wake(fault_address);
local_irq_enable();
break;
}
return r;
}
static int db_interception(struct vcpu_svm *svm)
{
struct kvm_run *kvm_run = svm->vcpu.run;
if (!(svm->vcpu.guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
!svm->nmi_singlestep) {
kvm_queue_exception(&svm->vcpu, DB_VECTOR);
return 1;
}
if (svm->nmi_singlestep) {
svm->nmi_singlestep = false;
if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
svm->vmcb->save.rflags &=
~(X86_EFLAGS_TF | X86_EFLAGS_RF);
update_db_bp_intercept(&svm->vcpu);
}
if (svm->vcpu.guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
kvm_run->exit_reason = KVM_EXIT_DEBUG;
kvm_run->debug.arch.pc =
svm->vmcb->save.cs.base + svm->vmcb->save.rip;
kvm_run->debug.arch.exception = DB_VECTOR;
return 0;
}
return 1;
}
static int bp_interception(struct vcpu_svm *svm)
{
struct kvm_run *kvm_run = svm->vcpu.run;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
kvm_run->debug.arch.exception = BP_VECTOR;
return 0;
}
static int ud_interception(struct vcpu_svm *svm)
{
int er;
er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
if (er != EMULATE_DONE)
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
}
static void svm_fpu_activate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
clr_exception_intercept(svm, NM_VECTOR);
svm->vcpu.fpu_active = 1;
update_cr0_intercept(svm);
}
static int nm_interception(struct vcpu_svm *svm)
{
svm_fpu_activate(&svm->vcpu);
return 1;
}
static bool is_erratum_383(void)
{
int err, i;
u64 value;
if (!erratum_383_found)
return false;
value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
if (err)
return false;
/* Bit 62 may or may not be set for this mce */
value &= ~(1ULL << 62);
if (value != 0xb600000000010015ULL)
return false;
/* Clear MCi_STATUS registers */
for (i = 0; i < 6; ++i)
native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
if (!err) {
u32 low, high;
value &= ~(1ULL << 2);
low = lower_32_bits(value);
high = upper_32_bits(value);
native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
}
/* Flush tlb to evict multi-match entries */
__flush_tlb_all();
return true;
}
static void svm_handle_mce(struct vcpu_svm *svm)
{
if (is_erratum_383()) {
/*
* Erratum 383 triggered. Guest state is corrupt so kill the
* guest.
*/
pr_err("KVM: Guest triggered AMD Erratum 383\n");
kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
return;
}
/*
* On an #MC intercept the MCE handler is not called automatically in
* the host. So do it by hand here.
*/
asm volatile (
"int $0x12\n");
/* not sure if we ever come back to this point */
return;
}
static int mc_interception(struct vcpu_svm *svm)
{
return 1;
}
static int shutdown_interception(struct vcpu_svm *svm)
{
struct kvm_run *kvm_run = svm->vcpu.run;
/*
* VMCB is undefined after a SHUTDOWN intercept
* so reinitialize it.
*/
clear_page(svm->vmcb);
init_vmcb(svm, false);
kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
return 0;
}
static int io_interception(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
int size, in, string;
unsigned port;
++svm->vcpu.stat.io_exits;
string = (io_info & SVM_IOIO_STR_MASK) != 0;
in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
if (string || in)
return emulate_instruction(vcpu, 0) == EMULATE_DONE;
port = io_info >> 16;
size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
svm->next_rip = svm->vmcb->control.exit_info_2;
skip_emulated_instruction(&svm->vcpu);
return kvm_fast_pio_out(vcpu, size, port);
}
static int nmi_interception(struct vcpu_svm *svm)
{
return 1;
}
static int intr_interception(struct vcpu_svm *svm)
{
++svm->vcpu.stat.irq_exits;
return 1;
}
static int nop_on_interception(struct vcpu_svm *svm)
{
return 1;
}
static int halt_interception(struct vcpu_svm *svm)
{
svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
return kvm_emulate_halt(&svm->vcpu);
}
static int vmmcall_interception(struct vcpu_svm *svm)
{
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
kvm_emulate_hypercall(&svm->vcpu);
return 1;
}
static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
return svm->nested.nested_cr3;
}
static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
{
struct vcpu_svm *svm = to_svm(vcpu);
u64 cr3 = svm->nested.nested_cr3;
u64 pdpte;
int ret;
ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
offset_in_page(cr3) + index * 8, 8);
if (ret)
return 0;
return pdpte;
}
static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
unsigned long root)
{
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.nested_cr3 = root;
mark_dirty(svm->vmcb, VMCB_NPT);
svm_flush_tlb(vcpu);
}
static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
{
struct vcpu_svm *svm = to_svm(vcpu);
if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
/*
* TODO: track the cause of the nested page fault, and
* correctly fill in the high bits of exit_info_1.
*/
svm->vmcb->control.exit_code = SVM_EXIT_NPF;
svm->vmcb->control.exit_code_hi = 0;
svm->vmcb->control.exit_info_1 = (1ULL << 32);
svm->vmcb->control.exit_info_2 = fault->address;
}
svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
svm->vmcb->control.exit_info_1 |= fault->error_code;
/*
* The present bit is always zero for page structure faults on real
* hardware.
*/
if (svm->vmcb->control.exit_info_1 & (2ULL << 32))
svm->vmcb->control.exit_info_1 &= ~1;
nested_svm_vmexit(svm);
}
static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
{
WARN_ON(mmu_is_nested(vcpu));
kvm_init_shadow_mmu(vcpu);
vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr;
vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
vcpu->arch.mmu.shadow_root_level = get_npt_level();
vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}
static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
vcpu->arch.walk_mmu = &vcpu->arch.mmu;
}
static int nested_svm_check_permissions(struct vcpu_svm *svm)
{
if (!(svm->vcpu.arch.efer & EFER_SVME)
|| !is_paging(&svm->vcpu)) {
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
}
if (svm->vmcb->save.cpl) {
kvm_inject_gp(&svm->vcpu, 0);
return 1;
}
return 0;
}
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code)
{
int vmexit;
if (!is_guest_mode(&svm->vcpu))
return 0;
svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
svm->vmcb->control.exit_code_hi = 0;
svm->vmcb->control.exit_info_1 = error_code;
svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
vmexit = nested_svm_intercept(svm);
if (vmexit == NESTED_EXIT_DONE)
svm->nested.exit_required = true;
return vmexit;
}
/* This function returns true if it is save to enable the irq window */
static inline bool nested_svm_intr(struct vcpu_svm *svm)
{
if (!is_guest_mode(&svm->vcpu))
return true;
if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
return true;
if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
return false;
/*
* if vmexit was already requested (by intercepted exception
* for instance) do not overwrite it with "external interrupt"
* vmexit.
*/
if (svm->nested.exit_required)
return false;
svm->vmcb->control.exit_code = SVM_EXIT_INTR;
svm->vmcb->control.exit_info_1 = 0;
svm->vmcb->control.exit_info_2 = 0;
if (svm->nested.intercept & 1ULL) {
/*
* The #vmexit can't be emulated here directly because this
* code path runs with irqs and preemption disabled. A
* #vmexit emulation might sleep. Only signal request for
* the #vmexit here.
*/
svm->nested.exit_required = true;
trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
return false;
}
return true;
}
/* This function returns true if it is save to enable the nmi window */
static inline bool nested_svm_nmi(struct vcpu_svm *svm)
{
if (!is_guest_mode(&svm->vcpu))
return true;
if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
return true;
svm->vmcb->control.exit_code = SVM_EXIT_NMI;
svm->nested.exit_required = true;
return false;
}
static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
{
struct page *page;
might_sleep();
page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT);
if (is_error_page(page))
goto error;
*_page = page;
return kmap(page);
error:
kvm_inject_gp(&svm->vcpu, 0);
return NULL;
}
static void nested_svm_unmap(struct page *page)
{
kunmap(page);
kvm_release_page_dirty(page);
}
static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
{
unsigned port, size, iopm_len;
u16 val, mask;
u8 start_bit;
u64 gpa;
if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
return NESTED_EXIT_HOST;
port = svm->vmcb->control.exit_info_1 >> 16;
size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
SVM_IOIO_SIZE_SHIFT;
gpa = svm->nested.vmcb_iopm + (port / 8);
start_bit = port % 8;
iopm_len = (start_bit + size > 8) ? 2 : 1;
mask = (0xf >> (4 - size)) << start_bit;
val = 0;
if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
return NESTED_EXIT_DONE;
return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}
static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
{
u32 offset, msr, value;
int write, mask;
if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
return NESTED_EXIT_HOST;
msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
offset = svm_msrpm_offset(msr);
write = svm->vmcb->control.exit_info_1 & 1;
mask = 1 << ((2 * (msr & 0xf)) + write);
if (offset == MSR_INVALID)
return NESTED_EXIT_DONE;
/* Offset is in 32 bit units but need in 8 bit units */
offset *= 4;
if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4))
return NESTED_EXIT_DONE;
return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}
static int nested_svm_exit_special(struct vcpu_svm *svm)
{
u32 exit_code = svm->vmcb->control.exit_code;
switch (exit_code) {
case SVM_EXIT_INTR:
case SVM_EXIT_NMI:
case SVM_EXIT_EXCP_BASE + MC_VECTOR:
return NESTED_EXIT_HOST;
case SVM_EXIT_NPF:
/* For now we are always handling NPFs when using them */
if (npt_enabled)
return NESTED_EXIT_HOST;
break;
case SVM_EXIT_EXCP_BASE + PF_VECTOR:
/* When we're shadowing, trap PFs, but not async PF */
if (!npt_enabled && svm->apf_reason == 0)
return NESTED_EXIT_HOST;
break;
case SVM_EXIT_EXCP_BASE + NM_VECTOR:
nm_interception(svm);
break;
default:
break;
}
return NESTED_EXIT_CONTINUE;
}
/*
* If this function returns true, this #vmexit was already handled
*/
static int nested_svm_intercept(struct vcpu_svm *svm)
{
u32 exit_code = svm->vmcb->control.exit_code;
int vmexit = NESTED_EXIT_HOST;
switch (exit_code) {
case SVM_EXIT_MSR:
vmexit = nested_svm_exit_handled_msr(svm);
break;
case SVM_EXIT_IOIO:
vmexit = nested_svm_intercept_ioio(svm);
break;
case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
if (svm->nested.intercept_cr & bit)
vmexit = NESTED_EXIT_DONE;
break;
}
case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
if (svm->nested.intercept_dr & bit)
vmexit = NESTED_EXIT_DONE;
break;
}
case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
if (svm->nested.intercept_exceptions & excp_bits)
vmexit = NESTED_EXIT_DONE;
/* async page fault always cause vmexit */
else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
svm->apf_reason != 0)
vmexit = NESTED_EXIT_DONE;
break;
}
case SVM_EXIT_ERR: {
vmexit = NESTED_EXIT_DONE;
break;
}
default: {
u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
if (svm->nested.intercept & exit_bits)
vmexit = NESTED_EXIT_DONE;
}
}
return vmexit;
}
static int nested_svm_exit_handled(struct vcpu_svm *svm)
{
int vmexit;
vmexit = nested_svm_intercept(svm);
if (vmexit == NESTED_EXIT_DONE)
nested_svm_vmexit(svm);
return vmexit;
}
static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
{
struct vmcb_control_area *dst = &dst_vmcb->control;
struct vmcb_control_area *from = &from_vmcb->control;
dst->intercept_cr = from->intercept_cr;
dst->intercept_dr = from->intercept_dr;
dst->intercept_exceptions = from->intercept_exceptions;
dst->intercept = from->intercept;
dst->iopm_base_pa = from->iopm_base_pa;
dst->msrpm_base_pa = from->msrpm_base_pa;
dst->tsc_offset = from->tsc_offset;
dst->asid = from->asid;
dst->tlb_ctl = from->tlb_ctl;
dst->int_ctl = from->int_ctl;
dst->int_vector = from->int_vector;
dst->int_state = from->int_state;
dst->exit_code = from->exit_code;
dst->exit_code_hi = from->exit_code_hi;
dst->exit_info_1 = from->exit_info_1;
dst->exit_info_2 = from->exit_info_2;
dst->exit_int_info = from->exit_int_info;
dst->exit_int_info_err = from->exit_int_info_err;
dst->nested_ctl = from->nested_ctl;
dst->event_inj = from->event_inj;
dst->event_inj_err = from->event_inj_err;
dst->nested_cr3 = from->nested_cr3;
dst->lbr_ctl = from->lbr_ctl;
}
static int nested_svm_vmexit(struct vcpu_svm *svm)
{
struct vmcb *nested_vmcb;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
struct page *page;
trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
vmcb->control.exit_info_1,
vmcb->control.exit_info_2,
vmcb->control.exit_int_info,
vmcb->control.exit_int_info_err,
KVM_ISA_SVM);
nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
if (!nested_vmcb)
return 1;
/* Exit Guest-Mode */
leave_guest_mode(&svm->vcpu);
svm->nested.vmcb = 0;
/* Give the current vmcb to the guest */
disable_gif(svm);
nested_vmcb->save.es = vmcb->save.es;
nested_vmcb->save.cs = vmcb->save.cs;
nested_vmcb->save.ss = vmcb->save.ss;
nested_vmcb->save.ds = vmcb->save.ds;
nested_vmcb->save.gdtr = vmcb->save.gdtr;
nested_vmcb->save.idtr = vmcb->save.idtr;
nested_vmcb->save.efer = svm->vcpu.arch.efer;
nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
nested_vmcb->save.cr2 = vmcb->save.cr2;
nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
nested_vmcb->save.rip = vmcb->save.rip;
nested_vmcb->save.rsp = vmcb->save.rsp;
nested_vmcb->save.rax = vmcb->save.rax;
nested_vmcb->save.dr7 = vmcb->save.dr7;
nested_vmcb->save.dr6 = vmcb->save.dr6;
nested_vmcb->save.cpl = vmcb->save.cpl;
nested_vmcb->control.int_ctl = vmcb->control.int_ctl;
nested_vmcb->control.int_vector = vmcb->control.int_vector;
nested_vmcb->control.int_state = vmcb->control.int_state;
nested_vmcb->control.exit_code = vmcb->control.exit_code;
nested_vmcb->control.exit_code_hi = vmcb->control.exit_code_hi;
nested_vmcb->control.exit_info_1 = vmcb->control.exit_info_1;
nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
nested_vmcb->control.next_rip = vmcb->control.next_rip;
/*
* If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
* to make sure that we do not lose injected events. So check event_inj
* here and copy it to exit_int_info if it is valid.
* Exit_int_info and event_inj can't be both valid because the case
* below only happens on a VMRUN instruction intercept which has
* no valid exit_int_info set.
*/
if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
struct vmcb_control_area *nc = &nested_vmcb->control;
nc->exit_int_info = vmcb->control.event_inj;
nc->exit_int_info_err = vmcb->control.event_inj_err;
}
nested_vmcb->control.tlb_ctl = 0;
nested_vmcb->control.event_inj = 0;
nested_vmcb->control.event_inj_err = 0;
/* We always set V_INTR_MASKING and remember the old value in hflags */
if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
/* Restore the original control entries */
copy_vmcb_control_area(vmcb, hsave);
kvm_clear_exception_queue(&svm->vcpu);
kvm_clear_interrupt_queue(&svm->vcpu);
svm->nested.nested_cr3 = 0;
/* Restore selected save entries */
svm->vmcb->save.es = hsave->save.es;
svm->vmcb->save.cs = hsave->save.cs;
svm->vmcb->save.ss = hsave->save.ss;
svm->vmcb->save.ds = hsave->save.ds;
svm->vmcb->save.gdtr = hsave->save.gdtr;
svm->vmcb->save.idtr = hsave->save.idtr;
kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
svm_set_efer(&svm->vcpu, hsave->save.efer);
svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
svm_set_cr4(&svm->vcpu, hsave->save.cr4);
if (npt_enabled) {
svm->vmcb->save.cr3 = hsave->save.cr3;
svm->vcpu.arch.cr3 = hsave->save.cr3;
} else {
(void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
}
kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
svm->vmcb->save.dr7 = 0;
svm->vmcb->save.cpl = 0;
svm->vmcb->control.exit_int_info = 0;
mark_all_dirty(svm->vmcb);
nested_svm_unmap(page);
nested_svm_uninit_mmu_context(&svm->vcpu);
kvm_mmu_reset_context(&svm->vcpu);
kvm_mmu_load(&svm->vcpu);
return 0;
}
static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
{
/*
* This function merges the msr permission bitmaps of kvm and the
* nested vmcb. It is optimized in that it only merges the parts where
* the kvm msr permission bitmap may contain zero bits
*/
int i;
if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
return true;
for (i = 0; i < MSRPM_OFFSETS; i++) {
u32 value, p;
u64 offset;
if (msrpm_offsets[i] == 0xffffffff)
break;
p = msrpm_offsets[i];
offset = svm->nested.vmcb_msrpm + (p * 4);
if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
return false;
svm->nested.msrpm[p] = svm->msrpm[p] | value;
}
svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
return true;
}
static bool nested_vmcb_checks(struct vmcb *vmcb)
{
if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
return false;
if (vmcb->control.asid == 0)
return false;
if (vmcb->control.nested_ctl && !npt_enabled)
return false;
return true;
}
static bool nested_svm_vmrun(struct vcpu_svm *svm)
{
struct vmcb *nested_vmcb;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
struct page *page;
u64 vmcb_gpa;
vmcb_gpa = svm->vmcb->save.rax;
nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
if (!nested_vmcb)
return false;
if (!nested_vmcb_checks(nested_vmcb)) {
nested_vmcb->control.exit_code = SVM_EXIT_ERR;
nested_vmcb->control.exit_code_hi = 0;
nested_vmcb->control.exit_info_1 = 0;
nested_vmcb->control.exit_info_2 = 0;
nested_svm_unmap(page);
return false;
}
trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
nested_vmcb->save.rip,
nested_vmcb->control.int_ctl,
nested_vmcb->control.event_inj,
nested_vmcb->control.nested_ctl);
trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
nested_vmcb->control.intercept_cr >> 16,
nested_vmcb->control.intercept_exceptions,
nested_vmcb->control.intercept);
/* Clear internal status */
kvm_clear_exception_queue(&svm->vcpu);
kvm_clear_interrupt_queue(&svm->vcpu);
/*
* Save the old vmcb, so we don't need to pick what we save, but can
* restore everything when a VMEXIT occurs
*/
hsave->save.es = vmcb->save.es;
hsave->save.cs = vmcb->save.cs;
hsave->save.ss = vmcb->save.ss;
hsave->save.ds = vmcb->save.ds;
hsave->save.gdtr = vmcb->save.gdtr;
hsave->save.idtr = vmcb->save.idtr;
hsave->save.efer = svm->vcpu.arch.efer;
hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
hsave->save.cr4 = svm->vcpu.arch.cr4;
hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
hsave->save.rip = kvm_rip_read(&svm->vcpu);
hsave->save.rsp = vmcb->save.rsp;
hsave->save.rax = vmcb->save.rax;
if (npt_enabled)
hsave->save.cr3 = vmcb->save.cr3;
else
hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
copy_vmcb_control_area(hsave, vmcb);
if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
svm->vcpu.arch.hflags |= HF_HIF_MASK;
else
svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
if (nested_vmcb->control.nested_ctl) {
kvm_mmu_unload(&svm->vcpu);
svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
nested_svm_init_mmu_context(&svm->vcpu);
}
/* Load the nested guest state */
svm->vmcb->save.es = nested_vmcb->save.es;
svm->vmcb->save.cs = nested_vmcb->save.cs;
svm->vmcb->save.ss = nested_vmcb->save.ss;
svm->vmcb->save.ds = nested_vmcb->save.ds;
svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
svm->vmcb->save.idtr = nested_vmcb->save.idtr;
kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
if (npt_enabled) {
svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
} else
(void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
/* Guest paging mode is active - reset mmu */
kvm_mmu_reset_context(&svm->vcpu);
svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
/* In case we don't even reach vcpu_run, the fields are not updated */
svm->vmcb->save.rax = nested_vmcb->save.rax;
svm->vmcb->save.rsp = nested_vmcb->save.rsp;
svm->vmcb->save.rip = nested_vmcb->save.rip;
svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
svm->vmcb->save.cpl = nested_vmcb->save.cpl;
svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
/* cache intercepts */
svm->nested.intercept_cr = nested_vmcb->control.intercept_cr;
svm->nested.intercept_dr = nested_vmcb->control.intercept_dr;
svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
svm->nested.intercept = nested_vmcb->control.intercept;
svm_flush_tlb(&svm->vcpu);
svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
svm->vcpu.arch.hflags |= HF_VINTR_MASK;
else
svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
/* We only want the cr8 intercept bits of the guest */
clr_cr_intercept(svm, INTERCEPT_CR8_READ);
clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
}
/* We don't want to see VMMCALLs from a nested guest */
clr_intercept(svm, INTERCEPT_VMMCALL);
svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
svm->vmcb->control.int_state = nested_vmcb->control.int_state;
svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
nested_svm_unmap(page);
/* Enter Guest-Mode */
enter_guest_mode(&svm->vcpu);
/*