blob: b73f7a9c62ec2e45967967df586f5d97a1b382c2 [file] [log] [blame]
From be974341d069d90af944dd253d8987c63029988e Mon Sep 17 00:00:00 2001
From: Sean Christopherson <>
Date: Sat, 21 Mar 2020 12:37:49 -0700
Subject: [PATCH] KVM: VMX: Always VMCLEAR in-use VMCSes during crash with
kexec support
commit 31603d4fc2bb4f0815245d496cb970b27b4f636a upstream.
VMCLEAR all in-use VMCSes during a crash, even if kdump's NMI shootdown
interrupted a KVM update of the percpu in-use VMCS list.
Because NMIs are not blocked by disabling IRQs, it's possible that
crash_vmclear_local_loaded_vmcss() could be called while the percpu list
of VMCSes is being modified, e.g. in the middle of list_add() in
vmx_vcpu_load_vmcs(). This potential corner case was called out in the
original commit[*], but the analysis of its impact was wrong.
Skipping the VMCLEARs is wrong because it all but guarantees that a
loaded, and therefore cached, VMCS will live across kexec and corrupt
memory in the new kernel. Corruption will occur because the CPU's VMCS
cache is non-coherent, i.e. not snooped, and so the writeback of VMCS
memory on its eviction will overwrite random memory in the new kernel.
The VMCS will live because the NMI shootdown also disables VMX, i.e. the
in-progress VMCLEAR will #UD, and existing Intel CPUs do not flush the
VMCS cache on VMXOFF.
Furthermore, interrupting list_add() and list_del() is safe due to
crash_vmclear_local_loaded_vmcss() using forward iteration. list_add()
ensures the new entry is not visible to forward iteration unless the
entire add completes, via WRITE_ONCE(prev->next, new). A bad "prev"
pointer could be observed if the NMI shootdown interrupted list_del() or
list_add(), but list_for_each_entry() does not consume ->prev.
In addition to removing the temporary disabling of VMCLEAR, open code
loaded_vmcs_init() in __loaded_vmcs_clear() and reorder VMCLEAR so that
the VMCS is deleted from the list only after it's been VMCLEAR'd.
Deleting the VMCS before VMCLEAR would allow a race where the NMI
shootdown could arrive between list_del() and vmcs_clear() and thus
neither flow would execute a successful VMCLEAR. Alternatively, more
code could be moved into loaded_vmcs_init(), but that gets rather silly
as the only other user, alloc_loaded_vmcs(), doesn't need the smp_wmb()
and would need to work around the list_del().
Update the smp_*() comments related to the list manipulation, and
opportunistically reword them to improve clarity.
Fixes: 8f536b7697a0 ("KVM: VMX: provide the vmclear function and a bitmap to support VMCLEAR in kdump")
Signed-off-by: Sean Christopherson <>
Message-Id: <>
Reviewed-by: Vitaly Kuznetsov <>
Signed-off-by: Paolo Bonzini <>
Signed-off-by: Paul Gortmaker <>
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 3663785262ae..92670007d9bc 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -575,43 +575,15 @@ void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
- * This bitmap is used to indicate whether the vmclear
- * operation is enabled on all cpus. All disabled by
- * default.
- */
-static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
-static inline void crash_enable_local_vmclear(int cpu)
- cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
-static inline void crash_disable_local_vmclear(int cpu)
- cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
-static inline int crash_local_vmclear_enabled(int cpu)
- return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
static void crash_vmclear_local_loaded_vmcss(void)
int cpu = raw_smp_processor_id();
struct loaded_vmcs *v;
- if (!crash_local_vmclear_enabled(cpu))
- return;
list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
-static inline void crash_enable_local_vmclear(int cpu) { }
-static inline void crash_disable_local_vmclear(int cpu) { }
#endif /* CONFIG_KEXEC_CORE */
static void __loaded_vmcs_clear(void *arg)
@@ -623,19 +595,24 @@ static void __loaded_vmcs_clear(void *arg)
return; /* vcpu migration can race with cpu offline */
if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
per_cpu(current_vmcs, cpu) = NULL;
- crash_disable_local_vmclear(cpu);
+ vmcs_clear(loaded_vmcs->vmcs);
+ if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
+ vmcs_clear(loaded_vmcs->shadow_vmcs);
- * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
- * is before setting loaded_vmcs->vcpu to -1 which is done in
- * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
- * then adds the vmcs into percpu list before it is deleted.
+ * Ensure all writes to loaded_vmcs, including deleting it from its
+ * current percpu list, complete before setting loaded_vmcs->vcpu to
+ * -1, otherwise a different cpu can see vcpu == -1 first and add
+ * loaded_vmcs to its percpu list before it's deleted from this cpu's
+ * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
- loaded_vmcs_init(loaded_vmcs);
- crash_enable_local_vmclear(cpu);
+ loaded_vmcs->cpu = -1;
+ loaded_vmcs->launched = 0;
void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
@@ -1229,18 +1206,17 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (!already_loaded) {
- crash_disable_local_vmclear(cpu);
- * Read loaded_vmcs->cpu should be before fetching
- * loaded_vmcs->loaded_vmcss_on_cpu_link.
- * See the comments in __loaded_vmcs_clear().
+ * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
+ * this cpu's percpu list, otherwise it may not yet be deleted
+ * from its previous cpu's percpu list. Pairs with the
+ * smb_wmb() in __loaded_vmcs_clear().
&per_cpu(loaded_vmcss_on_cpu, cpu));
- crash_enable_local_vmclear(cpu);
@@ -2112,17 +2088,6 @@ static int hardware_enable(void)
INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
- /*
- * Now we can enable the vmclear operation in kdump
- * since the loaded_vmcss_on_cpu list on this cpu
- * has been initialized.
- *
- * Though the cpu is not in VMX operation now, there
- * is no problem to enable the vmclear operation
- * for the loaded_vmcss_on_cpu list is empty!
- */
- crash_enable_local_vmclear(cpu);
rdmsrl(MSR_IA32_FEATURE_CONTROL, old);