| From be974341d069d90af944dd253d8987c63029988e Mon Sep 17 00:00:00 2001 |
| From: Sean Christopherson <sean.j.christopherson@intel.com> |
| Date: Sat, 21 Mar 2020 12:37:49 -0700 |
| Subject: [PATCH] KVM: VMX: Always VMCLEAR in-use VMCSes during crash with |
| kexec support |
| |
| commit 31603d4fc2bb4f0815245d496cb970b27b4f636a upstream. |
| |
| VMCLEAR all in-use VMCSes during a crash, even if kdump's NMI shootdown |
| interrupted a KVM update of the percpu in-use VMCS list. |
| |
| Because NMIs are not blocked by disabling IRQs, it's possible that |
| crash_vmclear_local_loaded_vmcss() could be called while the percpu list |
| of VMCSes is being modified, e.g. in the middle of list_add() in |
| vmx_vcpu_load_vmcs(). This potential corner case was called out in the |
| original commit[*], but the analysis of its impact was wrong. |
| |
| Skipping the VMCLEARs is wrong because it all but guarantees that a |
| loaded, and therefore cached, VMCS will live across kexec and corrupt |
| memory in the new kernel. Corruption will occur because the CPU's VMCS |
| cache is non-coherent, i.e. not snooped, and so the writeback of VMCS |
| memory on its eviction will overwrite random memory in the new kernel. |
| The VMCS will live because the NMI shootdown also disables VMX, i.e. the |
| in-progress VMCLEAR will #UD, and existing Intel CPUs do not flush the |
| VMCS cache on VMXOFF. |
| |
| Furthermore, interrupting list_add() and list_del() is safe due to |
| crash_vmclear_local_loaded_vmcss() using forward iteration. list_add() |
| ensures the new entry is not visible to forward iteration unless the |
| entire add completes, via WRITE_ONCE(prev->next, new). A bad "prev" |
| pointer could be observed if the NMI shootdown interrupted list_del() or |
| list_add(), but list_for_each_entry() does not consume ->prev. |
| |
| In addition to removing the temporary disabling of VMCLEAR, open code |
| loaded_vmcs_init() in __loaded_vmcs_clear() and reorder VMCLEAR so that |
| the VMCS is deleted from the list only after it's been VMCLEAR'd. |
| Deleting the VMCS before VMCLEAR would allow a race where the NMI |
| shootdown could arrive between list_del() and vmcs_clear() and thus |
| neither flow would execute a successful VMCLEAR. Alternatively, more |
| code could be moved into loaded_vmcs_init(), but that gets rather silly |
| as the only other user, alloc_loaded_vmcs(), doesn't need the smp_wmb() |
| and would need to work around the list_del(). |
| |
| Update the smp_*() comments related to the list manipulation, and |
| opportunistically reword them to improve clarity. |
| |
| [*] https://patchwork.kernel.org/patch/1675731/#3720461 |
| |
| Fixes: 8f536b7697a0 ("KVM: VMX: provide the vmclear function and a bitmap to support VMCLEAR in kdump") |
| Cc: stable@vger.kernel.org |
| Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com> |
| Message-Id: <20200321193751.24985-2-sean.j.christopherson@intel.com> |
| Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com> |
| Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> |
| Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> |
| |
| diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c |
| index 3663785262ae..92670007d9bc 100644 |
| --- a/arch/x86/kvm/vmx/vmx.c |
| +++ b/arch/x86/kvm/vmx/vmx.c |
| @@ -575,43 +575,15 @@ void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) |
| } |
| |
| #ifdef CONFIG_KEXEC_CORE |
| -/* |
| - * This bitmap is used to indicate whether the vmclear |
| - * operation is enabled on all cpus. All disabled by |
| - * default. |
| - */ |
| -static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; |
| - |
| -static inline void crash_enable_local_vmclear(int cpu) |
| -{ |
| - cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap); |
| -} |
| - |
| -static inline void crash_disable_local_vmclear(int cpu) |
| -{ |
| - cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap); |
| -} |
| - |
| -static inline int crash_local_vmclear_enabled(int cpu) |
| -{ |
| - return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap); |
| -} |
| - |
| static void crash_vmclear_local_loaded_vmcss(void) |
| { |
| int cpu = raw_smp_processor_id(); |
| struct loaded_vmcs *v; |
| |
| - if (!crash_local_vmclear_enabled(cpu)) |
| - return; |
| - |
| list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), |
| loaded_vmcss_on_cpu_link) |
| vmcs_clear(v->vmcs); |
| } |
| -#else |
| -static inline void crash_enable_local_vmclear(int cpu) { } |
| -static inline void crash_disable_local_vmclear(int cpu) { } |
| #endif /* CONFIG_KEXEC_CORE */ |
| |
| static void __loaded_vmcs_clear(void *arg) |
| @@ -623,19 +595,24 @@ static void __loaded_vmcs_clear(void *arg) |
| return; /* vcpu migration can race with cpu offline */ |
| if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) |
| per_cpu(current_vmcs, cpu) = NULL; |
| - crash_disable_local_vmclear(cpu); |
| + |
| + vmcs_clear(loaded_vmcs->vmcs); |
| + if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) |
| + vmcs_clear(loaded_vmcs->shadow_vmcs); |
| + |
| list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); |
| |
| /* |
| - * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link |
| - * is before setting loaded_vmcs->vcpu to -1 which is done in |
| - * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist |
| - * then adds the vmcs into percpu list before it is deleted. |
| + * Ensure all writes to loaded_vmcs, including deleting it from its |
| + * current percpu list, complete before setting loaded_vmcs->vcpu to |
| + * -1, otherwise a different cpu can see vcpu == -1 first and add |
| + * loaded_vmcs to its percpu list before it's deleted from this cpu's |
| + * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs(). |
| */ |
| smp_wmb(); |
| |
| - loaded_vmcs_init(loaded_vmcs); |
| - crash_enable_local_vmclear(cpu); |
| + loaded_vmcs->cpu = -1; |
| + loaded_vmcs->launched = 0; |
| } |
| |
| void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) |
| @@ -1229,18 +1206,17 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) |
| if (!already_loaded) { |
| loaded_vmcs_clear(vmx->loaded_vmcs); |
| local_irq_disable(); |
| - crash_disable_local_vmclear(cpu); |
| |
| /* |
| - * Read loaded_vmcs->cpu should be before fetching |
| - * loaded_vmcs->loaded_vmcss_on_cpu_link. |
| - * See the comments in __loaded_vmcs_clear(). |
| + * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to |
| + * this cpu's percpu list, otherwise it may not yet be deleted |
| + * from its previous cpu's percpu list. Pairs with the |
| + * smb_wmb() in __loaded_vmcs_clear(). |
| */ |
| smp_rmb(); |
| |
| list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, |
| &per_cpu(loaded_vmcss_on_cpu, cpu)); |
| - crash_enable_local_vmclear(cpu); |
| local_irq_enable(); |
| } |
| |
| @@ -2112,17 +2088,6 @@ static int hardware_enable(void) |
| INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); |
| spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); |
| |
| - /* |
| - * Now we can enable the vmclear operation in kdump |
| - * since the loaded_vmcss_on_cpu list on this cpu |
| - * has been initialized. |
| - * |
| - * Though the cpu is not in VMX operation now, there |
| - * is no problem to enable the vmclear operation |
| - * for the loaded_vmcss_on_cpu list is empty! |
| - */ |
| - crash_enable_local_vmclear(cpu); |
| - |
| rdmsrl(MSR_IA32_FEATURE_CONTROL, old); |
| |
| test_bits = FEATURE_CONTROL_LOCKED; |
| -- |
| 2.7.4 |
| |