| From 506091ad8b3114372401c3d56c837ee9abd5184e Mon Sep 17 00:00:00 2001 |
| From: Eiichi Tsukata <eiichi.tsukata@nutanix.com> |
| Date: Sat, 6 Jun 2020 13:26:27 +0900 |
| Subject: [PATCH] KVM: x86: Fix APIC page invalidation race |
| |
| commit e649b3f0188f8fd34dd0dde8d43fd3312b902fb2 upstream. |
| |
| Commit b1394e745b94 ("KVM: x86: fix APIC page invalidation") tried |
| to fix inappropriate APIC page invalidation by re-introducing arch |
| specific kvm_arch_mmu_notifier_invalidate_range() and calling it from |
| kvm_mmu_notifier_invalidate_range_start. However, the patch left a |
| possible race where the VMCS APIC address cache is updated *before* |
| it is unmapped: |
| |
| (Invalidator) kvm_mmu_notifier_invalidate_range_start() |
| (Invalidator) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD) |
| (KVM VCPU) vcpu_enter_guest() |
| (KVM VCPU) kvm_vcpu_reload_apic_access_page() |
| (Invalidator) actually unmap page |
| |
| Because of the above race, there can be a mismatch between the |
| host physical address stored in the APIC_ACCESS_PAGE VMCS field and |
| the host physical address stored in the EPT entry for the APIC GPA |
| (0xfee0000). When this happens, the processor will not trap APIC |
| accesses, and will instead show the raw contents of the APIC-access page. |
| Because Windows OS periodically checks for unexpected modifications to |
| the LAPIC register, this will show up as a BSOD crash with BugCheck |
| CRITICAL_STRUCTURE_CORRUPTION (109) we are currently seeing in |
| https://bugzilla.redhat.com/show_bug.cgi?id=1751017. |
| |
| The root cause of the issue is that kvm_arch_mmu_notifier_invalidate_range() |
| cannot guarantee that no additional references are taken to the pages in |
| the range before kvm_mmu_notifier_invalidate_range_end(). Fortunately, |
| this case is supported by the MMU notifier API, as documented in |
| include/linux/mmu_notifier.h: |
| |
| * If the subsystem |
| * can't guarantee that no additional references are taken to |
| * the pages in the range, it has to implement the |
| * invalidate_range() notifier to remove any references taken |
| * after invalidate_range_start(). |
| |
| The fix therefore is to reload the APIC-access page field in the VMCS |
| from kvm_mmu_notifier_invalidate_range() instead of ..._range_start(). |
| |
| Cc: stable@vger.kernel.org |
| Fixes: b1394e745b94 ("KVM: x86: fix APIC page invalidation") |
| Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=197951 |
| Signed-off-by: Eiichi Tsukata <eiichi.tsukata@nutanix.com> |
| Message-Id: <20200606042627.61070-1-eiichi.tsukata@nutanix.com> |
| Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> |
| Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> |
| |
| diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c |
| index c4ab1fb89480..aac5dc1b9910 100644 |
| --- a/arch/x86/kvm/x86.c |
| +++ b/arch/x86/kvm/x86.c |
| @@ -7828,9 +7828,8 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) |
| kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); |
| } |
| |
| -int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, |
| - unsigned long start, unsigned long end, |
| - bool blockable) |
| +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, |
| + unsigned long start, unsigned long end) |
| { |
| unsigned long apic_address; |
| |
| @@ -7841,8 +7840,6 @@ int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, |
| apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); |
| if (start <= apic_address && apic_address < end) |
| kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); |
| - |
| - return 0; |
| } |
| |
| void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) |
| diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h |
| index d1e34dc7624c..f3f86a7f36ec 100644 |
| --- a/include/linux/kvm_host.h |
| +++ b/include/linux/kvm_host.h |
| @@ -1373,8 +1373,8 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, |
| } |
| #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ |
| |
| -int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, |
| - unsigned long start, unsigned long end, bool blockable); |
| +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, |
| + unsigned long start, unsigned long end); |
| |
| #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE |
| int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu); |
| diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c |
| index e13f8793f8e1..db65c81400b7 100644 |
| --- a/virt/kvm/kvm_main.c |
| +++ b/virt/kvm/kvm_main.c |
| @@ -144,10 +144,9 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); |
| static unsigned long long kvm_createvm_count; |
| static unsigned long long kvm_active_vms; |
| |
| -__weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, |
| - unsigned long start, unsigned long end, bool blockable) |
| +__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, |
| + unsigned long start, unsigned long end) |
| { |
| - return 0; |
| } |
| |
| bool kvm_is_zone_device_pfn(kvm_pfn_t pfn) |
| @@ -367,6 +366,18 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) |
| return container_of(mn, struct kvm, mmu_notifier); |
| } |
| |
| +static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn, |
| + struct mm_struct *mm, |
| + unsigned long start, unsigned long end) |
| +{ |
| + struct kvm *kvm = mmu_notifier_to_kvm(mn); |
| + int idx; |
| + |
| + idx = srcu_read_lock(&kvm->srcu); |
| + kvm_arch_mmu_notifier_invalidate_range(kvm, start, end); |
| + srcu_read_unlock(&kvm->srcu, idx); |
| +} |
| + |
| static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, |
| struct mm_struct *mm, |
| unsigned long address, |
| @@ -391,7 +402,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, |
| { |
| struct kvm *kvm = mmu_notifier_to_kvm(mn); |
| int need_tlb_flush = 0, idx; |
| - int ret; |
| |
| idx = srcu_read_lock(&kvm->srcu); |
| spin_lock(&kvm->mmu_lock); |
| @@ -408,14 +418,9 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, |
| kvm_flush_remote_tlbs(kvm); |
| |
| spin_unlock(&kvm->mmu_lock); |
| - |
| - ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start, |
| - range->end, |
| - mmu_notifier_range_blockable(range)); |
| - |
| srcu_read_unlock(&kvm->srcu, idx); |
| |
| - return ret; |
| + return 0; |
| } |
| |
| static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, |
| @@ -521,6 +526,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, |
| } |
| |
| static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { |
| + .invalidate_range = kvm_mmu_notifier_invalidate_range, |
| .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, |
| .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, |
| .clear_flush_young = kvm_mmu_notifier_clear_flush_young, |
| -- |
| 2.27.0 |
| |