| From 0293dd7baaa88c6dda30d2ebc074013990fa7248 Mon Sep 17 00:00:00 2001 |
| From: Sasha Levin <sashal@kernel.org> |
| Date: Wed, 20 Apr 2022 15:08:40 +1000 |
| Subject: KVM: PPC: Fix TCE handling for VFIO |
| |
| From: Alexey Kardashevskiy <aik@ozlabs.ru> |
| |
| [ Upstream commit 26a62b750a4e6364b0393562f66759b1494c3a01 ] |
| |
| The LoPAPR spec defines a guest visible IOMMU with a variable page size. |
| Currently QEMU advertises 4K, 64K, 2M, 16MB pages, a Linux VM picks |
| the biggest (16MB). In the case of a passed through PCI device, there is |
| a hardware IOMMU which does not support all page sizes from the above - |
| P8 cannot do 2MB and P9 cannot do 16MB. So for each emulated |
| 16M IOMMU page we may create several smaller mappings ("TCEs") in |
| the hardware IOMMU. |
| |
| The code wrongly uses the emulated TCE index instead of hardware TCE |
| index in error handling. The problem is easier to see on POWER8 with |
| multi-level TCE tables (when only the first level is preallocated) |
| as hash mode uses real mode TCE hypercalls handlers. |
| The kernel starts using indirect tables when VMs get bigger than 128GB |
| (depends on the max page order). |
| The very first real mode hcall is going to fail with H_TOO_HARD as |
| in the real mode we cannot allocate memory for TCEs (we can in the virtual |
| mode) but on the way out the code attempts to clear hardware TCEs using |
| emulated TCE indexes which corrupts random kernel memory because |
| it_offset==1<<59 is subtracted from those indexes and the resulting index |
| is out of the TCE table bounds. |
| |
| This fixes kvmppc_clear_tce() to use the correct TCE indexes. |
| |
| While at it, this fixes TCE cache invalidation which uses emulated TCE |
| indexes instead of the hardware ones. This went unnoticed as 64bit DMA |
| is used these days and VMs map all RAM in one go and only then do DMA |
| and this is when the TCE cache gets populated. |
| |
| Potentially this could slow down mapping, however normally 16MB |
| emulated pages are backed by 64K hardware pages so it is one write to |
| the "TCE Kill" per 256 updates which is not that bad considering the size |
| of the cache (1024 TCEs or so). |
| |
| Fixes: ca1fc489cfa0 ("KVM: PPC: Book3S: Allow backing bigger guest IOMMU pages with smaller physical pages") |
| |
| Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> |
| Tested-by: David Gibson <david@gibson.dropbear.id.au> |
| Reviewed-by: Frederic Barrat <fbarrat@linux.ibm.com> |
| Reviewed-by: David Gibson <david@gibson.dropbear.id.au> |
| Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> |
| Link: https://lore.kernel.org/r/20220420050840.328223-1-aik@ozlabs.ru |
| Signed-off-by: Sasha Levin <sashal@kernel.org> |
| --- |
| arch/powerpc/kvm/book3s_64_vio.c | 45 +++++++++++++++-------------- |
| arch/powerpc/kvm/book3s_64_vio_hv.c | 44 ++++++++++++++-------------- |
| 2 files changed, 45 insertions(+), 44 deletions(-) |
| |
| diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c |
| index 03b947429e4d..4518a0f2d6c6 100644 |
| --- a/arch/powerpc/kvm/book3s_64_vio.c |
| +++ b/arch/powerpc/kvm/book3s_64_vio.c |
| @@ -420,13 +420,19 @@ static void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, |
| tbl[idx % TCES_PER_PAGE] = tce; |
| } |
| |
| -static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl, |
| - unsigned long entry) |
| +static void kvmppc_clear_tce(struct mm_struct *mm, struct kvmppc_spapr_tce_table *stt, |
| + struct iommu_table *tbl, unsigned long entry) |
| { |
| - unsigned long hpa = 0; |
| - enum dma_data_direction dir = DMA_NONE; |
| + unsigned long i; |
| + unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift); |
| + unsigned long io_entry = entry << (stt->page_shift - tbl->it_page_shift); |
| + |
| + for (i = 0; i < subpages; ++i) { |
| + unsigned long hpa = 0; |
| + enum dma_data_direction dir = DMA_NONE; |
| |
| - iommu_tce_xchg_no_kill(mm, tbl, entry, &hpa, &dir); |
| + iommu_tce_xchg_no_kill(mm, tbl, io_entry + i, &hpa, &dir); |
| + } |
| } |
| |
| static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm, |
| @@ -485,6 +491,8 @@ static long kvmppc_tce_iommu_unmap(struct kvm *kvm, |
| break; |
| } |
| |
| + iommu_tce_kill(tbl, io_entry, subpages); |
| + |
| return ret; |
| } |
| |
| @@ -544,6 +552,8 @@ static long kvmppc_tce_iommu_map(struct kvm *kvm, |
| break; |
| } |
| |
| + iommu_tce_kill(tbl, io_entry, subpages); |
| + |
| return ret; |
| } |
| |
| @@ -590,10 +600,9 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, |
| ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl, |
| entry, ua, dir); |
| |
| - iommu_tce_kill(stit->tbl, entry, 1); |
| |
| if (ret != H_SUCCESS) { |
| - kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry); |
| + kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, entry); |
| goto unlock_exit; |
| } |
| } |
| @@ -669,13 +678,13 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, |
| */ |
| if (get_user(tce, tces + i)) { |
| ret = H_TOO_HARD; |
| - goto invalidate_exit; |
| + goto unlock_exit; |
| } |
| tce = be64_to_cpu(tce); |
| |
| if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) { |
| ret = H_PARAMETER; |
| - goto invalidate_exit; |
| + goto unlock_exit; |
| } |
| |
| list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { |
| @@ -684,19 +693,15 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, |
| iommu_tce_direction(tce)); |
| |
| if (ret != H_SUCCESS) { |
| - kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, |
| - entry); |
| - goto invalidate_exit; |
| + kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, |
| + entry + i); |
| + goto unlock_exit; |
| } |
| } |
| |
| kvmppc_tce_put(stt, entry + i, tce); |
| } |
| |
| -invalidate_exit: |
| - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) |
| - iommu_tce_kill(stit->tbl, entry, npages); |
| - |
| unlock_exit: |
| srcu_read_unlock(&vcpu->kvm->srcu, idx); |
| |
| @@ -735,20 +740,16 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, |
| continue; |
| |
| if (ret == H_TOO_HARD) |
| - goto invalidate_exit; |
| + return ret; |
| |
| WARN_ON_ONCE(1); |
| - kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry); |
| + kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, entry + i); |
| } |
| } |
| |
| for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) |
| kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); |
| |
| -invalidate_exit: |
| - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) |
| - iommu_tce_kill(stit->tbl, ioba >> stt->page_shift, npages); |
| - |
| return ret; |
| } |
| EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce); |
| diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c |
| index 35fd67b4ceb4..abb49d863329 100644 |
| --- a/arch/powerpc/kvm/book3s_64_vio_hv.c |
| +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c |
| @@ -251,13 +251,19 @@ extern void iommu_tce_kill_rm(struct iommu_table *tbl, |
| tbl->it_ops->tce_kill(tbl, entry, pages, true); |
| } |
| |
| -static void kvmppc_rm_clear_tce(struct kvm *kvm, struct iommu_table *tbl, |
| - unsigned long entry) |
| +static void kvmppc_rm_clear_tce(struct kvm *kvm, struct kvmppc_spapr_tce_table *stt, |
| + struct iommu_table *tbl, unsigned long entry) |
| { |
| - unsigned long hpa = 0; |
| - enum dma_data_direction dir = DMA_NONE; |
| + unsigned long i; |
| + unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift); |
| + unsigned long io_entry = entry << (stt->page_shift - tbl->it_page_shift); |
| + |
| + for (i = 0; i < subpages; ++i) { |
| + unsigned long hpa = 0; |
| + enum dma_data_direction dir = DMA_NONE; |
| |
| - iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir); |
| + iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, io_entry + i, &hpa, &dir); |
| + } |
| } |
| |
| static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, |
| @@ -320,6 +326,8 @@ static long kvmppc_rm_tce_iommu_unmap(struct kvm *kvm, |
| break; |
| } |
| |
| + iommu_tce_kill_rm(tbl, io_entry, subpages); |
| + |
| return ret; |
| } |
| |
| @@ -383,6 +391,8 @@ static long kvmppc_rm_tce_iommu_map(struct kvm *kvm, |
| break; |
| } |
| |
| + iommu_tce_kill_rm(tbl, io_entry, subpages); |
| + |
| return ret; |
| } |
| |
| @@ -428,10 +438,8 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, |
| ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt, |
| stit->tbl, entry, ua, dir); |
| |
| - iommu_tce_kill_rm(stit->tbl, entry, 1); |
| - |
| if (ret != H_SUCCESS) { |
| - kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); |
| + kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, entry); |
| return ret; |
| } |
| } |
| @@ -571,7 +579,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, |
| ua = 0; |
| if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) { |
| ret = H_PARAMETER; |
| - goto invalidate_exit; |
| + goto unlock_exit; |
| } |
| |
| list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { |
| @@ -580,19 +588,15 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, |
| iommu_tce_direction(tce)); |
| |
| if (ret != H_SUCCESS) { |
| - kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, |
| - entry); |
| - goto invalidate_exit; |
| + kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, |
| + entry + i); |
| + goto unlock_exit; |
| } |
| } |
| |
| kvmppc_rm_tce_put(stt, entry + i, tce); |
| } |
| |
| -invalidate_exit: |
| - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) |
| - iommu_tce_kill_rm(stit->tbl, entry, npages); |
| - |
| unlock_exit: |
| if (rmap) |
| unlock_rmap(rmap); |
| @@ -635,20 +639,16 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, |
| continue; |
| |
| if (ret == H_TOO_HARD) |
| - goto invalidate_exit; |
| + return ret; |
| |
| WARN_ON_ONCE_RM(1); |
| - kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); |
| + kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, entry + i); |
| } |
| } |
| |
| for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) |
| kvmppc_rm_tce_put(stt, ioba >> stt->page_shift, tce_value); |
| |
| -invalidate_exit: |
| - list_for_each_entry_lockless(stit, &stt->iommu_tables, next) |
| - iommu_tce_kill_rm(stit->tbl, ioba >> stt->page_shift, npages); |
| - |
| return ret; |
| } |
| |
| -- |
| 2.35.1 |
| |