randprotect: KSM: re-enable mmu_notifier_change_pte during wrprotect

mmu_notifier_invalidate_range_start zaps all shadow pagetables leaving
nothing to do later for mmu_notifier_change_pte.

This hurts KVM+KSM by causing unnecessary vmexit floods even if the
guest only reads the memory (and KSM does its best effort to only
merge memory that doesn't change frequently and wasn't supposed to
trigger vmexits post merge).

The s/ptep_clear_flush_notify/ptep_clear_flush/ in commit 0f10851ea475
resolved the issue only for mmu notifier users that don't need to
invalidate in mmu_notifier_invalidate_range_start because they share
the same pgtables with the primary MMU (as IOMMUs do), but it didn't
solve the issue for the KVM shadow secondary MMU.

Reported-by: Jonas Juffinger <jonas.juffinger@lamarr.at>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 7a64fb2..7d17d18 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -432,7 +432,8 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
 	 * changed, or the SPTE should be zeroed, and the TLBs flushed by the
 	 * thread before replacement.
 	 */
-	if (was_leaf && is_leaf && pfn_changed) {
+	if (was_leaf && is_leaf && pfn_changed &&
+	    (is_writable_pte(old_spte) || is_writable_pte(new_spte))) {
 		pr_err("Invalid SPTE change: cannot replace a present leaf\n"
 		       "SPTE with another present leaf SPTE mapping a\n"
 		       "different PFN!\n"
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 45fc2c8..a21a457 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -47,6 +47,11 @@ struct mmu_interval_notifier;
  * longer have exclusive access to the page. When sent during creation of an
  * exclusive range the owner will be initialised to the value provided by the
  * caller of make_device_exclusive_range(), otherwise the owner will be NULL.
+ *
+ * @MMU_NOTIFY_CHANGE_PTE: every single pgtable change will be
+ * reflected by a mmu_notifier_change_pte() so if the change_pte()
+ * method is implemented by the driver there's no need to invalidate
+ * the secondary MMU in invalidate_range_start().
  */
 enum mmu_notifier_event {
 	MMU_NOTIFY_UNMAP = 0,
@@ -57,6 +62,7 @@ enum mmu_notifier_event {
 	MMU_NOTIFY_RELEASE,
 	MMU_NOTIFY_MIGRATE,
 	MMU_NOTIFY_EXCLUSIVE,
+	MMU_NOTIFY_CHANGE_PTE,
 };
 
 #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
diff --git a/mm/ksm.c b/mm/ksm.c
index 29fc0f1..8952849 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1111,7 +1111,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 	if (page_mapcount(page) + 1 + PageSwapCache(page) != page_count(page))
 		goto out;
 
-	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CHANGE_PTE, 0, vma, mm,
 				pvmw.address,
 				pvmw.address + PAGE_SIZE);
 	mmu_notifier_invalidate_range_start(&range);
@@ -1212,8 +1212,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 	if (!pmd)
 		goto out;
 
-	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
-				addr + PAGE_SIZE);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CHANGE_PTE,
+				0, vma, mm, addr, addr + PAGE_SIZE);
 	mmu_notifier_invalidate_range_start(&range);
 
 	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d11c581..de15ffb 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -714,7 +714,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 		.start		= range->start,
 		.end		= range->end,
 		.pte		= __pte(0),
-		.handler	= kvm_unmap_gfn_range,
+		.handler	= (range->event == MMU_NOTIFY_CHANGE_PTE ?
+				   (void *)kvm_null_fn : kvm_unmap_gfn_range),
 		.on_lock	= kvm_inc_notifier_count,
 		.on_unlock	= kvm_arch_guest_memory_reclaimed,
 		.flush_on_ret	= true,