blob: cffe1157a90acfcf741b31ac216d6fd3a9ed4fd2 [file] [log] [blame]
// SPDX-License-Identifier: GPL-2.0-only
/*
* AMD SVM-SEV Host Support.
*
* Copyright (C) 2023 Advanced Micro Devices, Inc.
*
* Author: Ashish Kalra <ashish.kalra@amd.com>
*
*/
#include <linux/cc_platform.h>
#include <linux/printk.h>
#include <linux/mm_types.h>
#include <linux/set_memory.h>
#include <linux/memblock.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/cpumask.h>
#include <linux/iommu.h>
#include <linux/amd-iommu.h>
#include <asm/sev.h>
#include <asm/processor.h>
#include <asm/setup.h>
#include <asm/svm.h>
#include <asm/smp.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/cpuid.h>
#include <asm/cmdline.h>
#include <asm/iommu.h>
/*
* The RMP entry format is not architectural. The format is defined in PPR
* Family 19h Model 01h, Rev B1 processor.
*/
struct rmpentry {
union {
struct {
u64 assigned : 1,
pagesize : 1,
immutable : 1,
rsvd1 : 9,
gpa : 39,
asid : 10,
vmsa : 1,
validated : 1,
rsvd2 : 1;
};
u64 lo;
};
u64 hi;
} __packed;
/*
* The first 16KB from the RMP_BASE is used by the processor for the
* bookkeeping, the range needs to be added during the RMP entry lookup.
*/
#define RMPTABLE_CPU_BOOKKEEPING_SZ 0x4000
/* Mask to apply to a PFN to get the first PFN of a 2MB page */
#define PFN_PMD_MASK GENMASK_ULL(63, PMD_SHIFT - PAGE_SHIFT)
static u64 probed_rmp_base, probed_rmp_size;
static struct rmpentry *rmptable __ro_after_init;
static u64 rmptable_max_pfn __ro_after_init;
static LIST_HEAD(snp_leaked_pages_list);
static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);
static unsigned long snp_nr_leaked_pages;
#undef pr_fmt
#define pr_fmt(fmt) "SEV-SNP: " fmt
static int __mfd_enable(unsigned int cpu)
{
u64 val;
if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
return 0;
rdmsrl(MSR_AMD64_SYSCFG, val);
val |= MSR_AMD64_SYSCFG_MFDM;
wrmsrl(MSR_AMD64_SYSCFG, val);
return 0;
}
static __init void mfd_enable(void *arg)
{
__mfd_enable(smp_processor_id());
}
static int __snp_enable(unsigned int cpu)
{
u64 val;
if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
return 0;
rdmsrl(MSR_AMD64_SYSCFG, val);
val |= MSR_AMD64_SYSCFG_SNP_EN;
val |= MSR_AMD64_SYSCFG_SNP_VMPL_EN;
wrmsrl(MSR_AMD64_SYSCFG, val);
return 0;
}
static __init void snp_enable(void *arg)
{
__snp_enable(smp_processor_id());
}
#define RMP_ADDR_MASK GENMASK_ULL(51, 13)
bool snp_probe_rmptable_info(void)
{
u64 max_rmp_pfn, calc_rmp_sz, rmp_sz, rmp_base, rmp_end;
rdmsrl(MSR_AMD64_RMP_BASE, rmp_base);
rdmsrl(MSR_AMD64_RMP_END, rmp_end);
if (!(rmp_base & RMP_ADDR_MASK) || !(rmp_end & RMP_ADDR_MASK)) {
pr_err("Memory for the RMP table has not been reserved by BIOS\n");
return false;
}
if (rmp_base > rmp_end) {
pr_err("RMP configuration not valid: base=%#llx, end=%#llx\n", rmp_base, rmp_end);
return false;
}
rmp_sz = rmp_end - rmp_base + 1;
/*
* Calculate the amount the memory that must be reserved by the BIOS to
* address the whole RAM, including the bookkeeping area. The RMP itself
* must also be covered.
*/
max_rmp_pfn = max_pfn;
if (PHYS_PFN(rmp_end) > max_pfn)
max_rmp_pfn = PHYS_PFN(rmp_end);
calc_rmp_sz = (max_rmp_pfn << 4) + RMPTABLE_CPU_BOOKKEEPING_SZ;
if (calc_rmp_sz > rmp_sz) {
pr_err("Memory reserved for the RMP table does not cover full system RAM (expected 0x%llx got 0x%llx)\n",
calc_rmp_sz, rmp_sz);
return false;
}
probed_rmp_base = rmp_base;
probed_rmp_size = rmp_sz;
pr_info("RMP table physical range [0x%016llx - 0x%016llx]\n",
probed_rmp_base, probed_rmp_base + probed_rmp_size - 1);
return true;
}
/*
* Do the necessary preparations which are verified by the firmware as
* described in the SNP_INIT_EX firmware command description in the SNP
* firmware ABI spec.
*/
static int __init snp_rmptable_init(void)
{
void *rmptable_start;
u64 rmptable_size;
u64 val;
if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
return 0;
if (!amd_iommu_snp_en)
return 0;
if (!probed_rmp_size)
goto nosnp;
rmptable_start = memremap(probed_rmp_base, probed_rmp_size, MEMREMAP_WB);
if (!rmptable_start) {
pr_err("Failed to map RMP table\n");
return 1;
}
/*
* Check if SEV-SNP is already enabled, this can happen in case of
* kexec boot.
*/
rdmsrl(MSR_AMD64_SYSCFG, val);
if (val & MSR_AMD64_SYSCFG_SNP_EN)
goto skip_enable;
memset(rmptable_start, 0, probed_rmp_size);
/* Flush the caches to ensure that data is written before SNP is enabled. */
wbinvd_on_all_cpus();
/* MtrrFixDramModEn must be enabled on all the CPUs prior to enabling SNP. */
on_each_cpu(mfd_enable, NULL, 1);
on_each_cpu(snp_enable, NULL, 1);
skip_enable:
rmptable_start += RMPTABLE_CPU_BOOKKEEPING_SZ;
rmptable_size = probed_rmp_size - RMPTABLE_CPU_BOOKKEEPING_SZ;
rmptable = (struct rmpentry *)rmptable_start;
rmptable_max_pfn = rmptable_size / sizeof(struct rmpentry) - 1;
cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rmptable_init:online", __snp_enable, NULL);
/*
* Setting crash_kexec_post_notifiers to 'true' to ensure that SNP panic
* notifier is invoked to do SNP IOMMU shutdown before kdump.
*/
crash_kexec_post_notifiers = true;
return 0;
nosnp:
setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);
return -ENOSYS;
}
/*
* This must be called after the IOMMU has been initialized.
*/
device_initcall(snp_rmptable_init);
static struct rmpentry *get_rmpentry(u64 pfn)
{
if (WARN_ON_ONCE(pfn > rmptable_max_pfn))
return ERR_PTR(-EFAULT);
return &rmptable[pfn];
}
static struct rmpentry *__snp_lookup_rmpentry(u64 pfn, int *level)
{
struct rmpentry *large_entry, *entry;
if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
return ERR_PTR(-ENODEV);
entry = get_rmpentry(pfn);
if (IS_ERR(entry))
return entry;
/*
* Find the authoritative RMP entry for a PFN. This can be either a 4K
* RMP entry or a special large RMP entry that is authoritative for a
* whole 2M area.
*/
large_entry = get_rmpentry(pfn & PFN_PMD_MASK);
if (IS_ERR(large_entry))
return large_entry;
*level = RMP_TO_PG_LEVEL(large_entry->pagesize);
return entry;
}
int snp_lookup_rmpentry(u64 pfn, bool *assigned, int *level)
{
struct rmpentry *e;
e = __snp_lookup_rmpentry(pfn, level);
if (IS_ERR(e))
return PTR_ERR(e);
*assigned = !!e->assigned;
return 0;
}
EXPORT_SYMBOL_GPL(snp_lookup_rmpentry);
/*
* Dump the raw RMP entry for a particular PFN. These bits are documented in the
* PPR for a particular CPU model and provide useful information about how a
* particular PFN is being utilized by the kernel/firmware at the time certain
* unexpected events occur, such as RMP faults.
*/
static void dump_rmpentry(u64 pfn)
{
u64 pfn_i, pfn_end;
struct rmpentry *e;
int level;
e = __snp_lookup_rmpentry(pfn, &level);
if (IS_ERR(e)) {
pr_err("Failed to read RMP entry for PFN 0x%llx, error %ld\n",
pfn, PTR_ERR(e));
return;
}
if (e->assigned) {
pr_info("PFN 0x%llx, RMP entry: [0x%016llx - 0x%016llx]\n",
pfn, e->lo, e->hi);
return;
}
/*
* If the RMP entry for a particular PFN is not in an assigned state,
* then it is sometimes useful to get an idea of whether or not any RMP
* entries for other PFNs within the same 2MB region are assigned, since
* those too can affect the ability to access a particular PFN in
* certain situations, such as when the PFN is being accessed via a 2MB
* mapping in the host page table.
*/
pfn_i = ALIGN_DOWN(pfn, PTRS_PER_PMD);
pfn_end = pfn_i + PTRS_PER_PMD;
pr_info("PFN 0x%llx unassigned, dumping non-zero entries in 2M PFN region: [0x%llx - 0x%llx]\n",
pfn, pfn_i, pfn_end);
while (pfn_i < pfn_end) {
e = __snp_lookup_rmpentry(pfn_i, &level);
if (IS_ERR(e)) {
pr_err("Error %ld reading RMP entry for PFN 0x%llx\n",
PTR_ERR(e), pfn_i);
pfn_i++;
continue;
}
if (e->lo || e->hi)
pr_info("PFN: 0x%llx, [0x%016llx - 0x%016llx]\n", pfn_i, e->lo, e->hi);
pfn_i++;
}
}
void snp_dump_hva_rmpentry(unsigned long hva)
{
unsigned long paddr;
unsigned int level;
pgd_t *pgd;
pte_t *pte;
pgd = __va(read_cr3_pa());
pgd += pgd_index(hva);
pte = lookup_address_in_pgd(pgd, hva, &level);
if (!pte) {
pr_err("Can't dump RMP entry for HVA %lx: no PTE/PFN found\n", hva);
return;
}
paddr = PFN_PHYS(pte_pfn(*pte)) | (hva & ~page_level_mask(level));
dump_rmpentry(PHYS_PFN(paddr));
}
/*
* PSMASH a 2MB aligned page into 4K pages in the RMP table while preserving the
* Validated bit.
*/
int psmash(u64 pfn)
{
unsigned long paddr = pfn << PAGE_SHIFT;
int ret;
if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
return -ENODEV;
if (!pfn_valid(pfn))
return -EINVAL;
/* Binutils version 2.36 supports the PSMASH mnemonic. */
asm volatile(".byte 0xF3, 0x0F, 0x01, 0xFF"
: "=a" (ret)
: "a" (paddr)
: "memory", "cc");
return ret;
}
EXPORT_SYMBOL_GPL(psmash);
/*
* If the kernel uses a 2MB or larger directmap mapping to write to an address,
* and that mapping contains any 4KB pages that are set to private in the RMP
* table, an RMP #PF will trigger and cause a host crash. Hypervisor code that
* owns the PFNs being transitioned will never attempt such a write, but other
* kernel tasks writing to other PFNs in the range may trigger these checks
* inadvertently due a large directmap mapping that happens to overlap such a
* PFN.
*
* Prevent this by splitting any 2MB+ mappings that might end up containing a
* mix of private/shared PFNs as a result of a subsequent RMPUPDATE for the
* PFN/rmp_level passed in.
*
* Note that there is no attempt here to scan all the RMP entries for the 2MB
* physical range, since it would only be worthwhile in determining if a
* subsequent RMPUPDATE for a 4KB PFN would result in all the entries being of
* the same shared/private state, thus avoiding the need to split the mapping.
* But that would mean the entries are currently in a mixed state, and so the
* mapping would have already been split as a result of prior transitions.
* And since the 4K split is only done if the mapping is 2MB+, and there isn't
* currently a mechanism in place to restore 2MB+ mappings, such a check would
* not provide any usable benefit.
*
* More specifics on how these checks are carried out can be found in APM
* Volume 2, "RMP and VMPL Access Checks".
*/
static int adjust_direct_map(u64 pfn, int rmp_level)
{
unsigned long vaddr;
unsigned int level;
int npages, ret;
pte_t *pte;
/*
* pfn_to_kaddr() will return a vaddr only within the direct
* map range.
*/
vaddr = (unsigned long)pfn_to_kaddr(pfn);
/* Only 4KB/2MB RMP entries are supported by current hardware. */
if (WARN_ON_ONCE(rmp_level > PG_LEVEL_2M))
return -EINVAL;
if (!pfn_valid(pfn))
return -EINVAL;
if (rmp_level == PG_LEVEL_2M &&
(!IS_ALIGNED(pfn, PTRS_PER_PMD) || !pfn_valid(pfn + PTRS_PER_PMD - 1)))
return -EINVAL;
/*
* If an entire 2MB physical range is being transitioned, then there is
* no risk of RMP #PFs due to write accesses from overlapping mappings,
* since even accesses from 1GB mappings will be treated as 2MB accesses
* as far as RMP table checks are concerned.
*/
if (rmp_level == PG_LEVEL_2M)
return 0;
pte = lookup_address(vaddr, &level);
if (!pte || pte_none(*pte))
return 0;
if (level == PG_LEVEL_4K)
return 0;
npages = page_level_size(rmp_level) / PAGE_SIZE;
ret = set_memory_4k(vaddr, npages);
if (ret)
pr_warn("Failed to split direct map for PFN 0x%llx, ret: %d\n",
pfn, ret);
return ret;
}
/*
* It is expected that those operations are seldom enough so that no mutual
* exclusion of updaters is needed and thus the overlap error condition below
* should happen very rarely and would get resolved relatively quickly by
* the firmware.
*
* If not, one could consider introducing a mutex or so here to sync concurrent
* RMP updates and thus diminish the amount of cases where firmware needs to
* lock 2M ranges to protect against concurrent updates.
*
* The optimal solution would be range locking to avoid locking disjoint
* regions unnecessarily but there's no support for that yet.
*/
static int rmpupdate(u64 pfn, struct rmp_state *state)
{
unsigned long paddr = pfn << PAGE_SHIFT;
int ret, level;
if (!cpu_feature_enabled(X86_FEATURE_SEV_SNP))
return -ENODEV;
level = RMP_TO_PG_LEVEL(state->pagesize);
if (adjust_direct_map(pfn, level))
return -EFAULT;
do {
/* Binutils version 2.36 supports the RMPUPDATE mnemonic. */
asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFE"
: "=a" (ret)
: "a" (paddr), "c" ((unsigned long)state)
: "memory", "cc");
} while (ret == RMPUPDATE_FAIL_OVERLAP);
if (ret) {
pr_err("RMPUPDATE failed for PFN %llx, pg_level: %d, ret: %d\n",
pfn, level, ret);
dump_rmpentry(pfn);
dump_stack();
return -EFAULT;
}
return 0;
}
/* Transition a page to guest-owned/private state in the RMP table. */
int rmp_make_private(u64 pfn, u64 gpa, enum pg_level level, u32 asid, bool immutable)
{
struct rmp_state state;
memset(&state, 0, sizeof(state));
state.assigned = 1;
state.asid = asid;
state.immutable = immutable;
state.gpa = gpa;
state.pagesize = PG_LEVEL_TO_RMP(level);
return rmpupdate(pfn, &state);
}
EXPORT_SYMBOL_GPL(rmp_make_private);
/* Transition a page to hypervisor-owned/shared state in the RMP table. */
int rmp_make_shared(u64 pfn, enum pg_level level)
{
struct rmp_state state;
memset(&state, 0, sizeof(state));
state.pagesize = PG_LEVEL_TO_RMP(level);
return rmpupdate(pfn, &state);
}
EXPORT_SYMBOL_GPL(rmp_make_shared);
void snp_leak_pages(u64 pfn, unsigned int npages)
{
struct page *page = pfn_to_page(pfn);
pr_warn("Leaking PFN range 0x%llx-0x%llx\n", pfn, pfn + npages);
spin_lock(&snp_leaked_pages_list_lock);
while (npages--) {
/*
* Reuse the page's buddy list for chaining into the leaked
* pages list. This page should not be on a free list currently
* and is also unsafe to be added to a free list.
*/
if (likely(!PageCompound(page)) ||
/*
* Skip inserting tail pages of compound page as
* page->buddy_list of tail pages is not usable.
*/
(PageHead(page) && compound_nr(page) <= npages))
list_add_tail(&page->buddy_list, &snp_leaked_pages_list);
dump_rmpentry(pfn);
snp_nr_leaked_pages++;
pfn++;
page++;
}
spin_unlock(&snp_leaked_pages_list_lock);
}
EXPORT_SYMBOL_GPL(snp_leak_pages);