[PATCH 2/2] x86/snp: Convert shared memory back to private on kexec
Tom Lendacky
thomas.lendacky at amd.com
Wed Feb 21 12:35:13 PST 2024
On 2/19/24 19:18, Ashish Kalra wrote:
> From: Ashish Kalra <ashish.kalra at amd.com>
>
> SNP guests allocate shared buffers to perform I/O. It is done by
> allocating pages normally from the buddy allocator and converting them
> to shared with set_memory_decrypted().
>
> The second kernel has no idea what memory is converted this way. It only
> sees E820_TYPE_RAM.
>
> Accessing shared memory via private mapping will cause unrecoverable RMP
> page-faults.
>
> On kexec walk direct mapping and convert all shared memory back to
> private. It makes all RAM private again and second kernel may use it
> normally. Additionally for SNP guests convert all bss decrypted section
> pages back to private and switch back ROM regions to shared so that
> their revalidation does not fail during kexec kernel boot.
>
> The conversion occurs in two steps: stopping new conversions and
> unsharing all memory. In the case of normal kexec, the stopping of
> conversions takes place while scheduling is still functioning. This
> allows for waiting until any ongoing conversions are finished. The
> second step is carried out when all CPUs except one are inactive and
> interrupts are disabled. This prevents any conflicts with code that may
> access shared memory.
It seems like this patch should be broken down into multiple patches,
with the final patch setting x86_platform.guest.enc_kexec_stop_conversion
and x86_platform.guest.enc_kexec_unshare_mem.
>
> Signed-off-by: Ashish Kalra <ashish.kalra at amd.com>
> ---
> arch/x86/include/asm/probe_roms.h | 1 +
> arch/x86/include/asm/sev.h | 8 ++
> arch/x86/kernel/probe_roms.c | 16 +++
> arch/x86/kernel/sev.c | 211 ++++++++++++++++++++++++++++++
> arch/x86/mm/mem_encrypt_amd.c | 18 ++-
> 5 files changed, 253 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/include/asm/probe_roms.h b/arch/x86/include/asm/probe_roms.h
> index 1c7f3815bbd6..d50b67dbff33 100644
> --- a/arch/x86/include/asm/probe_roms.h
> +++ b/arch/x86/include/asm/probe_roms.h
> @@ -6,4 +6,5 @@ struct pci_dev;
> extern void __iomem *pci_map_biosrom(struct pci_dev *pdev);
> extern void pci_unmap_biosrom(void __iomem *rom);
> extern size_t pci_biosrom_size(struct pci_dev *pdev);
> +extern void snp_kexec_unprep_rom_memory(void);
> #endif
> diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
> index 5b4a1ce3d368..dd236d7e9407 100644
> --- a/arch/x86/include/asm/sev.h
> +++ b/arch/x86/include/asm/sev.h
> @@ -81,6 +81,10 @@ extern void vc_no_ghcb(void);
> extern void vc_boot_ghcb(void);
> extern bool handle_vc_boot_ghcb(struct pt_regs *regs);
>
> +extern atomic_t conversions_in_progress;
> +extern bool conversion_allowed;
> +extern unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot);
> +
> /* PVALIDATE return codes */
> #define PVALIDATE_FAIL_SIZEMISMATCH 6
>
> @@ -213,6 +217,8 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct sn
> void snp_accept_memory(phys_addr_t start, phys_addr_t end);
> u64 snp_get_unsupported_features(u64 status);
> u64 sev_get_status(void);
> +void snp_kexec_unshare_mem(void);
> +void snp_kexec_stop_conversion(bool crash);
> #else
> static inline void sev_es_ist_enter(struct pt_regs *regs) { }
> static inline void sev_es_ist_exit(void) { }
> @@ -241,6 +247,8 @@ static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *in
> static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }
> static inline u64 snp_get_unsupported_features(u64 status) { return 0; }
> static inline u64 sev_get_status(void) { return 0; }
> +void snp_kexec_unshare_mem(void) {}
> +static void snp_kexec_stop_conversion(bool crash) {}
> #endif
>
> #endif
> diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
> index 319fef37d9dc..457f1e5c8d00 100644
> --- a/arch/x86/kernel/probe_roms.c
> +++ b/arch/x86/kernel/probe_roms.c
> @@ -177,6 +177,22 @@ size_t pci_biosrom_size(struct pci_dev *pdev)
> }
> EXPORT_SYMBOL(pci_biosrom_size);
>
> +void snp_kexec_unprep_rom_memory(void)
> +{
> + unsigned long vaddr, npages, sz;
> +
> + /*
> + * Switch back ROM regions to shared so that their validation
> + * does not fail during kexec kernel boot.
> + */
> + vaddr = (unsigned long)__va(video_rom_resource.start);
> + sz = (system_rom_resource.end + 1) - video_rom_resource.start;
> + npages = PAGE_ALIGN(sz) >> PAGE_SHIFT;
> +
> + snp_set_memory_shared(vaddr, npages);
> +}
> +EXPORT_SYMBOL(snp_kexec_unprep_rom_memory);
> +
> #define ROMSIGNATURE 0xaa55
>
> static int __init romsignature(const unsigned char *rom)
> diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
> index c67285824e82..765ab83129eb 100644
> --- a/arch/x86/kernel/sev.c
> +++ b/arch/x86/kernel/sev.c
> @@ -23,6 +23,9 @@
> #include <linux/platform_device.h>
> #include <linux/io.h>
> #include <linux/psp-sev.h>
> +#include <linux/pagewalk.h>
> +#include <linux/cacheflush.h>
> +#include <linux/delay.h>
> #include <uapi/linux/sev-guest.h>
>
> #include <asm/cpu_entry_area.h>
> @@ -40,6 +43,7 @@
> #include <asm/apic.h>
> #include <asm/cpuid.h>
> #include <asm/cmdline.h>
> +#include <asm/probe_roms.h>
>
> #define DR7_RESET_VALUE 0x400
>
> @@ -71,6 +75,13 @@ static struct ghcb *boot_ghcb __section(".data");
> /* Bitmap of SEV features supported by the hypervisor */
> static u64 sev_hv_features __ro_after_init;
>
> +/* Last address to be switched to private during kexec */
> +static unsigned long last_address_shd_kexec;
Maybe kexec_last_address_to_make_private? Or just something that makes a
bit more sense.
> +
> +static bool crash_requested;
> +atomic_t conversions_in_progress;
> +bool conversion_allowed = true;
> +
> /* #VC handler runtime per-CPU data */
> struct sev_es_runtime_data {
> struct ghcb ghcb_page;
> @@ -906,6 +917,206 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end)
> set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
> }
>
> +static inline bool pte_decrypted(pte_t pte)
> +{
> + return cc_mkdec(pte_val(pte)) == pte_val(pte);
> +}
> +
This is duplicated in the TDX code (arch/x86/coco/tdx/tdx.c) and looks like
something that can go in a header file, maybe mem_encrypt.h.
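For example (untested), something like this in mem_encrypt.h (or wherever it
ends up):

	/* True if the PTE maps the page with the shared/decrypted attribute. */
	static inline bool pte_decrypted(pte_t pte)
	{
		return cc_mkdec(pte_val(pte)) == pte_val(pte);
	}

and then both the TDX and SNP code can drop their local copies.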
> +static int set_pte_enc(pte_t *kpte, int level, void *va)
> +{
> + pgprot_t old_prot, new_prot;
> + unsigned long pfn, pa, size;
> + pte_t new_pte;
> +
> + pfn = pg_level_to_pfn(level, kpte, &old_prot);
> + if (!pfn)
> + return 0;
Not sure this matters... a zero PFN is a valid PFN, it's just marked not
present. This seems a bit overdone to me; see the more compact version after
the end of this function and check whether it works.
> +
> + new_prot = old_prot;
> + pgprot_val(new_prot) |= _PAGE_ENC;
> + pa = pfn << PAGE_SHIFT;
> + size = page_level_size(level);
> +
> + /*
> + * Change the physical page attribute from C=0 to C=1. Flush the
> + * caches to ensure that data gets accessed with the correct C-bit.
> + */
> + clflush_cache_range(va, size);
> +
> + /* Change the page encryption mask. */
> + new_pte = pfn_pte(pfn, new_prot);
> + set_pte_atomic(kpte, new_pte);
> +
> + return 1;
> +}
static bool set_pte_enc(pte_t *kpte, int level, void *va)
{
	pte_t new_pte;

	if (pte_none(*kpte))
		return false;

	/*
	 * Flush the caches so that data is accessed with the correct C-bit
	 * after the attribute change.
	 */
	if (pte_present(*kpte))
		clflush_cache_range(va, page_level_size(level));

	new_pte = __pte(cc_mkenc(pte_val(*kpte)));
	set_pte_atomic(kpte, new_pte);

	return true;
}
> +
> +static int unshare_pte(pte_t *pte, unsigned long addr, int pages, int level)
Maybe a better name is make_pte_private? And if you are only returning 0 or 1,
it begs to be a bool (see the sketch after the end of this function).
> +{
> + struct sev_es_runtime_data *data;
> + struct ghcb *ghcb;
> +
> + data = this_cpu_read(runtime_data);
> + ghcb = &data->ghcb_page;
> +
> + /*
> + * check for GHCB for being part of a PMD range.
> + */
Single line comment.
> + if ((unsigned long)ghcb >= addr &&
> + (unsigned long)ghcb <= (addr + (pages * PAGE_SIZE))) {
> + /*
> + * setup last address to be made private so that this GHCB
> + * is made private at the end of unshared loop so that RMP
> + * does not possibly getting PSMASHed from using the
> + * MSR protocol.
> + */
Please clarify this comment a bit more... it's a bit hard to follow (the
sketch after the end of this function has one possible wording).
> + pr_debug("setting boot_ghcb to NULL for this cpu ghcb\n");
> + last_address_shd_kexec = addr;
> + return 1;
> + }
Add a blank line here.
> + if (!set_pte_enc(pte, level, (void *)addr))
> + return 0;
Add a blank line here.
> + snp_set_memory_private(addr, pages);
> +
> + return 1;
> +}
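Something along these lines, completely untested, folding in the bool
set_pte_enc() from above; the GHCB comment wording is just one attempt at
making it clearer:

static bool make_pte_private(pte_t *pte, unsigned long addr, int pages, int level)
{
	struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
	struct ghcb *ghcb = &data->ghcb_page;

	/* Check for this CPU's GHCB being part of this range. */
	if ((unsigned long)ghcb >= addr &&
	    (unsigned long)ghcb <= (addr + (pages * PAGE_SIZE))) {
		/*
		 * Leave this CPU's GHCB shared for now and only record its
		 * address.  It is switched back to private last, using the
		 * MSR protocol, so that the GHCB page can't end up getting
		 * PSMASHed in the RMP while it is still in use.
		 */
		last_address_shd_kexec = addr;
		return true;
	}

	if (!set_pte_enc(pte, level, (void *)addr))
		return false;

	snp_set_memory_private(addr, pages);

	return true;
}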
> +
> +static void unshare_all_memory(bool unmap)
The 'unmap' parameter is unused, so it looks like it can be removed.
> +{
> + unsigned long addr, end;
> +
> + /*
> + * Walk direct mapping and convert all shared memory back to private,
> + */
> +
> + addr = PAGE_OFFSET;
> + end = PAGE_OFFSET + get_max_mapped();
> +
> + while (addr < end) {
> + unsigned long size;
> + unsigned int level;
> + pte_t *pte;
> +
> + pte = lookup_address(addr, &level);
> + size = page_level_size(level);
> +
> + /*
> + * pte_none() check is required to skip physical memory holes in direct mapped.
> + */
> + if (pte && pte_decrypted(*pte) && !pte_none(*pte)) {
> + int pages = size / PAGE_SIZE;
> +
> + if (!unshare_pte(pte, addr, pages, level)) {
> + pr_err("Failed to unshare range %#lx-%#lx\n",
> + addr, addr + size);
> + }
> +
> + }
> +
> + addr += size;
> + }
> + __flush_tlb_all();
This is also mostly in the TDX code and begs to be made common and not
copied... please figure out a way to do the "different" things through a
registered callback or such (one possibility is sketched after the end of
this function).
> +
> +}
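For example (rough and untested), the walk itself could live in common code,
say arch/x86/mm/mem_encrypt.c, with the vendor-specific conversion done through
a callback; the name and placement here are only placeholders:

void kexec_unshare_all_memory(bool (*make_private)(pte_t *pte, unsigned long addr,
						   int pages, int level))
{
	unsigned long addr = PAGE_OFFSET;
	unsigned long end = PAGE_OFFSET + get_max_mapped();

	/* Walk the direct mapping and convert all shared memory back to private. */
	while (addr < end) {
		unsigned long size;
		unsigned int level;
		pte_t *pte;

		pte = lookup_address(addr, &level);
		if (!pte) {
			/* Skip unmapped holes in the direct map. */
			addr += PAGE_SIZE;
			continue;
		}

		size = page_level_size(level);

		if (pte_decrypted(*pte) && !pte_none(*pte)) {
			if (!make_private(pte, addr, size / PAGE_SIZE, level))
				pr_err("Failed to unshare range %#lx-%#lx\n",
				       addr, addr + size);
		}

		addr += size;
	}

	__flush_tlb_all();
}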
> +
> +static void unshare_all_bss_decrypted_memory(void)
> +{
> + unsigned long vaddr, vaddr_end;
> + unsigned long size;
> + unsigned int level;
> + unsigned int npages;
> + pte_t *pte;
> +
> + vaddr = (unsigned long)__start_bss_decrypted;
> + vaddr_end = (unsigned long)__start_bss_decrypted_unused;
> + npages = (vaddr_end - vaddr) >> PAGE_SHIFT;
> + for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) {
> + pte = lookup_address(vaddr, &level);
> + if (!pte || !pte_decrypted(*pte) || pte_none(*pte))
> + continue;
> +
> + size = page_level_size(level);
> + set_pte_enc(pte, level, (void *)vaddr);
> + }
> + vaddr = (unsigned long)__start_bss_decrypted;
> + snp_set_memory_private(vaddr, npages);
> +}
> +
> +void snp_kexec_stop_conversion(bool crash)
> +{
> + /* Stop new private<->shared conversions */
> + conversion_allowed = false;
> + crash_requested = crash;
> +
> + /*
> + * Make sure conversion_allowed is cleared before checking
> + * conversions_in_progress.
> + */
> + barrier();
This should be smp_wmb().
> +
> + /*
> + * Crash kernel reaches here with interrupts disabled: can't wait for
> + * conversions to finish.
> + *
> + * If race happened, just report and proceed.
> + */
> + if (!crash) {
> + unsigned long timeout;
> +
> + /*
> + * Wait for in-flight conversions to complete.
> + *
> + * Do not wait more than 30 seconds.
> + */
> + timeout = 30 * USEC_PER_SEC;
> + while (atomic_read(&conversions_in_progress) && timeout--)
> + udelay(1);
> + }
> +
> + if (atomic_read(&conversions_in_progress))
> + pr_warn("Failed to finish shared<->private conversions\n");
> +}
Again, same code as in TDX (except for crash_requested, which I don't see
used anywhere), so please make it common.
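E.g. something like this in common code (completely untested, name and
location are just placeholders), which would also let crash_requested go away:

void enc_kexec_stop_conversion(bool crash)
{
	/* Stop new private<->shared conversions. */
	conversion_allowed = false;

	/*
	 * Make sure conversion_allowed is cleared before checking
	 * conversions_in_progress.
	 */
	smp_wmb();

	/*
	 * The crash kernel reaches here with interrupts disabled: can't wait
	 * for conversions to finish.  If a race happened, just report it and
	 * proceed.
	 */
	if (!crash) {
		unsigned long timeout = 30 * USEC_PER_SEC;

		/* Wait up to 30 seconds for in-flight conversions to complete. */
		while (atomic_read(&conversions_in_progress) && timeout--)
			udelay(1);
	}

	if (atomic_read(&conversions_in_progress))
		pr_warn("Failed to finish shared<->private conversions\n");
}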
> +
> +void snp_kexec_unshare_mem(void)
> +{
> + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
> + return;
> +
> + /*
> + * Switch back any specific memory regions such as option
> + * ROM regions back to shared so that (re)validation does
> + * not fail when kexec kernel boots.
> + */
> + snp_kexec_unprep_rom_memory();
> +
> + unshare_all_memory(true);
> +
> + unshare_all_bss_decrypted_memory();
> +
> + if (last_address_shd_kexec) {
> + unsigned long size;
> + unsigned int level;
> + pte_t *pte;
> +
> + /*
> + * Switch to using the MSR protocol to change this cpu's
> + * GHCB to private.
> + */
> + boot_ghcb = NULL;
> + /*
> + * All the per-cpu GHCBs have been switched back to private,
> + * so can't do any more GHCB calls to the hypervisor beyond
> + * this point till the kexec kernel starts running.
> + */
> + sev_cfg.ghcbs_initialized = false;
Maybe combine the two comments above into a single comment and then keep
the two assignments together (see the snippet after the end of this function).
> +
> + pr_debug("boot ghcb 0x%lx\n", last_address_shd_kexec);
> + pte = lookup_address(last_address_shd_kexec, &level);
> + size = page_level_size(level);
> + set_pte_enc(pte, level, (void *)last_address_shd_kexec);
> + snp_set_memory_private(last_address_shd_kexec, (size / PAGE_SIZE));
> + }
> +}
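E.g. (the wording is just a suggestion):

		/*
		 * Switch to using the MSR protocol to change this CPU's GHCB
		 * to private.  All the per-CPU GHCBs have been switched back
		 * to private, so no more GHCB calls to the hypervisor are
		 * possible beyond this point until the kexec kernel starts
		 * running.
		 */
		boot_ghcb = NULL;
		sev_cfg.ghcbs_initialized = false;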
> +
> static int snp_set_vmsa(void *va, bool vmsa)
> {
> u64 attrs;
> diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
> index d314e577836d..87b6475358ad 100644
> --- a/arch/x86/mm/mem_encrypt_amd.c
> +++ b/arch/x86/mm/mem_encrypt_amd.c
> @@ -214,7 +214,7 @@ void __init sme_map_bootdata(char *real_mode_data)
> __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
> }
>
> -static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot)
> +unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot)
This change shouldn't be needed anymore if you modify the set_pte_enc()
function.
> {
> unsigned long pfn = 0;
> pgprot_t prot;
> @@ -285,6 +285,17 @@ static void enc_dec_hypercall(unsigned long vaddr, unsigned long size, bool enc)
>
> static int amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool enc)
> {
> + atomic_inc(&conversions_in_progress);
> +
> + /*
> + * Check after bumping conversions_in_progress to serialize
> + * against snp_kexec_stop_conversion().
> + */
> + if (!conversion_allowed) {
> + atomic_dec(&conversions_in_progress);
> + return -EBUSY;
> + }
Duplicate code, please move it to a common file, along with the variables,
such as arch/x86/mm/mem_encrypt.c?
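For example (untested, names are just placeholders), a pair of small helpers
next to the variables in arch/x86/mm/mem_encrypt.c that both the SNP and TDX
status-change callbacks could use:

/* Used to serialize conversions against enc_kexec_stop_conversion(). */
atomic_t conversions_in_progress;
bool conversion_allowed = true;

int enc_conversion_start(void)
{
	atomic_inc(&conversions_in_progress);

	/*
	 * Check after bumping conversions_in_progress to serialize
	 * against enc_kexec_stop_conversion().
	 */
	if (!conversion_allowed) {
		atomic_dec(&conversions_in_progress);
		return -EBUSY;
	}

	return 0;
}

void enc_conversion_finish(void)
{
	atomic_dec(&conversions_in_progress);
}

amd_enc_status_change_prepare()/_finish() (and the TDX equivalents) would then
just call these.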
> +
> /*
> * To maintain the security guarantees of SEV-SNP guests, make sure
> * to invalidate the memory before encryption attribute is cleared.
> @@ -308,6 +319,8 @@ static int amd_enc_status_change_finish(unsigned long vaddr, int npages, bool en
> if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
> enc_dec_hypercall(vaddr, npages << PAGE_SHIFT, enc);
>
> + atomic_dec(&conversions_in_progress);
Ditto here.
Thanks,
Tom
> +
> return 0;
> }
>
> @@ -468,6 +481,9 @@ void __init sme_early_init(void)
> x86_platform.guest.enc_tlb_flush_required = amd_enc_tlb_flush_required;
> x86_platform.guest.enc_cache_flush_required = amd_enc_cache_flush_required;
>
> + x86_platform.guest.enc_kexec_stop_conversion = snp_kexec_stop_conversion;
> + x86_platform.guest.enc_kexec_unshare_mem = snp_kexec_unshare_mem;
> +
> /*
> * AMD-SEV-ES intercepts the RDMSR to read the X2APIC ID in the
> * parallel bringup low level code. That raises #VC which cannot be