[PATCH 2/2] x86/snp: Convert shared memory back to private on kexec

Tom Lendacky thomas.lendacky at amd.com
Wed Feb 21 12:35:13 PST 2024


On 2/19/24 19:18, Ashish Kalra wrote:
> From: Ashish Kalra <ashish.kalra at amd.com>
> 
> SNP guests allocate shared buffers to perform I/O. It is done by
> allocating pages normally from the buddy allocator and converting them
> to shared with set_memory_decrypted().
> 
> The second kernel has no idea what memory is converted this way. It only
> sees E820_TYPE_RAM.
> 
> Accessing shared memory via private mapping will cause unrecoverable RMP
> page-faults.
> 
> On kexec walk direct mapping and convert all shared memory back to
> private. It makes all RAM private again and second kernel may use it
> normally. Additionally for SNP guests convert all bss decrypted section
> pages back to private and switch back ROM regions to shared so that
> their revalidation does not fail during kexec kernel boot.
> 
> The conversion occurs in two steps: stopping new conversions and
> unsharing all memory. In the case of normal kexec, the stopping of
> conversions takes place while scheduling is still functioning. This
> allows for waiting until any ongoing conversions are finished. The
> second step is carried out when all CPUs except one are inactive and
> interrupts are disabled. This prevents any conflicts with code that may
> access shared memory.

It seems like this patch should be broken down into multiple patches, 
with the final patch setting x86_platform.guest.enc_kexec_stop_conversion 
and x86_platform.guest.enc_kexec_unshare_mem.

> 
> Signed-off-by: Ashish Kalra <ashish.kalra at amd.com>
> ---
>   arch/x86/include/asm/probe_roms.h |   1 +
>   arch/x86/include/asm/sev.h        |   8 ++
>   arch/x86/kernel/probe_roms.c      |  16 +++
>   arch/x86/kernel/sev.c             | 211 ++++++++++++++++++++++++++++++
>   arch/x86/mm/mem_encrypt_amd.c     |  18 ++-
>   5 files changed, 253 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/include/asm/probe_roms.h b/arch/x86/include/asm/probe_roms.h
> index 1c7f3815bbd6..d50b67dbff33 100644
> --- a/arch/x86/include/asm/probe_roms.h
> +++ b/arch/x86/include/asm/probe_roms.h
> @@ -6,4 +6,5 @@ struct pci_dev;
>   extern void __iomem *pci_map_biosrom(struct pci_dev *pdev);
>   extern void pci_unmap_biosrom(void __iomem *rom);
>   extern size_t pci_biosrom_size(struct pci_dev *pdev);
> +extern void snp_kexec_unprep_rom_memory(void);
>   #endif
> diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
> index 5b4a1ce3d368..dd236d7e9407 100644
> --- a/arch/x86/include/asm/sev.h
> +++ b/arch/x86/include/asm/sev.h
> @@ -81,6 +81,10 @@ extern void vc_no_ghcb(void);
>   extern void vc_boot_ghcb(void);
>   extern bool handle_vc_boot_ghcb(struct pt_regs *regs);
>   
> +extern atomic_t conversions_in_progress;
> +extern bool conversion_allowed;
> +extern unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot);
> +
>   /* PVALIDATE return codes */
>   #define PVALIDATE_FAIL_SIZEMISMATCH	6
>   
> @@ -213,6 +217,8 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct sn
>   void snp_accept_memory(phys_addr_t start, phys_addr_t end);
>   u64 snp_get_unsupported_features(u64 status);
>   u64 sev_get_status(void);
> +void snp_kexec_unshare_mem(void);
> +void snp_kexec_stop_conversion(bool crash);
>   #else
>   static inline void sev_es_ist_enter(struct pt_regs *regs) { }
>   static inline void sev_es_ist_exit(void) { }
> @@ -241,6 +247,8 @@ static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *in
>   static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }
>   static inline u64 snp_get_unsupported_features(u64 status) { return 0; }
>   static inline u64 sev_get_status(void) { return 0; }
> +void snp_kexec_unshare_mem(void) {}
> +static void snp_kexec_stop_conversion(bool crash) {}
>   #endif
>   
>   #endif
> diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
> index 319fef37d9dc..457f1e5c8d00 100644
> --- a/arch/x86/kernel/probe_roms.c
> +++ b/arch/x86/kernel/probe_roms.c
> @@ -177,6 +177,22 @@ size_t pci_biosrom_size(struct pci_dev *pdev)
>   }
>   EXPORT_SYMBOL(pci_biosrom_size);
>   
> +void snp_kexec_unprep_rom_memory(void)
> +{
> +	unsigned long vaddr, npages, sz;
> +
> +	/*
> +	 * Switch back ROM regions to shared so that their validation
> +	 * does not fail during kexec kernel boot.
> +	 */
> +	vaddr = (unsigned long)__va(video_rom_resource.start);
> +	sz = (system_rom_resource.end + 1) - video_rom_resource.start;
> +	npages = PAGE_ALIGN(sz) >> PAGE_SHIFT;
> +
> +	snp_set_memory_shared(vaddr, npages);
> +}
> +EXPORT_SYMBOL(snp_kexec_unprep_rom_memory);
> +
>   #define ROMSIGNATURE 0xaa55
>   
>   static int __init romsignature(const unsigned char *rom)
> diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
> index c67285824e82..765ab83129eb 100644
> --- a/arch/x86/kernel/sev.c
> +++ b/arch/x86/kernel/sev.c
> @@ -23,6 +23,9 @@
>   #include <linux/platform_device.h>
>   #include <linux/io.h>
>   #include <linux/psp-sev.h>
> +#include <linux/pagewalk.h>
> +#include <linux/cacheflush.h>
> +#include <linux/delay.h>
>   #include <uapi/linux/sev-guest.h>
>   
>   #include <asm/cpu_entry_area.h>
> @@ -40,6 +43,7 @@
>   #include <asm/apic.h>
>   #include <asm/cpuid.h>
>   #include <asm/cmdline.h>
> +#include <asm/probe_roms.h>
>   
>   #define DR7_RESET_VALUE        0x400
>   
> @@ -71,6 +75,13 @@ static struct ghcb *boot_ghcb __section(".data");
>   /* Bitmap of SEV features supported by the hypervisor */
>   static u64 sev_hv_features __ro_after_init;
>   
> +/* Last address to be switched to private during kexec */
> +static unsigned long last_address_shd_kexec;

Maybe kexec_last_address_to_make_private ? Or just something that makes a 
bit more sense.

> +
> +static bool crash_requested;
> +atomic_t conversions_in_progress;
> +bool conversion_allowed = true;
> +
>   /* #VC handler runtime per-CPU data */
>   struct sev_es_runtime_data {
>   	struct ghcb ghcb_page;
> @@ -906,6 +917,206 @@ void snp_accept_memory(phys_addr_t start, phys_addr_t end)
>   	set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE);
>   }
>   
> +static inline bool pte_decrypted(pte_t pte)
> +{
> +	return cc_mkdec(pte_val(pte)) == pte_val(pte);
> +}
> +

This is duplicated in TDX code, arch/x86/coco/tdx/tdx.c, looks like 
something that can go in a header file, maybe mem_encrypt.h.

> +static int set_pte_enc(pte_t *kpte, int level, void *va)
> +{
> +	pgprot_t old_prot, new_prot;
> +	unsigned long pfn, pa, size;
> +	pte_t new_pte;
> +
> +	pfn = pg_level_to_pfn(level, kpte, &old_prot);
> +	if (!pfn)
> +		return 0;

Not sure this matters... a zero PFN is a valid PFN, it's just marked not 
present.  This seems a bit overdone to me, see the end of this function to 
see if the more compact version works.

> +
> +	new_prot = old_prot;
> +	pgprot_val(new_prot) |= _PAGE_ENC;
> +	pa = pfn << PAGE_SHIFT;
> +	size = page_level_size(level);
> +
> +	/*
> +	 * Change the physical page attribute from C=0 to C=1. Flush the
> +	 * caches to ensure that data gets accessed with the correct C-bit.
> +	 */
> +	clflush_cache_range(va, size);
> +
> +	/* Change the page encryption mask. */
> +	new_pte = pfn_pte(pfn, new_prot);
> +	set_pte_atomic(kpte, new_pte);
> +
> +	return 1;
> +}

/*
 * Set the encryption attribute on the page-table entry @kpte, which maps
 * @va at page-table level @level.
 *
 * Returns false without touching the entry if it is empty (pte_none()),
 * and true once the entry has been rewritten.
 */
static bool set_pte_enc(pte_t *kpte, int level, void *va)
{
	pte_t new_pte;

	if (pte_none(*kpte))
		return false;

	/* Flush the range only when the mapping is present, before changing it. */
	if (pte_present(*kpte))
		clflush_cache_range(va, page_level_size(level));

	/*
	 * The cc_mk*() helpers take the raw entry value (pte_decrypted() uses
	 * cc_mkdec() on pte_val() the same way), so unwrap and rewrap the pte_t.
	 */
	new_pte = __pte(cc_mkenc(pte_val(*kpte)));
	set_pte_atomic(kpte, new_pte);

	return true;
}

> +
> +static int unshare_pte(pte_t *pte, unsigned long addr, int pages, int level)

Maybe a better name is make_pte_private ?

And if you are only returning 0 or 1, it begs to be a bool.

> +{
> +	struct sev_es_runtime_data *data;
> +	struct ghcb *ghcb;
> +
> +	data = this_cpu_read(runtime_data);
> +	ghcb = &data->ghcb_page;
> +
> +	/*
> +	 * check for GHCB for being part of a PMD range.
> +	 */

Single line comment.

> +	if ((unsigned long)ghcb >= addr &&
> +	    (unsigned long)ghcb <= (addr + (pages * PAGE_SIZE))) {
> +		/*
> +		 * setup last address to be made private so that this GHCB
> +		 * is made private at the end of unshared loop so that RMP
> +		 * does not possibly getting PSMASHed from using the
> +		 * MSR protocol.
> +		 */

Please clarify this comment a bit more...  it's a bit hard to follow.

> +		pr_debug("setting boot_ghcb to NULL for this cpu ghcb\n");
> +		last_address_shd_kexec = addr;
> +		return 1;
> +	}

Add a blank line here.

> +	if (!set_pte_enc(pte, level, (void *)addr))
> +		return 0;

Add a blank line here.

> +	snp_set_memory_private(addr, pages);
> +
> +	return 1;
> +}
> +
> +static void unshare_all_memory(bool unmap)

Unused input, looks like this can be removed.

> +{
> +	unsigned long addr, end;
> +
> +	/*
> +	 * Walk direct mapping and convert all shared memory back to private,
> +	 */
> +
> +	addr = PAGE_OFFSET;
> +	end  = PAGE_OFFSET + get_max_mapped();
> +
> +	while (addr < end) {
> +		unsigned long size;
> +		unsigned int level;
> +		pte_t *pte;
> +
> +		pte = lookup_address(addr, &level);
> +		size = page_level_size(level);
> +
> +		/*
> +		 * pte_none() check is required to skip physical memory holes in direct mapped.
> +		 */
> +		if (pte && pte_decrypted(*pte) && !pte_none(*pte)) {
> +			int pages = size / PAGE_SIZE;
> +
> +			if (!unshare_pte(pte, addr, pages, level)) {
> +				pr_err("Failed to unshare range %#lx-%#lx\n",
> +				       addr, addr + size);
> +			}
> +
> +		}
> +
> +		addr += size;
> +	}
> +	__flush_tlb_all();

This is also mostly in the TDX code and begs to be made common and not 
copied... please figure out a way to do the "different" things through a 
registered callback or such.

> +
> +}
> +
> +static void unshare_all_bss_decrypted_memory(void)
> +{
> +	unsigned long vaddr, vaddr_end;
> +	unsigned long size;
> +	unsigned int level;
> +	unsigned int npages;
> +	pte_t *pte;
> +
> +	vaddr = (unsigned long)__start_bss_decrypted;
> +	vaddr_end = (unsigned long)__start_bss_decrypted_unused;
> +	npages = (vaddr_end - vaddr) >> PAGE_SHIFT;
> +	for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) {
> +		pte = lookup_address(vaddr, &level);
> +		if (!pte || !pte_decrypted(*pte) || pte_none(*pte))
> +			continue;
> +
> +		size = page_level_size(level);
> +		set_pte_enc(pte, level, (void *)vaddr);
> +	}
> +	vaddr = (unsigned long)__start_bss_decrypted;
> +	snp_set_memory_private(vaddr, npages);
> +}
> +
> +void snp_kexec_stop_conversion(bool crash)
> +{
> +	/* Stop new private<->shared conversions */
> +	conversion_allowed = false;
> +	crash_requested = crash;
> +
> +	/*
> +	 * Make sure conversion_allowed is cleared before checking
> +	 * conversions_in_progress.
> +	 */
> +	barrier();

This should be smp_wmb().

> +
> +	/*
> +	 * Crash kernel reaches here with interrupts disabled: can't wait for
> +	 * conversions to finish.
> +	 *
> +	 * If race happened, just report and proceed.
> +	 */
> +	if (!crash) {
> +		unsigned long timeout;
> +
> +		/*
> +		 * Wait for in-flight conversions to complete.
> +		 *
> +		 * Do not wait more than 30 seconds.
> +		 */
> +		timeout = 30 * USEC_PER_SEC;
> +		while (atomic_read(&conversions_in_progress) && timeout--)
> +			udelay(1);
> +	}
> +
> +	if (atomic_read(&conversions_in_progress))
> +		pr_warn("Failed to finish shared<->private conversions\n");
> +}

Again, same code as in TDX (except for the crash_requested, but I don't 
see that used anywhere), so please make it common.

> +
> +void snp_kexec_unshare_mem(void)
> +{
> +	if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
> +		return;
> +
> +	/*
> +	 * Switch back any specific memory regions such as option
> +	 * ROM regions back to shared so that (re)validation does
> +	 * not fail when kexec kernel boots.
> +	 */
> +	snp_kexec_unprep_rom_memory();
> +
> +	unshare_all_memory(true);
> +
> +	unshare_all_bss_decrypted_memory();
> +
> +	if (last_address_shd_kexec) {
> +		unsigned long size;
> +		unsigned int level;
> +		pte_t *pte;
> +
> +		/*
> +		 * Switch to using the MSR protocol to change this cpu's
> +		 * GHCB to private.
> +		 */
> +		boot_ghcb = NULL;
> +		/*
> +		 * All the per-cpu GHCBs have been switched back to private,
> +		 * so can't do any more GHCB calls to the hypervisor beyond
> +		 * this point till the kexec kernel starts running.
> +		 */
> +		sev_cfg.ghcbs_initialized = false;

Maybe combine the two comments above into a single comment and then keep 
the two assignments together.

> +
> +		pr_debug("boot ghcb 0x%lx\n", last_address_shd_kexec);
> +		pte = lookup_address(last_address_shd_kexec, &level);
> +		size = page_level_size(level);
> +		set_pte_enc(pte, level, (void *)last_address_shd_kexec);
> +		snp_set_memory_private(last_address_shd_kexec, (size / PAGE_SIZE));
> +	}
> +}
> +
>   static int snp_set_vmsa(void *va, bool vmsa)
>   {
>   	u64 attrs;
> diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c
> index d314e577836d..87b6475358ad 100644
> --- a/arch/x86/mm/mem_encrypt_amd.c
> +++ b/arch/x86/mm/mem_encrypt_amd.c
> @@ -214,7 +214,7 @@ void __init sme_map_bootdata(char *real_mode_data)
>   	__sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true);
>   }
>   
> -static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot)
> +unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot)

This change shouldn't be needed anymore if you modify the set_pte_enc() 
function.

>   {
>   	unsigned long pfn = 0;
>   	pgprot_t prot;
> @@ -285,6 +285,17 @@ static void enc_dec_hypercall(unsigned long vaddr, unsigned long size, bool enc)
>   
>   static int amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool enc)
>   {
> +	atomic_inc(&conversions_in_progress);
> +
> +	/*
> +	 * Check after bumping conversions_in_progress to serialize
> +	 * against snp_kexec_stop_conversion().
> +	 */
> +	if (!conversion_allowed) {
> +		atomic_dec(&conversions_in_progress);
> +		return -EBUSY;
> +	}

Duplicate code; please move it, along with the variables, to a common 
file such as arch/x86/mm/mem_encrypt.c.

> +
>   	/*
>   	 * To maintain the security guarantees of SEV-SNP guests, make sure
>   	 * to invalidate the memory before encryption attribute is cleared.
> @@ -308,6 +319,8 @@ static int amd_enc_status_change_finish(unsigned long vaddr, int npages, bool en
>   	if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
>   		enc_dec_hypercall(vaddr, npages << PAGE_SHIFT, enc);
>   
> +	atomic_dec(&conversions_in_progress);

Ditto here.

Thanks,
Tom

> +
>   	return 0;
>   }
>   
> @@ -468,6 +481,9 @@ void __init sme_early_init(void)
>   	x86_platform.guest.enc_tlb_flush_required    = amd_enc_tlb_flush_required;
>   	x86_platform.guest.enc_cache_flush_required  = amd_enc_cache_flush_required;
>   
> +	x86_platform.guest.enc_kexec_stop_conversion = snp_kexec_stop_conversion;
> +	x86_platform.guest.enc_kexec_unshare_mem     = snp_kexec_unshare_mem;
> +
>   	/*
>   	 * AMD-SEV-ES intercepts the RDMSR to read the X2APIC ID in the
>   	 * parallel bringup low level code. That raises #VC which cannot be



More information about the kexec mailing list