[PATCHv10 10/18] x86/tdx: Convert shared memory back to private on kexec
Borislav Petkov
bp at alien8.de
Sun May 5 05:13:19 PDT 2024
On Tue, Apr 09, 2024 at 02:30:02PM +0300, Kirill A. Shutemov wrote:
> TDX guests allocate shared buffers to perform I/O. It is done by
> allocating pages normally from the buddy allocator and converting them
> to shared with set_memory_decrypted().
>
> The second kernel has no idea what memory is converted this way. It only
"The kexec-ed kernel..."
is more precise.
> sees E820_TYPE_RAM.
>
> Accessing shared memory via private mapping is fatal. It leads to
> unrecoverable TD exit.
>
> On kexec walk direct mapping and convert all shared memory back to
> private. It makes all RAM private again and second kernel may use it
> normally.
>
> The conversion occurs in two steps: stopping new conversions and
> unsharing all memory. In the case of normal kexec, the stopping of
> conversions takes place while scheduling is still functioning. This
> allows for waiting until any ongoing conversions are finished. The
> second step is carried out when all CPUs except one are inactive and
> interrupts are disabled. This prevents any conflicts with code that may
> access shared memory.
This is the missing bit of information I was looking for in the previous
patch. This needs to be documented in the code.
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov at linux.intel.com>
> Reviewed-by: Rick Edgecombe <rick.p.edgecombe at intel.com>
> Reviewed-by: Kai Huang <kai.huang at intel.com>
> Tested-by: Tao Liu <ltao at redhat.com>
> ---
> arch/x86/coco/tdx/tdx.c | 72 +++++++++++++++++++++++++++++++
> arch/x86/include/asm/pgtable.h | 5 +++
> arch/x86/include/asm/set_memory.h | 3 ++
> arch/x86/mm/pat/set_memory.c | 35 +++++++++++++--
> 4 files changed, 112 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c
> index 979891e97d83..59776ce1c1d7 100644
> --- a/arch/x86/coco/tdx/tdx.c
> +++ b/arch/x86/coco/tdx/tdx.c
> @@ -7,6 +7,7 @@
> #include <linux/cpufeature.h>
> #include <linux/export.h>
> #include <linux/io.h>
> +#include <linux/kexec.h>
> #include <asm/coco.h>
> #include <asm/tdx.h>
> #include <asm/vmx.h>
> @@ -14,6 +15,7 @@
> #include <asm/insn.h>
> #include <asm/insn-eval.h>
> #include <asm/pgtable.h>
> +#include <asm/set_memory.h>
>
> /* MMIO direction */
> #define EPT_READ 0
> @@ -831,6 +833,73 @@ static int tdx_enc_status_change_finish(unsigned long vaddr, int numpages,
> return 0;
> }
>
> +/* Stop new private<->shared conversions */
> +static void tdx_kexec_stop_conversion(bool crash)
> +{
> + /*
> + * Crash kernel reaches here with interrupts disabled: can't wait for
> + * conversions to finish.
> + *
> + * If race happened, just report and proceed.
> + */
> + bool wait_for_lock = !crash;
You don't need that bool - use crash.
> +
> + if (!stop_memory_enc_conversion(wait_for_lock))
> + pr_warn("Failed to stop shared<->private conversions\n");
> +}
> +
> +static void tdx_kexec_unshare_mem(void)
> +{
> + unsigned long addr, end;
> + long found = 0, shared;
> +
> + /*
> + * Walk direct mapping and convert all shared memory back to private,
> + */
Over the function name and end with a fullstop.
> +
> + addr = PAGE_OFFSET;
> + end = PAGE_OFFSET + get_max_mapped();
> +
> + while (addr < end) {
> + unsigned long size;
> + unsigned int level;
> + pte_t *pte;
> +
> + pte = lookup_address(addr, &level);
> + size = page_level_size(level);
> +
> + if (pte && pte_decrypted(*pte)) {
> + int pages = size / PAGE_SIZE;
> +
> + /*
> + * Touching memory with shared bit set triggers implicit
> + * conversion to shared.
> + *
> + * Make sure nobody touches the shared range from
> + * now on.
> + */
lockdep_assert_irqs_disabled() ?
> + set_pte(pte, __pte(0));
> +
> + if (!tdx_enc_status_changed(addr, pages, true)) {
> + pr_err("Failed to unshare range %#lx-%#lx\n",
> + addr, addr + size);
Why are we printing something here if we're not really acting up on it?
Who should care here?
> + }
> +
> + found += pages;
> + }
> +
> + addr += size;
> + }
> +
> + __flush_tlb_all();
> +
> + shared = atomic_long_read(&nr_shared);
> + if (shared != found) {
> + pr_err("shared page accounting is off\n");
> + pr_err("nr_shared = %ld, nr_found = %ld\n", shared, found);
> + }
Ok, we failed unsharing. And yet we don't do anything.
But if we fail unsharing, we will die on a unrecoverable TD exit or
whatever.
Why aren't we failing kexec here?
> +}
> +
> void __init tdx_early_init(void)
> {
> struct tdx_module_args args = {
> @@ -890,6 +959,9 @@ void __init tdx_early_init(void)
> x86_platform.guest.enc_cache_flush_required = tdx_cache_flush_required;
> x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required;
>
> + x86_platform.guest.enc_kexec_stop_conversion = tdx_kexec_stop_conversion;
> + x86_platform.guest.enc_kexec_unshare_mem = tdx_kexec_unshare_mem;
> +
> /*
> * TDX intercepts the RDMSR to read the X2APIC ID in the parallel
> * bringup low level code. That raises #VE which cannot be handled
> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> index 315535ffb258..17f4d97fae06 100644
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -140,6 +140,11 @@ static inline int pte_young(pte_t pte)
> return pte_flags(pte) & _PAGE_ACCESSED;
> }
>
> +static inline bool pte_decrypted(pte_t pte)
> +{
> + return cc_mkdec(pte_val(pte)) == pte_val(pte);
> +}
> +
> #define pmd_dirty pmd_dirty
> static inline bool pmd_dirty(pmd_t pmd)
> {
> diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
> index 9aee31862b4a..44b6d711296c 100644
> --- a/arch/x86/include/asm/set_memory.h
> +++ b/arch/x86/include/asm/set_memory.h
> @@ -49,8 +49,11 @@ int set_memory_wb(unsigned long addr, int numpages);
> int set_memory_np(unsigned long addr, int numpages);
> int set_memory_p(unsigned long addr, int numpages);
> int set_memory_4k(unsigned long addr, int numpages);
> +
> +bool stop_memory_enc_conversion(bool wait);
> int set_memory_encrypted(unsigned long addr, int numpages);
> int set_memory_decrypted(unsigned long addr, int numpages);
> +
> int set_memory_np_noalias(unsigned long addr, int numpages);
> int set_memory_nonglobal(unsigned long addr, int numpages);
> int set_memory_global(unsigned long addr, int numpages);
> diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
> index 6c49f69c0368..21835339c0e6 100644
> --- a/arch/x86/mm/pat/set_memory.c
> +++ b/arch/x86/mm/pat/set_memory.c
> @@ -2188,12 +2188,41 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc)
> return ret;
> }
>
<--- insert comment here what this thing is guarding.
> +static DECLARE_RWSEM(mem_enc_lock);
> +
> +/*
> + * Stop new private<->shared conversions.
> + *
> + * Taking the exclusive mem_enc_lock waits for in-flight conversions to complete.
> + * The lock is not released to prevent new conversions from being started.
> + *
> + * If sleep is not allowed, as in a crash scenario, try to take the lock.
> + * Failure indicates that there is a race with the conversion.
> + */
> +bool stop_memory_enc_conversion(bool wait)
This is a global function which means, it should be called:
set_memory_enc_stop_conversion()
or so. With the proper prefix and so on.
> +{
> + if (!wait)
> + return down_write_trylock(&mem_enc_lock);
> +
> + down_write(&mem_enc_lock);
> +
> + return true;
> +}
> +
> static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
> {
> - if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
> - return __set_memory_enc_pgtable(addr, numpages, enc);
> + int ret = 0;
>
> - return 0;
> + if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) {
> + if (!down_read_trylock(&mem_enc_lock))
> + return -EBUSY;
This function is called on SEV* and HyperV and the respective folks need
to at least ack this new approach.
I see Ashish's patch adds the respective stuff:
https://lore.kernel.org/r/c24516a4636a36d57186ea90ae26495b3c1cfb8b.1714148366.git.ashish.kalra@amd.com
which leaves HyperV. You'd need to Cc them on the next submission.
Thx.
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
More information about the kexec
mailing list