[PATCH v8 12/13] arm64: kernel: Add support for hibernate/suspend-to-disk

Catalin Marinas catalin.marinas at arm.com
Tue Apr 26 07:39:45 PDT 2016


On Mon, Apr 25, 2016 at 06:10:49PM +0100, James Morse wrote:
> Add support for hibernate/suspend-to-disk.
> 
> Suspend borrows code from cpu_suspend() to write cpu state onto the stack,
> before calling swsusp_save() to save the memory image.
> 
> Restore creates a set of temporary page tables, covering only the
> linear map, copies the restore code to a 'safe' page, then uses the copy to
> restore the memory image. The copied code executes in the lower half of the
> address space, and once complete, restores the original kernel's page
> tables. It then calls into cpu_resume(), and follows the normal
> cpu_suspend() path back into the suspend code.
> 
> To restore a kernel using KASLR, the addresses of the page tables and
> cpu_resume() are stored in the hibernate arch-header, and the el2
> vectors are pivoted via the 'safe' page in low memory.
> 
> Signed-off-by: James Morse <james.morse at arm.com>
> Tested-by: Kevin Hilman <khilman at baylibre.com> # Tested on Juno R2
> 
> CC: Catalin Marinas <catalin.marinas at arm.com>

Nitpick: please keep the Cc: tags in the same block as the other tags
(Signed-off-by etc.; IOW no empty line between them). Though at some
point you'll change this to a Reviewed-by tag.
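
I.e.:

	Signed-off-by: James Morse <james.morse at arm.com>
	Tested-by: Kevin Hilman <khilman at baylibre.com> # Tested on Juno R2
	Cc: Catalin Marinas <catalin.marinas at arm.com>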

> --- /dev/null
> +++ b/arch/arm64/kernel/hibernate-asm.S
> @@ -0,0 +1,157 @@
> +/*
> + * Hibernate low-level support
> + *
> + * Copyright (C) 2016 ARM Ltd.
> + * Author:	James Morse <james.morse at arm.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program.  If not, see <http://www.gnu.org/licenses/>.
> + */
> +#include <linux/linkage.h>
> +#include <linux/errno.h>
> +
> +#include <asm/asm-offsets.h>
> +#include <asm/assembler.h>
> +#include <asm/cputype.h>
> +#include <asm/memory.h>
> +#include <asm/page.h>
> +#include <asm/virt.h>
> +
> +/*
> + * Corrupt memory.

Maybe a better headline for this function ;).

> + *
> + * Loads temporary page tables then restores the memory image.
> + * Finally branches to cpu_resume() to restore the state saved by
> + * swsusp_arch_suspend().
> + *
> + * Because this code has to be copied to a safe_page, it can't call out to
> + * other functions by PC-relative address. Also remember that it may be
> + * mid-way through over-writing other functions. For this reason it contains
> + * code from flush_icache_range() and uses the copy_page() macro.
> + *
> + * All of memory gets written to, including code. We need to clean the kernel
> + * text to the Point of Coherence (PoC) before secondary cores can be booted.
> + * Because the kernel modules and executable pages mapped to user space are
> + * also written as data, we clean all pages we touch to the Point of
> + * Unification (PoU).
> + *
> + * We use el2_setup() to reconfigure el2. This code needs cleaning to
> + * PoC by VA, but is called via its physical address, as we left el2
> + * with the MMU turned off.
> + *
> + * x0: physical address of temporary page tables
> + * x1: physical address of swapper page tables
> + * x2: address of cpu_resume
> + * x3: linear map address of restore_pblist in the current kernel
> + * x4: physical address of __hyp_stub_vectors, or 0
> + */
> +.pushsection    ".hibernate_exit.text", "ax"
> +ENTRY(swsusp_arch_suspend_exit)
> +	/* Temporary page tables are a copy, so no need for a trampoline here */
> +	msr	ttbr1_el1, x0
> +	isb
> +	tlbi	vmalle1is	/* invalidate intermediate caching entries */
> +	dsb	ish

I think you need an ISB here as well, since you are continuing with the
new page tables (without an ERET or taking an exception).
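
I.e. something like (just a sketch of the suggested change):

	msr	ttbr1_el1, x0
	isb
	tlbi	vmalle1is	/* invalidate intermediate caching entries */
	dsb	ish
	isb			/* ensure the new tables are in use before continuing */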

> +
> +	mov	x21, x1
> +	mov	x30, x2		/* el2_setup() will eret to the lr directly */
> +	mov	x24, x4
> +
> +	/* walk the restore_pblist and use copy_page() to over-write memory */
> +	mov	x19, x3
> +
> +1:	ldr	x10, [x19, #HIBERN_PBE_ORIG]
> +	mov	x0, x10
> +	ldr	x1, [x19, #HIBERN_PBE_ADDR]
> +
> +	copy_page	x0, x1, x2, x3, x4, x5, x6, x7, x8, x9
> +
> +	add	x1, x10, #PAGE_SIZE
> +	/* Clean the copied page to PoU - based on flush_icache_range() */
> +	dcache_line_size x2, x3
> +	sub	x3, x2, #1
> +	bic	x4, x10, x3
> +2:	dc	cvau, x4	/* clean D line / unified line */
> +	add	x4, x4, x2
> +	cmp	x4, x1
> +	b.lo	2b
> +
> +	ldr	x19, [x19, #HIBERN_PBE_NEXT]
> +	cbnz	x19, 1b
> +	dsb	ish		/* wait for PoU cleaning to finish */
> +
> +	/* switch to the restored kernel's page tables */
> +	msr	ttbr1_el1, x21  /* physical address of swapper page tables */
> +	isb
> +	tlbi	vmalle1is	/* invalidate intermediate caching entries */
> +	ic	ialluis
> +	dsb	ish		/* wait for icache invalidate to finish */
> +	isb

It looks like you have an ISB here but not above.

> +
> +	cbz	x24, 3f		/* Do we need to re-initialise EL2? */
> +	hvc	#0
> +3:	ret
> +
> +	.ltorg
> +ENDPROC(swsusp_arch_suspend_exit)

[...]

> --- /dev/null
> +++ b/arch/arm64/kernel/hibernate.c
> @@ -0,0 +1,444 @@

[...]

> +/*
> + * Copies length bytes, starting at src_start, into a new page,
> + * performs cache maintenance, then maps it at the specified low
> + * address as executable.
> + *
> + * This is used by hibernate to copy the code it needs to execute when
> + * overwriting the kernel text. This function generates a new set of page
> + * tables, which it loads into ttbr0.
> + *
> + * Length is provided as we probably only want 4K of data, even on a 64K
> + * page system.
> + */
> +static int create_safe_exec_page(void *src_start, size_t length,
> +				 unsigned long dst_addr,
> +				 phys_addr_t *phys_dst_addr,
> +				 void *(*allocator)(gfp_t mask),
> +				 gfp_t mask)
> +{
> +	int rc = 0;
> +	pgd_t *pgd;
> +	pud_t *pud;
> +	pmd_t *pmd;
> +	pte_t *pte;
> +	unsigned long dst = (unsigned long)allocator(mask);
> +
> +	if (!dst) {
> +		rc = -ENOMEM;
> +		goto out;
> +	}
> +
> +	memcpy((void *)dst, src_start, length);
> +	flush_icache_range(dst, dst + length);
> +
> +	pgd = pgd_offset_raw(allocator(mask), dst_addr);
> +	if (pgd_none(*pgd)) {
> +		pud = allocator(mask);
> +		if (!pud) {
> +			rc = -ENOMEM;
> +			goto out;
> +		}
> +		set_pgd(pgd, __pgd(virt_to_phys(pud) | PUD_TYPE_TABLE));
> +	}
> +
> +	pud = pud_offset(pgd, dst_addr);
> +	if (pud_none(*pud)) {
> +		pmd = allocator(mask);
> +		if (!pmd) {
> +			rc = -ENOMEM;
> +			goto out;
> +		}
> +		set_pud(pud, __pud(virt_to_phys(pmd) | PUD_TYPE_TABLE));
> +	}
> +
> +	pmd = pmd_offset(pud, dst_addr);
> +	if (pmd_none(*pmd)) {
> +		pte = allocator(mask);
> +		if (!pte) {
> +			rc = -ENOMEM;
> +			goto out;
> +		}
> +		set_pmd(pmd, __pmd(virt_to_phys(pte) | PMD_TYPE_TABLE));
> +	}

It looks fine now. A potential improvement would be to use
p*d_populate() instead of set_p*d(), though it needs an init_mm. It's up
to you (I have a similar comment further down).
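
As a sketch, the pud level above would then read something like
(untested; it needs init_mm):

	pud = pud_offset(pgd, dst_addr);
	if (pud_none(*pud)) {
		pmd = allocator(mask);
		if (!pmd) {
			rc = -ENOMEM;
			goto out;
		}
		pud_populate(&init_mm, pud, pmd);	/* replaces set_pud() */
	}

and similarly pgd_populate()/pmd_populate_kernel() for the other two
levels.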

> +
> +	pte = pte_offset_kernel(pmd, dst_addr);
> +	set_pte(pte, __pte(virt_to_phys((void *)dst) |
> +			 pgprot_val(PAGE_KERNEL_EXEC)));
> +
> +	/* Load our new page tables */
> +	asm volatile("msr	ttbr0_el1, %0;"
> +		     "isb;"
> +		     "tlbi	vmalle1is;"
> +		     "dsb	ish" : : "r"(virt_to_phys(pgd)));

An ISB is needed after the TLBI *if* we are going to execute code from
the new mapping *without* an ERET (or taking an exception).

> +
> +	*phys_dst_addr = virt_to_phys((void *)dst);
> +
> +out:
> +	return rc;
> +}
> +
> +
> +int swsusp_arch_suspend(void)
> +{
> +	int ret = 0;
> +	unsigned long flags;
> +	struct sleep_stack_data state;
> +
> +	local_dbg_save(flags);
> +
> +	if (__cpu_suspend_enter(&state)) {
> +		ret = swsusp_save();
> +	} else {
> +		/* Clean kernel to PoC for secondary core startup */
> +		__flush_dcache_area(LMADDR(KERNEL_START), KERNEL_END - KERNEL_START);
> +
> +		/*
> +		 * Tell the hibernation core that we've just restored
> +		 * the memory
> +		 */
> +		in_suspend = 0;
> +
> +		__cpu_suspend_exit();
> +	}
> +
> +	local_dbg_restore(flags);
> +
> +	return ret;
> +}
> +
> +static int copy_pte(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long start,
> +		    unsigned long end)
> +{
> +	pte_t *src_pte;
> +	pte_t *dst_pte;
> +	unsigned long addr = start;
> +
> +	dst_pte = (pte_t *)get_safe_page(GFP_ATOMIC);
> +	if (!dst_pte)
> +		return -ENOMEM;
> +	set_pmd(dst_pmd, __pmd(virt_to_phys(dst_pte) | PMD_TYPE_TABLE));
> +	dst_pte = pte_offset_kernel(dst_pmd, start);
> +
> +	src_pte = pte_offset_kernel(src_pmd, start);
> +	do {
> +		if (pte_val(*src_pte))
> +			set_pte(dst_pte, __pte(pte_val(*src_pte) & ~PTE_RDONLY));
> +	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
> +
> +	return 0;
> +}
> +
> +static int copy_pmd(pud_t *dst_pud, pud_t *src_pud, unsigned long start,
> +		    unsigned long end)
> +{
> +	pmd_t *src_pmd;
> +	pmd_t *dst_pmd;
> +	unsigned long next;
> +	unsigned long addr = start;
> +
> +	dst_pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
> +	if (!dst_pmd)
> +		return -ENOMEM;
> +	set_pud(dst_pud, __pud(virt_to_phys(dst_pmd) | PUD_TYPE_TABLE));
> +	dst_pmd = pmd_offset(dst_pud, start);
> +
> +	src_pmd = pmd_offset(src_pud, start);
> +	do {
> +		next = pmd_addr_end(addr, end);
> +		if (!pmd_val(*src_pmd))
> +			continue;
> +		if (pmd_table(*src_pmd)) {
> +			if (copy_pte(dst_pmd, src_pmd, addr, next))
> +				return -ENOMEM;
> +		} else {
> +			set_pmd(dst_pmd,
> +				__pmd(pmd_val(*src_pmd) & ~PMD_SECT_RDONLY));
> +		}
> +	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
> +
> +	return 0;
> +}
> +
> +static int copy_pud(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long start,
> +		    unsigned long end)
> +{
> +	pud_t *dst_pud;
> +	pud_t *src_pud;
> +	unsigned long next;
> +	unsigned long addr = start;
> +
> +	dst_pud = (pud_t *)get_safe_page(GFP_ATOMIC);
> +	if (!dst_pud)
> +		return -ENOMEM;
> +	set_pgd(dst_pgd, __pgd(virt_to_phys(dst_pud) | PUD_TYPE_TABLE));
> +	dst_pud = pud_offset(dst_pgd, start);

I think this is allocating pages even if we don't have a pud. So we
need:

	if (pgd_none(*dst_pgd)) {
		dst_pud = (pud_t *)get_safe_page(GFP_ATOMIC);
		if (!dst_pud)
			return -ENOMEM;
		set_pgd(dst_pgd, __pgd(virt_to_phys(dst_pud) | PUD_TYPE_TABLE));
	}
	dst_pud = pud_offset(dst_pgd, start);

With nopud, pgd_none() is always 0, so we simply skip the allocation
(inspired by the pud_alloc() macro).

You could also use pgd_populate() here, though you need init_mm
(alternatively, there is __pgd_populate()).
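
With __pgd_populate(), the set_pgd() line above would become a
one-liner, e.g. (untested):

	__pgd_populate(dst_pgd, virt_to_phys(dst_pud), PUD_TYPE_TABLE);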

> +
> +	src_pud = pud_offset(src_pgd, start);
> +	do {
> +		next = pud_addr_end(addr, end);
> +		if (!pud_val(*src_pud))
> +			continue;

As for the pgd, I would use pud_none(*src_pud) here.
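
I.e.:

	if (pud_none(*src_pud))
		continue;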

> +		if (pud_table(*(src_pud))) {
> +			if (copy_pmd(dst_pud, src_pud, addr, next))
> +				return -ENOMEM;
> +		} else {
> +			set_pud(dst_pud,
> +				__pud(pud_val(*src_pud) & ~PMD_SECT_RDONLY));

Why does PMD_SECT_RDONLY need clearing? You may want to add a comment.
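
If the reason is that the restore code must be able to write the image
back through these copied tables, a comment along these lines would do
(assuming that is indeed the intent):

	/* make the mapping writable, so the image can be restored over it */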

The same comments apply to the copy_pmd/copy_pte functions above (I just
reviewed this part in reverse ;)).

> +		}
> +	} while (dst_pud++, src_pud++, addr = next, addr != end);
> +
> +	return 0;
> +}
> +
> +static int copy_page_tables(pgd_t *dst_pgd, unsigned long start,
> +			    unsigned long end)
> +{
> +	unsigned long next;
> +	unsigned long addr = start;
> +	pgd_t *src_pgd = pgd_offset_k(start);
> +
> +	dst_pgd = pgd_offset_raw(dst_pgd, start);
> +	do {
> +		next = pgd_addr_end(addr, end);
> +		if (!pgd_val(*src_pgd))
> +			continue;

The outcome may be the same but I would rather use:

		if (pgd_none(*src_pgd))
			continue;

With nopud, we won't even bother checking, as pgd_none() is always 0.

> +		if (copy_pud(dst_pgd, src_pgd, addr, next))
> +			return -ENOMEM;
> +	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
> +
> +	return 0;
> +}

[...]

-- 
Catalin
