[PATCH v4 6/6] arm64: mm: set the contiguous bit for kernel mappings where appropriate
Ard Biesheuvel
ard.biesheuvel at linaro.org
Wed Mar 8 02:57:22 PST 2017
On 7 March 2017 at 17:46, Mark Rutland <mark.rutland at arm.com> wrote:
> Hi,
>
> On Sat, Mar 04, 2017 at 02:30:48PM +0000, Ard Biesheuvel wrote:
>> This is the third attempt at enabling the use of contiguous hints for
>> kernel mappings. The most recent attempt 0bfc445dec9d was reverted after
>> it turned out that updating permission attributes on live contiguous ranges
>> may result in TLB conflicts. So this time, the contiguous hint is not set
>> for .rodata or for the linear alias of .text/.rodata, both of which are
>> mapped read-write initially, and remapped read-only at a later stage.
>> (Note that the latter region could instead be unmapped and remapped with
>> updated permission attributes: while live, it is only mapped for the
>> convenience of the hibernation code. But that also means its TLB footprint
>> is negligible anyway, so it is hardly worth the bother.)
>>
>> This enables the following contiguous range sizes for the virtual mapping
>> of the kernel image, and for the linear mapping:
>>
>>    granule size |  cont PTE  |  cont PMD  |
>>   --------------+------------+------------+
>>           4 KB  |    64 KB   |    32 MB   |
>>          16 KB  |     2 MB   |     1 GB*  |
>>          64 KB  |     2 MB   |    16 GB*  |
>>
>> * Only when built for 3 or more levels of translation. This is due to the
>> fact that a 2 level configuration only consists of PGDs and PTEs, and the
>> added complexity of dealing with folded PMDs is not justified considering
>> that 16 GB contiguous ranges are likely to be ignored by the hardware (and
>> 16k/2 levels is a niche configuration)
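
For completeness, since the table only gives the end results: these follow
directly from the CONT_PTES/CONT_PMDS definitions in pgtable-hwdef.h, i.e.
CONT_PTE_SIZE = CONT_PTES * PAGE_SIZE and CONT_PMD_SIZE = CONT_PMDS * PMD_SIZE:

	/* 4k granule:   CONT_PTES = 16,   16 * 4 KB   = 64 KB
	 *               CONT_PMDS = 16,   16 * 2 MB   = 32 MB
	 * 16k granule:  CONT_PTES = 128,  128 * 16 KB = 2 MB
	 *               CONT_PMDS = 32,   32 * 32 MB  = 1 GB
	 * 64k granule:  CONT_PTES = 32,   32 * 64 KB  = 2 MB
	 *               CONT_PMDS = 32,   32 * 512 MB = 16 GB
	 */
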
>>
>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel at linaro.org>
>> ---
>> arch/arm64/mm/mmu.c | 86 ++++++++++++++------
>> 1 file changed, 63 insertions(+), 23 deletions(-)
>>
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index 0612573ef869..d0ae2f1f44fc 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -109,8 +109,10 @@ static bool pgattr_change_is_safe(u64 old, u64 new)
>> static void alloc_init_pte(pmd_t *pmd, unsigned long addr,
>> unsigned long end, unsigned long pfn,
>> pgprot_t prot,
>> - phys_addr_t (*pgtable_alloc)(void))
>> + phys_addr_t (*pgtable_alloc)(void),
>> + bool may_use_cont)
>
> Maybe we should invert this as single_pages_only, to keep the same
> polarity as page_mappings_only?
>
> That might make the calls to __create_pgd_mapping() in __map_memblock()
> look a little nicer, for instance.
>
Good point
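
If I read you correctly, that would turn the prototype into something like
the below (just a sketch, names not final):

	static void alloc_init_pte(pmd_t *pmd, unsigned long addr,
				   unsigned long end, unsigned long pfn,
				   pgprot_t prot,
				   phys_addr_t (*pgtable_alloc)(void),
				   bool single_pages_only)

and I suppose the non-alias __create_pgd_mapping() call in __map_memblock()
could then simply pass debug_pagealloc_enabled() for both bool arguments.
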
>> {
>> + pgprot_t __prot = prot;
>> pte_t *pte;
>>
>> BUG_ON(pmd_sect(*pmd));
>> @@ -128,7 +130,19 @@ static void alloc_init_pte(pmd_t *pmd, unsigned long addr,
>> do {
>> pte_t old_pte = *pte;
>>
>> - set_pte(pte, pfn_pte(pfn, prot));
>> + /*
>> + * Set the contiguous bit for the subsequent group of PTEs if
>> + * its size and alignment are appropriate.
>> + */
>> + if (may_use_cont &&
>> + ((addr | PFN_PHYS(pfn)) & ~CONT_PTE_MASK) == 0) {
>> + if (end - addr >= CONT_PTE_SIZE)
>> + __prot = __pgprot(pgprot_val(prot) | PTE_CONT);
>> + else
>> + __prot = prot;
>> + }
>> +
>> + set_pte(pte, pfn_pte(pfn, __prot));
>> pfn++;
>>
>> /*
>
> While it would require more code, I think it would be better to add a
> function between alloc_init_pte() and alloc_init_pmd(), handling this in
> the usual fashion. e.g.
>
> ---->8----
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 0eef606..2c90925 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -74,6 +74,11 @@
> #define pte_user_exec(pte) (!(pte_val(pte) & PTE_UXN))
> #define pte_cont(pte) (!!(pte_val(pte) & PTE_CONT))
>
> +#define pte_cont_addr_end(addr, end) \
> +({ unsigned long __boundary = ((addr) + CONT_PTE_SIZE) & CONT_PTE_MASK; \
> + (__boundary - 1 < (end) - 1)? __boundary: (end); \
> +})
> +
> #ifdef CONFIG_ARM64_HW_AFDBM
> #define pte_hw_dirty(pte) (pte_write(pte) && !(pte_val(pte) & PTE_RDONLY))
> #else
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 2aec93ab..3cee826 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -142,6 +142,32 @@ static void alloc_init_pte(pmd_t *pmd, unsigned long addr,
> pte_clear_fixmap();
> }
>
> +static void alloc_init_cont_pte(pmd_t *pmd, unsigned long addr,
> + unsigned long end, phys_addr_t phys,
> + pgprot_t prot,
> + phys_addr_t (*pgtable_alloc)(void),
> + bool may_use_cont)
> +{
> + const pgprot_t cont_prot = __pgprot(pgprot_val(prot) | PTE_CONT);
> + unsigned long next;
> +
> + do {
> + next = pte_cont_addr_end(addr, end);
> +
> + /* try a contiguous mapping first */
> + if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
> + may_use_cont) {
> + alloc_init_pte(pmd, addr, next, phys, cont_prot,
> + pgtable_alloc);
> + } else {
> + alloc_init_pte(pmd, addr, next, phys, prot,
> + pgtable_alloc);
> + }
> +
> + phys += next - addr;
> + } while (addr = next, addr != end);
> +}
> +
> static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end,
> phys_addr_t phys, pgprot_t prot,
> phys_addr_t (*pgtable_alloc)(void),
> ---->8----
>
> It's more code, but it follows the existing pattern, and I personally
> find it easier to follow than changing prot within the loop.
>
> Note that I've cheated and made alloc_init_pte() take a phys_addr_t
> rather than a pfn, which I think we should do anyhow for consistency. I
> have a patch for that, if you agree.
>
Yes, absolutely. I did not spot this before you pointed it out, but it
looks a bit sloppy.
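
(I assume your patch does something along these lines, so the pfn
conversion happens in exactly one place:

	static void alloc_init_pte(pmd_t *pmd, unsigned long addr,
				   unsigned long end, phys_addr_t phys,
				   pgprot_t prot,
				   phys_addr_t (*pgtable_alloc)(void))
	{
		...
		do {
			pte_t old_pte = *pte;

			set_pte(pte, pfn_pte(__phys_to_pfn(phys), prot));
			phys += PAGE_SIZE;
			...
		} while (pte++, addr += PAGE_SIZE, addr != end);
		...
	}

with the callers passing the physical address straight through.)
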
> Another thing we *might* want to do here is pass a flags parameter rather
> than separate page_mappings_only and may_use_cont flags. That might also
> make things less painful in future if there are other things we need to
> pass down.
>
Yes, I already did that in a draft version, and then dropped it again.
I don't have a strong preference either way, so let me try that again for
the next version.
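
Roughly, something like the below is what I had in mind (the flag names
are placeholders for now):

	#define NO_BLOCK_MAPPINGS	BIT(0)
	#define NO_CONT_MAPPINGS	BIT(1)

	static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
					 unsigned long virt, phys_addr_t size,
					 pgprot_t prot,
					 phys_addr_t (*pgtable_alloc)(void),
					 int flags);

so a caller such as __map_memblock() can do

	int flags = debug_pagealloc_enabled() ?
		    NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS : 0;

instead of juggling two bools.
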
> That might also end up complicating matters, given the way we currently
> use debug_pagealloc_enabled() in __create_pgd_mapping() callers, so I'm
> also happy to leave that.
>
> [...]
>
>> @@ -173,7 +189,19 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end,
>> /* try section mapping first */
>> if (((addr | next | phys) & ~SECTION_MASK) == 0 &&
>> !page_mappings_only) {
>> - pmd_set_huge(pmd, phys, prot);
>> + /*
>> + * Set the contiguous bit for the subsequent group of
>> + * PMDs if its size and alignment are appropriate.
>> + */
>> + if (may_use_cont &&
>> + ((addr | phys) & ~CONT_PMD_MASK) == 0) {
>> + if (end - addr >= CONT_PMD_SIZE)
>> + __prot = __pgprot(pgprot_val(prot) |
>> + PTE_CONT);
>> + else
>> + __prot = prot;
>> + }
>> + pmd_set_huge(pmd, phys, __prot);
>
> As above, I think it would be better to have a alloc_init_cont_pmd()
> wrapper that iterated in CONT_PMD_SIZE chunks, so as to keep the usual
> pattern.
>
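
Something along these lines, I suppose, mirroring your alloc_init_cont_pte()
sketch (this assumes a pmd_cont_addr_end() helper analogous to
pte_cont_addr_end(), and that alloc_init_pmd() keeps a may_use_cont argument
to forward to the PTE level):

	static void alloc_init_cont_pmd(pud_t *pud, unsigned long addr,
					unsigned long end, phys_addr_t phys,
					pgprot_t prot,
					phys_addr_t (*pgtable_alloc)(void),
					bool page_mappings_only,
					bool may_use_cont)
	{
		const pgprot_t cont_prot = __pgprot(pgprot_val(prot) | PTE_CONT);
		unsigned long next;

		do {
			next = pmd_cont_addr_end(addr, end);

			/* try a contiguous range of section mappings first */
			if (((addr | next | phys) & ~CONT_PMD_MASK) == 0 &&
			    may_use_cont && !page_mappings_only)
				alloc_init_pmd(pud, addr, next, phys, cont_prot,
					       pgtable_alloc, page_mappings_only,
					       may_use_cont);
			else
				alloc_init_pmd(pud, addr, next, phys, prot,
					       pgtable_alloc, page_mappings_only,
					       may_use_cont);

			phys += next - addr;
		} while (addr = next, addr != end);
	}
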
>> @@ -381,7 +418,8 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end
>> */
>> __create_pgd_mapping(pgd, kernel_start, __phys_to_virt(kernel_start),
>> kernel_end - kernel_start, PAGE_KERNEL,
>> - early_pgtable_alloc, debug_pagealloc_enabled());
>> + early_pgtable_alloc, debug_pagealloc_enabled(),
>> + false);
>> }
>
>> @@ -437,7 +476,8 @@ static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end,
>> BUG_ON(!PAGE_ALIGNED(size));
>>
>> __create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot,
>> - early_pgtable_alloc, debug_pagealloc_enabled());
>> + early_pgtable_alloc, debug_pagealloc_enabled(),
>> + !debug_pagealloc_enabled() && may_use_cont);
>>
>> vma->addr = va_start;
>> vma->phys_addr = pa_start;
>> @@ -464,14 +504,14 @@ static void __init map_kernel(pgd_t *pgd)
>>
>> pgprot_t text_prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
>>
>> - map_kernel_segment(pgd, _text, _etext, text_prot, &vmlinux_text);
>> + map_kernel_segment(pgd, _text, _etext, text_prot, &vmlinux_text, true);
>> map_kernel_segment(pgd, __start_rodata, __inittext_begin, PAGE_KERNEL,
>> - &vmlinux_rodata);
>> + &vmlinux_rodata, false);
>> map_kernel_segment(pgd, __inittext_begin, __inittext_end, text_prot,
>> - &vmlinux_inittext);
>> + &vmlinux_inittext, true);
>> map_kernel_segment(pgd, __initdata_begin, __initdata_end, PAGE_KERNEL,
>> - &vmlinux_initdata);
>> - map_kernel_segment(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data);
>> + &vmlinux_initdata, true);
>> + map_kernel_segment(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data, true);
>
> It might be worth a comment somewhere as to why we have to special case
> text, rodata, and the linear alias, but otherwise this looks fine.
>
Ack
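
Perhaps something along these lines near the top of map_kernel() (wording
to be refined):

	/*
	 * .rodata and the linear alias of .text/.rodata are mapped
	 * writable at first, and are only remapped read-only at a
	 * later stage. Since changing the attributes of a live
	 * contiguous range may result in TLB conflicts, don't set
	 * the contiguous hint for those regions.
	 */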