[PATCH] arm64: mm: don't acquire mutex when rewriting swapper

Ard Biesheuvel ardb at kernel.org
Tue Sep 20 07:25:33 PDT 2022


On Tue, 20 Sept 2022 at 15:47, Mark Rutland <mark.rutland at arm.com> wrote:
>
> Since commit:
>
>   47546a1912fc4a03 ("arm64: mm: install KPTI nG mappings with MMU enabled")
>
> ... when building with CONFIG_DEBUG_ATOMIC_SLEEP=y and booting under
> QEMU TCG with '-cpu max', there's a boot-time splat:
>
> | BUG: sleeping function called from invalid context at kernel/locking/mutex.c:580
> | in_atomic(): 1, irqs_disabled(): 128, non_block: 0, pid: 15, name: migration/0
> | preempt_count: 1, expected: 0
> | RCU nest depth: 0, expected: 0
> | no locks held by migration/0/15.
> | irq event stamp: 28
> | hardirqs last  enabled at (27): [<ffff8000091ed180>] _raw_spin_unlock_irq+0x3c/0x7c
> | hardirqs last disabled at (28): [<ffff8000081b8d74>] multi_cpu_stop+0x150/0x18c
> | softirqs last  enabled at (0): [<ffff80000809a314>] copy_process+0x594/0x1964
> | softirqs last disabled at (0): [<0000000000000000>] 0x0
> | CPU: 0 PID: 15 Comm: migration/0 Not tainted 6.0.0-rc3-00002-g419b42ff7eef #3
> | Hardware name: linux,dummy-virt (DT)
> | Stopper: multi_cpu_stop+0x0/0x18c <- stop_cpus.constprop.0+0xa0/0xfc
> | Call trace:
> |  dump_backtrace.part.0+0xd0/0xe0
> |  show_stack+0x1c/0x5c
> |  dump_stack_lvl+0x88/0xb4
> |  dump_stack+0x1c/0x38
> |  __might_resched+0x180/0x230
> |  __might_sleep+0x4c/0xa0
> |  __mutex_lock+0x5c/0x450
> |  mutex_lock_nested+0x30/0x40
> |  create_kpti_ng_temp_pgd+0x4fc/0x6d0
> |  kpti_install_ng_mappings+0x2b8/0x3b0
> |  cpu_enable_non_boot_scope_capabilities+0x7c/0xd0
> |  multi_cpu_stop+0xa0/0x18c
> |  cpu_stopper_thread+0x88/0x11c
> |  smpboot_thread_fn+0x1ec/0x290
> |  kthread+0x118/0x120
> |  ret_from_fork+0x10/0x20
>
> Since commit:
>
>   ee017ee353506fce ("arm64/mm: avoid fixmap race condition when create pud mapping")
>
> ... once the kernel leaves the SYSTEM_BOOTING state, the fixmap pagetable
> entries are protected by the fixmap_lock mutex.
>
> The new KPTI rewrite code uses __create_pgd_mapping() to create a
> temporary pagetable. This happens in atomic context, after secondary
> CPUs are brought up and the kernel has left the SYSTEM_BOOTING state.
> Hence we try to acquire a mutex in atomic context, which is generally
> unsound (though benign in this case as the mutex should be free and all
> other CPUs are quiescent).
>
> This patch avoids the issue by pulling the mutex out of alloc_init_pud()
> and acquiring it at a higher level in the pagetable manipulation code.
> This allows it to be used without locking where one CPU is known to be
> in exclusive control of the machine, even after having left the
> SYSTEM_BOOTING state.
>
> Fixes: 47546a1912fc4a03 ("arm64: mm: install KPTI nG mappings with MMU enabled")
> Signed-off-by: Mark Rutland <mark.rutland at arm.com>
> Cc: Ard Biesheuvel <ardb at kernel.org>
> Cc: Catalin Marinas <catalin.marinas at arm.com>
> Cc: Will Deacon <will at kernel.org>

Reviewed-by: Ard Biesheuvel <ardb at kernel.org>

> ---
>  arch/arm64/mm/mmu.c | 32 ++++++++++++++++++--------------
>  1 file changed, 18 insertions(+), 14 deletions(-)
>
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index e7ad44585f40a..eb489302c28a4 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -331,12 +331,6 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
>         }
>         BUG_ON(p4d_bad(p4d));
>
> -       /*
> -        * No need for locking during early boot. And it doesn't work as
> -        * expected with KASLR enabled.
> -        */
> -       if (system_state != SYSTEM_BOOTING)
> -               mutex_lock(&fixmap_lock);
>         pudp = pud_set_fixmap_offset(p4dp, addr);
>         do {
>                 pud_t old_pud = READ_ONCE(*pudp);
> @@ -368,15 +362,13 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end,
>         } while (pudp++, addr = next, addr != end);
>
>         pud_clear_fixmap();
> -       if (system_state != SYSTEM_BOOTING)
> -               mutex_unlock(&fixmap_lock);
>  }
>
> -static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
> -                                unsigned long virt, phys_addr_t size,
> -                                pgprot_t prot,
> -                                phys_addr_t (*pgtable_alloc)(int),
> -                                int flags)
> +static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
> +                                       unsigned long virt, phys_addr_t size,
> +                                       pgprot_t prot,
> +                                       phys_addr_t (*pgtable_alloc)(int),
> +                                       int flags)
>  {
>         unsigned long addr, end, next;
>         pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
> @@ -400,8 +392,20 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
>         } while (pgdp++, addr = next, addr != end);
>  }
>
> +static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
> +                                unsigned long virt, phys_addr_t size,
> +                                pgprot_t prot,
> +                                phys_addr_t (*pgtable_alloc)(int),
> +                                int flags)
> +{
> +       mutex_lock(&fixmap_lock);
> +       __create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
> +                                   pgtable_alloc, flags);
> +       mutex_unlock(&fixmap_lock);
> +}
> +
>  #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
> -extern __alias(__create_pgd_mapping)
> +extern __alias(__create_pgd_mapping_locked)
>  void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
>                              phys_addr_t size, pgprot_t prot,
>                              phys_addr_t (*pgtable_alloc)(int), int flags);
> --
> 2.30.2
>



More information about the linux-arm-kernel mailing list