[PATCH v3 11/11] arm64/mm: Batch barriers when updating kernel mappings

Ryan Roberts ryan.roberts at arm.com
Mon Apr 14 11:28:46 PDT 2025


On 14/04/2025 18:38, Catalin Marinas wrote:
> On Tue, Mar 04, 2025 at 03:04:41PM +0000, Ryan Roberts wrote:
>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>> index 1898c3069c43..149df945c1ab 100644
>> --- a/arch/arm64/include/asm/pgtable.h
>> +++ b/arch/arm64/include/asm/pgtable.h
>> @@ -40,6 +40,55 @@
>>  #include <linux/sched.h>
>>  #include <linux/page_table_check.h>
>>  
>> +static inline void emit_pte_barriers(void)
>> +{
>> +	/*
>> +	 * These barriers are emitted under certain conditions after a pte entry
>> +	 * was modified (see e.g. __set_pte_complete()). The dsb makes the store
>> +	 * visible to the table walker. The isb ensures that any previous
>> +	 * speculative "invalid translation" marker that is in the CPU's
>> +	 * pipeline gets cleared, so that any access to that address after
>> +	 * setting the pte to valid won't cause a spurious fault. If the thread
>> +	 * gets preempted after storing to the pgtable but before emitting these
>> +	 * barriers, __switch_to() emits a dsb which ensures the walker gets to
>> +	 * see the store. There is no guarantee of an isb being issued though.
>> +	 * This is safe because it will still get issued (albeit on a
>> +	 * potentially different CPU) when the thread starts running again,
>> +	 * before any access to the address.
>> +	 */
>> +	dsb(ishst);
>> +	isb();
>> +}
>> +
>> +static inline void queue_pte_barriers(void)
>> +{
>> +	if (test_thread_flag(TIF_LAZY_MMU))
>> +		set_thread_flag(TIF_LAZY_MMU_PENDING);
> 
> As we can have lots of calls here, it might be slightly cheaper to test
> TIF_LAZY_MMU_PENDING and avoid setting it unnecessarily.

Yes, good point.

> 
> I haven't checked - does the compiler generate multiple mrs from sp_el0
> for subsequent test_thread_flag()?

It emits a single mrs, but in v2 it loads from the pointer twice (v1 and v3 load only once). I think v3 is the version we want?


/*
 * Baseline variant: when the current thread has TIF_LAZY_MMU set, record
 * that barriers are owed by setting TIF_LAZY_MMU_PENDING; otherwise emit
 * the barriers straight away.
 */
void TEST_queue_pte_barriers_v1(void)
{
	/* Not in lazy MMU mode: nothing to defer, issue the barriers now. */
	if (!test_thread_flag(TIF_LAZY_MMU)) {
		emit_pte_barriers();
		return;
	}

	/* Lazy MMU mode: mark the barriers as pending (set unconditionally). */
	set_thread_flag(TIF_LAZY_MMU_PENDING);
}

/*
 * Variant that avoids redundantly setting TIF_LAZY_MMU_PENDING by testing
 * it first with a second test_thread_flag() call.
 *
 * Barriers are emitted only when the thread is NOT in lazy MMU mode. While
 * in lazy MMU mode the barriers are always deferred: if the pending flag is
 * already set there is nothing more to do, so we must not fall through to
 * emit_pte_barriers() (doing so would issue barriers on every call after
 * the first and defeat the batching).
 */
void TEST_queue_pte_barriers_v2(void)
{
	if (test_thread_flag(TIF_LAZY_MMU)) {
		/* Skip the atomic set when the flag is already pending. */
		if (!test_thread_flag(TIF_LAZY_MMU_PENDING))
			set_thread_flag(TIF_LAZY_MMU_PENDING);
	} else {
		emit_pte_barriers();
	}
}

/*
 * Variant that reads the thread flags once via read_thread_flags() and
 * tests both bits from that single snapshot.
 *
 * Barriers are emitted only when the thread is NOT in lazy MMU mode. While
 * in lazy MMU mode the barriers are always deferred: TIF_LAZY_MMU_PENDING is
 * set if it is not set already, and otherwise nothing is done. Comparing the
 * masked flags against _TIF_LAZY_MMU alone would send the "lazy and already
 * pending" case down the barrier path, so the two bits are tested
 * separately.
 */
void TEST_queue_pte_barriers_v3(void)
{
	unsigned long flags = read_thread_flags();

	if (flags & _TIF_LAZY_MMU) {
		/* Skip the atomic set when the flag is already pending. */
		if (!(flags & _TIF_LAZY_MMU_PENDING))
			set_thread_flag(TIF_LAZY_MMU_PENDING);
	} else {
		emit_pte_barriers();
	}
}


000000000000101c <TEST_queue_pte_barriers_v1>:
    101c:	d5384100 	mrs	x0, sp_el0
    1020:	f9400001 	ldr	x1, [x0]
    1024:	37f80081 	tbnz	w1, #31, 1034 <TEST_queue_pte_barriers_v1+0x18>
    1028:	d5033a9f 	dsb	ishst
    102c:	d5033fdf 	isb
    1030:	d65f03c0 	ret
    1034:	14000004 	b	1044 <TEST_queue_pte_barriers_v1+0x28>
    1038:	d2c00021 	mov	x1, #0x100000000           	// #4294967296
    103c:	f821301f 	stset	x1, [x0]
    1040:	d65f03c0 	ret
    1044:	f9800011 	prfm	pstl1strm, [x0]
    1048:	c85f7c01 	ldxr	x1, [x0]
    104c:	b2600021 	orr	x1, x1, #0x100000000
    1050:	c8027c01 	stxr	w2, x1, [x0]
    1054:	35ffffa2 	cbnz	w2, 1048 <TEST_queue_pte_barriers_v1+0x2c>
    1058:	d65f03c0 	ret

000000000000105c <TEST_queue_pte_barriers_v2>:
    105c:	d5384100 	mrs	x0, sp_el0
    1060:	f9400001 	ldr	x1, [x0]
    1064:	37f80081 	tbnz	w1, #31, 1074 <TEST_queue_pte_barriers_v2+0x18>
    1068:	d5033a9f 	dsb	ishst
    106c:	d5033fdf 	isb
    1070:	d65f03c0 	ret
    1074:	f9400001 	ldr	x1, [x0]
    1078:	b707ff81 	tbnz	x1, #32, 1068 <TEST_queue_pte_barriers_v2+0xc>
    107c:	14000004 	b	108c <TEST_queue_pte_barriers_v2+0x30>
    1080:	d2c00021 	mov	x1, #0x100000000           	// #4294967296
    1084:	f821301f 	stset	x1, [x0]
    1088:	d65f03c0 	ret
    108c:	f9800011 	prfm	pstl1strm, [x0]
    1090:	c85f7c01 	ldxr	x1, [x0]
    1094:	b2600021 	orr	x1, x1, #0x100000000
    1098:	c8027c01 	stxr	w2, x1, [x0]
    109c:	35ffffa2 	cbnz	w2, 1090 <TEST_queue_pte_barriers_v2+0x34>
    10a0:	d65f03c0 	ret

00000000000010a4 <TEST_queue_pte_barriers_v3>:
    10a4:	d5384101 	mrs	x1, sp_el0
    10a8:	f9400020 	ldr	x0, [x1]
    10ac:	d2b00002 	mov	x2, #0x80000000            	// #2147483648
    10b0:	92610400 	and	x0, x0, #0x180000000
    10b4:	eb02001f 	cmp	x0, x2
    10b8:	54000080 	b.eq	10c8 <TEST_queue_pte_barriers_v3+0x24>  // b.none
    10bc:	d5033a9f 	dsb	ishst
    10c0:	d5033fdf 	isb
    10c4:	d65f03c0 	ret
    10c8:	14000004 	b	10d8 <TEST_queue_pte_barriers_v3+0x34>
    10cc:	d2c00020 	mov	x0, #0x100000000           	// #4294967296
    10d0:	f820303f 	stset	x0, [x1]
    10d4:	d65f03c0 	ret
    10d8:	f9800031 	prfm	pstl1strm, [x1]
    10dc:	c85f7c20 	ldxr	x0, [x1]
    10e0:	b2600000 	orr	x0, x0, #0x100000000
    10e4:	c8027c20 	stxr	w2, x0, [x1]
    10e8:	35ffffa2 	cbnz	w2, 10dc <TEST_queue_pte_barriers_v3+0x38>
    10ec:	d65f03c0 	ret



> 
>> +	else
>> +		emit_pte_barriers();
>> +}
>> +
>> +#define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
>> +static inline void arch_enter_lazy_mmu_mode(void)
>> +{
>> +	VM_WARN_ON(in_interrupt());
>> +	VM_WARN_ON(test_thread_flag(TIF_LAZY_MMU));
>> +
>> +	set_thread_flag(TIF_LAZY_MMU);
>> +}
>> +
>> +static inline void arch_flush_lazy_mmu_mode(void)
>> +{
>> +	if (test_and_clear_thread_flag(TIF_LAZY_MMU_PENDING))
>> +		emit_pte_barriers();
>> +}
>> +
>> +static inline void arch_leave_lazy_mmu_mode(void)
>> +{
>> +	arch_flush_lazy_mmu_mode();
>> +	clear_thread_flag(TIF_LAZY_MMU);
>> +}
>> +
>>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>  #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
>>  
>> @@ -323,10 +372,8 @@ static inline void __set_pte_complete(pte_t pte)
>>  	 * Only if the new pte is valid and kernel, otherwise TLB maintenance
>>  	 * has the necessary barriers.
>>  	 */
>> -	if (pte_valid_not_user(pte)) {
>> -		dsb(ishst);
>> -		isb();
>> -	}
>> +	if (pte_valid_not_user(pte))
>> +		queue_pte_barriers();
>>  }
> 
> I think this scheme works, I couldn't find a counter-example unless
> __set_pte() gets called in an interrupt context. You could add
> VM_WARN_ON(in_interrupt()) in queue_pte_barriers() as well.
> 
> With preemption, the newly mapped range shouldn't be used before
> arch_flush_lazy_mmu_mode() is called, so it looks safe as well. I think
> x86 uses a per-CPU variable to track this but per-thread is easier to
> reason about if there's no nesting.
> 
>>  static inline void __set_pte(pte_t *ptep, pte_t pte)
>> @@ -778,10 +825,8 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
>>  
>>  	WRITE_ONCE(*pmdp, pmd);
>>  
>> -	if (pmd_valid(pmd)) {
>> -		dsb(ishst);
>> -		isb();
>> -	}
>> +	if (pmd_valid(pmd))
>> +		queue_pte_barriers();
>>  }
> 
> We discussed on a previous series - for pmd/pud we end up with barriers
> even for user mappings but they are at a much coarser granularity (and I
> wasn't keen on 'user' attributes for the table entries).
> 
> Reviewed-by: Catalin Marinas <catalin.marinas at arm.com>

Thanks!

Ryan




More information about the linux-arm-kernel mailing list