[PATCH] arm64: optimize flush tlb kernel range

Kefeng Wang wangkefeng.wang at huawei.com
Tue Sep 17 17:57:16 PDT 2024



On 2024/9/16 23:08, Catalin Marinas wrote:
> On Sat, Sep 14, 2024 at 10:14:41PM +0800, Kefeng Wang wrote:
>> Currently the kernel TLB is flushed page by page if the target
>> VA range is less than MAX_DVM_OPS * PAGE_SIZE, otherwise we'll
>> brutally issue a TLBI ALL.
>>
>> But we could optimize it when the CPU supports TLB range operations:
>> convert to use __flush_tlb_range_nosync() like the other TLB range
>> flushes to improve performance.
>>
>> Signed-off-by: Yicong Yang <yangyicong at hisilicon.com>
>> Signed-off-by: Kefeng Wang <wangkefeng.wang at huawei.com>
> 
> Nit: you need a co-developed-by here for Yicong.

OK,
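will add it in the next version. If I read
Documentation/process/submitting-patches.rst correctly, the trailers
should then look something like:

  Co-developed-by: Yicong Yang <yangyicong at hisilicon.com>
  Signed-off-by: Yicong Yang <yangyicong at hisilicon.com>
  Signed-off-by: Kefeng Wang <wangkefeng.wang at huawei.com>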

> 
>> ---
>>   arch/arm64/include/asm/tlbflush.h | 43 +++++++++++++------------------
>>   arch/arm64/mm/contpte.c           |  3 ++-
>>   2 files changed, 20 insertions(+), 26 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
>> index 95fbc8c05607..8537fad83999 100644
>> --- a/arch/arm64/include/asm/tlbflush.h
>> +++ b/arch/arm64/include/asm/tlbflush.h
>> @@ -431,12 +431,12 @@ do {									\
>>   #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
>>   	__flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled());
>>   
>> -static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
>> -				     unsigned long start, unsigned long end,
>> -				     unsigned long stride, bool last_level,
>> -				     int tlb_level)
>> +static __always_inline void __flush_tlb_range_nosync(struct mm_struct *mm,
>> +		unsigned long asid, unsigned long start, unsigned long end,
>> +		unsigned long stride, bool last_level, int tlb_level)
>>   {
>> -	unsigned long asid, pages;
>> +	bool tlbi_user = !!asid;
>> +	unsigned long pages;
>>   
>>   	start = round_down(start, stride);
>>   	end = round_up(end, stride);
>> @@ -451,21 +451,24 @@ static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
>>   	if ((!system_supports_tlb_range() &&
>>   	     (end - start) >= (MAX_DVM_OPS * stride)) ||
>>   	    pages > MAX_TLBI_RANGE_PAGES) {
>> -		flush_tlb_mm(vma->vm_mm);
>> +		if (asid)
>> +			flush_tlb_mm(mm);
>> +		else
>> +			flush_tlb_all();
>>   		return;
>>   	}
>>   
>>   	dsb(ishst);
>> -	asid = ASID(vma->vm_mm);
>>   
>>   	if (last_level)
>>   		__flush_tlb_range_op(vale1is, start, pages, stride, asid,
>> -				     tlb_level, true, lpa2_is_enabled());
>> +				     tlb_level, tlbi_user, lpa2_is_enabled());
>>   	else
>> -		__flush_tlb_range_op(vae1is, start, pages, stride, asid,
>> -				     tlb_level, true, lpa2_is_enabled());
>> +		__flush_tlb_range_op(vae1is, start, pages, stride, asid,
>> +				     tlb_level, tlbi_user, lpa2_is_enabled());
>>   
>> -	mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
>> +	if (asid)
>> +		mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
>>   }
> 
> This is not correct. The flush_tlb_kernel_range() uses the TLBI VAALE1IS
> operation while the above would use VALE1IS for the kernel mapping, with
> ASID 0.

Right, I missed that when refactoring the code.
> 
> I also don't like overriding the meaning of asid here to guess whether
> it's user or kernel mapping, it just complicates this function
> unnecessarily.
> 
>>   static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
>>   {
>> -	unsigned long addr;
>>   
>> -	if ((end - start) > (MAX_DVM_OPS * PAGE_SIZE)) {
>> -		flush_tlb_all();
>> -		return;
>> -	}
>> -
>> -	start = __TLBI_VADDR(start, 0);
>> -	end = __TLBI_VADDR(end, 0);
>> -
>> -	dsb(ishst);
>> -	for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12))
>> -		__tlbi(vaale1is, addr);
>> +	__flush_tlb_range_nosync(&init_mm, 0, start, end, PAGE_SIZE, false,
>> +				 TLBI_TTL_UNKNOWN);
>>   	dsb(ish);
>>   	isb();
>>   }
> 
> Just call __flush_tlb_range_op(vaale1is, ...) directly here.

Our first internal version used __flush_tlb_range_op() directly, but I
wanted to avoid duplicating code between flush_tlb_kernel_range() and
__flush_tlb_range_nosync(). I will switch back to calling
__flush_tlb_range_op() directly, thanks.
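
Something roughly like the below (untested sketch, keeping the vaale1is
operation and reusing the existing MAX_DVM_OPS / MAX_TLBI_RANGE_PAGES
limits):

static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
	unsigned long pages;

	start = round_down(start, PAGE_SIZE);
	end = round_up(end, PAGE_SIZE);
	pages = (end - start) >> PAGE_SHIFT;

	/* Fall back to TLBI ALL if the range is too large. */
	if ((!system_supports_tlb_range() &&
	     (end - start) >= (MAX_DVM_OPS * PAGE_SIZE)) ||
	    pages > MAX_TLBI_RANGE_PAGES) {
		flush_tlb_all();
		return;
	}

	dsb(ishst);
	/* Kernel mappings: no ASID, use the match-any-ASID last-level op. */
	__flush_tlb_range_op(vaale1is, start, pages, PAGE_SIZE, 0,
			     TLBI_TTL_UNKNOWN, false, lpa2_is_enabled());
	dsb(ish);
	isb();
}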



