[RFC PATCH v2 2/2] riscv: mm: use svinval instructions instead of sfence.vma

Mayuresh Chitale mchitale at ventanamicro.com
Wed Jun 14 22:51:08 PDT 2023


On Tue, Jun 13, 2023 at 7:13 PM Alexandre Ghiti <alex at ghiti.fr> wrote:
>
>
> On 13/06/2023 06:02, Mayuresh Chitale wrote:
> > On Fri, Dec 9, 2022 at 12:33 AM Palmer Dabbelt <palmer at dabbelt.com> wrote:
> >> On Sun, 24 Apr 2022 02:02:16 PDT (-0700), mchitale at ventanamicro.com wrote:
> >>> When Svinval is supported, the local_flush_tlb_range*
> >>> functions use the following sequence instead of a plain
> >>> sfence.vma to optimize the TLB flushes:
> >>>
> >>> sfence.w.inval
> >>> sinval.vma
> >>>    .
> >>>    .
> >>> sinval.vma
> >>> sfence.inval.ir
> >>>
> >>> The maximum number of consecutive sinval.vma instructions
> >>> executed in the local_flush_tlb_range* functions is limited
> >>> to PTRS_PER_PTE. This is required to avoid soft lockups, and
> >>> the approach is similar to that used on arm64.
> >>>
> >>> Signed-off-by: Mayuresh Chitale <mchitale at ventanamicro.com>
> >>> ---
> >>>   arch/riscv/include/asm/tlbflush.h |  12 ++++
> >>>   arch/riscv/kernel/setup.c         |   1 +
> >>>   arch/riscv/mm/tlbflush.c          | 116 ++++++++++++++++++++++++++++--
> >>>   3 files changed, 123 insertions(+), 6 deletions(-)
> >>>
> >>> diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h
> >>> index 801019381dea..b535467c99f0 100644
> >>> --- a/arch/riscv/include/asm/tlbflush.h
> >>> +++ b/arch/riscv/include/asm/tlbflush.h
> >>> @@ -22,6 +22,18 @@ static inline void local_flush_tlb_page(unsigned long addr)
> >>>   {
> >>>        ALT_FLUSH_TLB_PAGE(__asm__ __volatile__ ("sfence.vma %0" : : "r" (addr) : "memory"));
> >>>   }
> >>> +
> >>> +void riscv_tlbflush_init(void);
> >>> +void __riscv_sfence_w_inval(void);
> >>> +void __riscv_sfence_inval_ir(void);
> >>> +void __riscv_sinval_vma(unsigned long addr);
> >>> +void __riscv_sinval_vma_asid(unsigned long addr, unsigned long asid);
> >>> +
> >>> +/* Check if we can use sinval for tlb flush */
> >>> +DECLARE_STATIC_KEY_FALSE(riscv_flush_tlb_svinval);
> >>> +#define riscv_use_flush_tlb_svinval() \
> >>> +     static_branch_unlikely(&riscv_flush_tlb_svinval)
> >>> +
> >>>   #else /* CONFIG_MMU */
> >>>   #define local_flush_tlb_all()                        do { } while (0)
> >>>   #define local_flush_tlb_page(addr)           do { } while (0)
> >>> diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
> >>> index 834eb652a7b9..13de04259de9 100644
> >>> --- a/arch/riscv/kernel/setup.c
> >>> +++ b/arch/riscv/kernel/setup.c
> >>> @@ -295,6 +295,7 @@ void __init setup_arch(char **cmdline_p)
> >>>   #endif
> >>>
> >>>        riscv_fill_hwcap();
> >>> +     riscv_tlbflush_init();
> >>>   }
> >>>
> >>>   static int __init topology_init(void)
> >>> diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c
> >>> index 27a7db8eb2c4..800953f9121e 100644
> >>> --- a/arch/riscv/mm/tlbflush.c
> >>> +++ b/arch/riscv/mm/tlbflush.c
> >>> @@ -1,11 +1,14 @@
> >>>   // SPDX-License-Identifier: GPL-2.0
> >>>
> >>> +#define pr_fmt(fmt) "riscv: " fmt
> >>>   #include <linux/mm.h>
> >>>   #include <linux/smp.h>
> >>>   #include <linux/sched.h>
> >>>   #include <asm/sbi.h>
> >>>   #include <asm/mmu_context.h>
> >>>
> >>> +static unsigned long tlb_flush_all_threshold __read_mostly = PTRS_PER_PTE;
> >>> +
> >>>   static inline void local_flush_tlb_all_asid(unsigned long asid)
> >>>   {
> >>>        __asm__ __volatile__ ("sfence.vma x0, %0"
> >>> @@ -23,22 +26,110 @@ static inline void local_flush_tlb_page_asid(unsigned long addr,
> >>>                        : "memory");
> >>>   }
> >>>
> >>> +static inline void riscv_sfence_inval_ir(void)
> >>> +{
> >>> +     /*
> >>> +      * SFENCE.INVAL.IR
> >>> +      * 0001100 00001 00000 000 00000 1110011
> >>> +      */
> >>> +     asm volatile (".word 0x18100073" ::: "memory");
> >>> +}
> >>> +
> >>> +static inline void riscv_sfence_w_inval(void)
> >>> +{
> >>> +     /*
> >>> +      * SFENCE.W.INVAL
> >>> +      * 0001100 00000 00000 000 00000 1110011
> >>> +      */
> >>> +     asm volatile (".word 0x18000073" ::: "memory");
> >>> +}
> >>> +
> >>> +static inline void riscv_sinval_vma_asid(unsigned long vma, unsigned long asid)
> >>> +{
> >>> +     /*
> >>> +      * rs1 = a0 (virtual address)
> >>> +      * rs2 = a1 (asid)
> >>> +      * SINVAL.VMA a0, a1
> >>> +      * 0001011 01011 01010 000 00000 1110011
> >>> +      */
> >>> +     asm volatile ("mv a0, %0\n"
> >>> +                     "mv a1, %1\n"
> >>> +                     ".word 0x16B50073\n"
> >>> +                     :: "r" (vma), "r" (asid)
> >>> +                     : "a0", "a1", "memory");
> >>> +}
> >>> +
> >>> +static inline void riscv_sinval_vma(unsigned long vma)
> >>> +{
> >>> +     /*
> >>> +      * rs1 = a0 (virtual address)
> >>> +      * rs2 = x0
> >>> +      * SINVAL.VMA a0
> >>> +      * 0001011 00000 01010 000 00000 1110011
> >>> +      */
> >>> +     asm volatile ("mv a0, %0\n"
> >>> +                     ".word 0x16050073\n"
> >>> +                     :: "r" (vma) : "a0", "memory");
> >>> +}
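
(Aside for reviewers: the .word values above follow directly from the
R-type SYSTEM layout, opcode 0x73. A sketch of the derivation, using a
hypothetical packing macro:)

    /* Sketch only: funct7/rs2/rs1 packed into the SYSTEM opcode layout. */
    #define RISCV_SYS_INSN(funct7, rs2, rs1) \
            (((funct7) << 25) | ((rs2) << 20) | ((rs1) << 15) | 0x73)

    /* SFENCE.W.INVAL:    funct7=0b0001100, rs2=x0 (0),  rs1=x0 (0)  -> 0x18000073 */
    /* SFENCE.INVAL.IR:   funct7=0b0001100, rs2=x1 (1),  rs1=x0 (0)  -> 0x18100073 */
    /* SINVAL.VMA a0:     funct7=0b0001011, rs2=x0 (0),  rs1=a0 (10) -> 0x16050073 */
    /* SINVAL.VMA a0, a1: funct7=0b0001011, rs2=a1 (11), rs1=a0 (10) -> 0x16B50073 */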
> >>> +
> >>>   static inline void local_flush_tlb_range(unsigned long start,
> >>>                unsigned long size, unsigned long stride)
> >>>   {
> >>> -     if (size <= stride)
> >>> -             local_flush_tlb_page(start);
> >>> -     else
> >>> +     if ((size / stride) <= tlb_flush_all_threshold) {
> >>> +             if (riscv_use_flush_tlb_svinval()) {
> >>> +                     riscv_sfence_w_inval();
> >>> +                     while (size) {
> >>> +                             riscv_sinval_vma(start);
> >>> +                             start += stride;
> >>> +                             if (size > stride)
> >>> +                                     size -= stride;
> >>> +                             else
> >>> +                                     size = 0;
> >>> +                     }
> >>> +                     riscv_sfence_inval_ir();
> >>> +             } else {
> >>> +                     while (size) {
> >>> +                             local_flush_tlb_page(start);
> >>> +                             start += stride;
> >>> +                             if (size > stride)
> >>> +                                     size -= stride;
> >>> +                             else
> >>> +                                     size = 0;
> >>> +                     }
> >>> +             }
> >>> +     } else {
> >>>                local_flush_tlb_all();
> >>> +     }
> >>>   }
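
One nit against my own patch: the size bookkeeping in the loops above
could collapse into a for loop. An untested sketch of the same logic:

    static inline void local_flush_tlb_range(unsigned long start,
                    unsigned long size, unsigned long stride)
    {
            unsigned long addr, end = start + size;

            if ((size / stride) > tlb_flush_all_threshold) {
                    local_flush_tlb_all();
                    return;
            }

            if (riscv_use_flush_tlb_svinval()) {
                    riscv_sfence_w_inval();
                    for (addr = start; addr < end; addr += stride)
                            riscv_sinval_vma(addr);
                    riscv_sfence_inval_ir();
            } else {
                    for (addr = start; addr < end; addr += stride)
                            local_flush_tlb_page(addr);
            }
    }

The same shape would apply to the _asid variant below.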
> >>>
> >>>   static inline void local_flush_tlb_range_asid(unsigned long start,
> >>>                unsigned long size, unsigned long stride, unsigned long asid)
> >>>   {
> >>> -     if (size <= stride)
> >>> -             local_flush_tlb_page_asid(start, asid);
> >>> -     else
> >>> +     if ((size / stride) <= tlb_flush_all_threshold) {
> >>> +             if (riscv_use_flush_tlb_svinval()) {
> >>> +                     riscv_sfence_w_inval();
> >>> +                     while (size) {
> >>> +                             riscv_sinval_vma_asid(start, asid);
> >>> +                             start += stride;
> >>> +                             if (size > stride)
> >>> +                                     size -= stride;
> >>> +                             else
> >>> +                                     size = 0;
> >>> +                     }
> >>> +                     riscv_sfence_inval_ir();
> >>> +             } else {
> >>> +                     while (size) {
> >>> +                             local_flush_tlb_page_asid(start, asid);
> >>> +                             start += stride;
> >>> +                             if (size > stride)
> >>> +                                     size -= stride;
> >>> +                             else
> >>> +                                     size = 0;
> >>> +                     }
> >>> +             }
> >>> +     } else {
> >>>                local_flush_tlb_all_asid(asid);
> >>> +     }
> >> Sorry to dig up an old post, but svinval came up and I started looking
> >> at this again.
> >>
> >> There are some other issues with the code, but at a higher level it's
> >> not really clear this is a useful thing to do: splitting up sfence.vma
> >> makes sense, but if there are no operations between the sfence.w.inval
> >> and sinval.vma (or sinval.vma and sfence.inval.ir) then we're not
> >> giving the HW anything else to do concurrently with the flushes, and
> >> thus I don't see how this would improve performance -- if anything
> >> we're just emitting more instructions to do the same thing, which
> >> would be bad for performance.
> > Actually this approach is similar to arm64's, with the addition of only
> > two instructions (sfence.w.inval and sfence.inval.ir), and it is also
> > described in the non-normative text in ch. 7 of the priv spec.
> >
> >> So I think if we're going to use these we really need to also split up
> >> the TLB flush operations, so we can give the HW something to do
> >> concurrently with the flushing.  That'd be a pretty big change to our
> >> TLB model, though: essentially we're going from two phases (write the
> >> page tables, then flush) to at least three phases (write the page
> >> tables, start the flush, finish the flush) or even four (if we want to
> >> insert work between both sfence.w.inval->sinval.vma and
> >> sinval.vma->sfence.inval.ir).
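
For concreteness, the multi-phase model described above might look
something like this (purely illustrative; do_other_useful_work() is a
hypothetical stand-in for whatever could overlap the invalidates):

    static void example_remap(unsigned long start, unsigned long end)
    {
            unsigned long addr;

            /* phase 1: page table writes happen before this point */
            riscv_sfence_w_inval();   /* order PTE writes vs. invalidates */

            /* phase 2: queue the invalidates */
            for (addr = start; addr < end; addr += PAGE_SIZE)
                    riscv_sinval_vma(addr);

            do_other_useful_work();   /* overlaps with the invalidates */

            /* phase 3: wait for the invalidates to complete */
            riscv_sfence_inval_ir();
    }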
> > I think the extension just allows for a more efficient TLB flushing
> > operation in the hardware; no change in the software TLB model is
> > expected.
> > Besides, the extension is disabled by default and only gets enabled
> > on platforms that advertise it in the DT.
> >
> >> Not sure if I'm just misunderstanding the performance characteristics,
> >> though.  Maybe I'm over-thinking things and sfence.vma is heavy-weight
> >> enough that implementations are flushing the pipeline even for the
> >> page-based flavors, and thus sfence.vma-based loops are very slow?
> >
> >> Do you have any benchmarks for the Svinval bits here (which are mixed
> >> in with some non-global flushing; that's a different optimization that
> >> needs to be split out)?
>
>
> Thanks for digging that up!
>
> I have a patchset to optimize the TLB flushes (which also uses this
> concept of a threshold, also present on x86), but I don't want to
> propose it upstream since I can't measure any performance benefit
> (which is actually very weird). Did you actually benchmark this? If
> so, I'd like to know how you did it :)

I couldn't run any benchmark as such. I just used a few tests from
stress-ng and the kernel selftests.
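
If it helps, this is roughly the kind of userspace loop I would use to
isolate the flush cost next time (a sketch only, not what was actually
run): toggling protections on a small mapping forces a TLB range flush
on every mprotect() call.

    #include <stdio.h>
    #include <sys/mman.h>
    #include <time.h>

    int main(void)
    {
            const size_t len = 64 * 4096;   /* 64 pages */
            long i, iters = 100000;
            struct timespec t0, t1;
            char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED)
                    return 1;
            /* touch the pages so there is something to invalidate */
            for (i = 0; i < (long)len; i += 4096)
                    p[i] = 1;

            clock_gettime(CLOCK_MONOTONIC, &t0);
            for (i = 0; i < iters; i++) {
                    mprotect(p, len, PROT_READ);
                    mprotect(p, len, PROT_READ | PROT_WRITE);
            }
            clock_gettime(CLOCK_MONOTONIC, &t1);

            printf("%.1f ns per protection toggle\n",
                   ((t1.tv_sec - t0.tv_sec) * 1e9 +
                    (t1.tv_nsec - t0.tv_nsec)) / iters);
            return 0;
    }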
>
> Thanks,
>
> Alex
>
>
> >>
> >>>   }
> >>>
> >>>   static void __ipi_flush_tlb_all(void *info)
> >>> @@ -149,3 +240,16 @@ void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start,
> >>>        __flush_tlb_range(vma->vm_mm, start, end - start, PMD_SIZE);
> >>>   }
> >>>   #endif
> >>> +
> >>> +DEFINE_STATIC_KEY_FALSE(riscv_flush_tlb_svinval);
> >>> +EXPORT_SYMBOL_GPL(riscv_flush_tlb_svinval);
> >>> +
> >>> +void riscv_tlbflush_init(void)
> >>> +{
> >>> +     if (riscv_isa_extension_available(NULL, SVINVAL)) {
> >>> +             pr_info("Svinval extension supported\n");
> >>> +             static_branch_enable(&riscv_flush_tlb_svinval);
> >>> +     } else {
> >>> +             static_branch_disable(&riscv_flush_tlb_svinval);
> >>> +     }
> >>> +}
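
One more note on the probing: riscv_tlbflush_init() keys off
riscv_isa_extension_available(), so the static branch only flips on
platforms whose DT advertises the extension in the riscv,isa string
(presumably something like "rv64imafdc_svinval"; the exact string is
my assumption). That matches the "disabled by default" behaviour
mentioned above.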


