[PATCH v3 7/8] RISC-V: Report vector unaligned access speed hwprobe
Jesse Taube
jesse at rivosinc.com
Thu Jul 11 13:35:48 PDT 2024
On 7/1/24 18:51, Evan Green wrote:
> On Mon, Jun 24, 2024 at 5:52 PM Jesse Taube <jesse at rivosinc.com> wrote:
>>
>> Detect if vector misaligned accesses are faster or slower than
>> equivalent vector byte accesses. This is useful for usermode to know
>> whether vector byte accesses or vector misaligned accesses have a better
>> bandwidth for operations like memcpy.
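For context: userspace reads this through the hwprobe syscall. A minimal
sketch, assuming the RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF key name from
earlier in this series (adjust if the final name differs); the value names
are the ones this patch reports:

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <asm/hwprobe.h>

	int main(void)
	{
		struct riscv_hwprobe pair = {
			.key = RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF,
		};

		/* cpusetsize == 0 and cpus == NULL query all online CPUs. */
		if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0))
			return 1;

		if (pair.value == RISCV_HWPROBE_VEC_MISALIGNED_FAST)
			printf("misaligned vector accesses are faster\n");
		else
			printf("prefer vector byte accesses for memcpy\n");
		return 0;
	}
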
>>
>> Signed-off-by: Jesse Taube <jesse at rivosinc.com>
>> ---
>> V1 -> V2:
>> - Add Kconfig options
>> - Add WORD_EEW to vec-copy-unaligned.S
>> V2 -> V3:
>> - Remove unnecessary comment
>> - Remove local_irq_enable
>> ---
>> arch/riscv/Kconfig | 18 +++
>> arch/riscv/kernel/Makefile | 3 +-
>> arch/riscv/kernel/copy-unaligned.h | 5 +
>> arch/riscv/kernel/sys_hwprobe.c | 6 +
>> arch/riscv/kernel/unaligned_access_speed.c | 132 ++++++++++++++++++++-
>> arch/riscv/kernel/vec-copy-unaligned.S | 58 +++++++++
>> 6 files changed, 219 insertions(+), 3 deletions(-)
>> create mode 100644 arch/riscv/kernel/vec-copy-unaligned.S
>>
>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>> index ffbe0fdd7fb3..6f9fd3748916 100644
>> --- a/arch/riscv/Kconfig
>> +++ b/arch/riscv/Kconfig
>> @@ -807,6 +807,24 @@ config RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
>> will dynamically determine the speed of vector unaligned accesses on
>> the underlying system if they are supported.
>>
>> +config RISCV_SLOW_VEC_UNALIGNED_ACCESS
>> + bool "Assume the system supports slow vector unaligned memory accesses"
>> + depends on NONPORTABLE
>> + help
>> + Assume that the system supports slow vector unaligned memory accesses. The
>> + kernel and userspace programs may not be able to run at all on systems
>> + that do not support unaligned memory accesses.
>> +
>> +config RISCV_EFFICIENT_VEC_UNALIGNED_ACCESS
>> + bool "Assume the system supports fast vector unaligned memory accesses"
>> + depends on NONPORTABLE
>> + help
>> + Assume that the system supports fast vector unaligned memory accesses. When
>> + enabled, this option improves the performance of the kernel on such
>> + systems. However, the kernel and userspace programs will run much more
>> + slowly, or will not be able to run at all, on systems that do not
>> + support efficient unaligned memory accesses.
>> +
>> endchoice
>>
>> endmenu # "Platform type"
>> diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
>> index 5b243d46f4b1..291935a084d5 100644
>> --- a/arch/riscv/kernel/Makefile
>> +++ b/arch/riscv/kernel/Makefile
>> @@ -64,7 +64,8 @@ obj-$(CONFIG_MMU) += vdso.o vdso/
>>
>> obj-$(CONFIG_RISCV_MISALIGNED) += traps_misaligned.o
>> obj-$(CONFIG_RISCV_MISALIGNED) += unaligned_access_speed.o
>> -obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS) += copy-unaligned.o
>> +obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS) += copy-unaligned.o
>> +obj-$(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS) += vec-copy-unaligned.o
>>
>> obj-$(CONFIG_FPU) += fpu.o
>> obj-$(CONFIG_FPU) += kernel_mode_fpu.o
>> diff --git a/arch/riscv/kernel/copy-unaligned.h b/arch/riscv/kernel/copy-unaligned.h
>> index e3d70d35b708..85d4d11450cb 100644
>> --- a/arch/riscv/kernel/copy-unaligned.h
>> +++ b/arch/riscv/kernel/copy-unaligned.h
>> @@ -10,4 +10,9 @@
>> void __riscv_copy_words_unaligned(void *dst, const void *src, size_t size);
>> void __riscv_copy_bytes_unaligned(void *dst, const void *src, size_t size);
>>
>> +#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
>> +void __riscv_copy_vec_words_unaligned(void *dst, const void *src, size_t size);
>> +void __riscv_copy_vec_bytes_unaligned(void *dst, const void *src, size_t size);
>> +#endif
>> +
>> #endif /* __RISCV_KERNEL_COPY_UNALIGNED_H */
>> diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
>> index 5b78ea5a84d1..46de3f145154 100644
>> --- a/arch/riscv/kernel/sys_hwprobe.c
>> +++ b/arch/riscv/kernel/sys_hwprobe.c
>> @@ -221,6 +221,12 @@ static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
>> #else
>> static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
>> {
>> + if (IS_ENABLED(CONFIG_RISCV_EFFICIENT_VEC_UNALIGNED_ACCESS))
>> + return RISCV_HWPROBE_VEC_MISALIGNED_FAST;
>> +
>> + if (IS_ENABLED(CONFIG_RISCV_SLOW_VEC_UNALIGNED_ACCESS))
>> + return RISCV_HWPROBE_VEC_MISALIGNED_SLOW;
>> +
>> return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>> }
>> #endif
>> diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
>> index 8489e012cf23..a2a8f948500b 100644
>> --- a/arch/riscv/kernel/unaligned_access_speed.c
>> +++ b/arch/riscv/kernel/unaligned_access_speed.c
>> @@ -8,9 +8,11 @@
>> #include <linux/jump_label.h>
>> #include <linux/mm.h>
>> #include <linux/smp.h>
>> +#include <linux/kthread.h>
>> #include <linux/types.h>
>> #include <asm/cpufeature.h>
>> #include <asm/hwprobe.h>
>> +#include <asm/vector.h>
>>
>> #include "copy-unaligned.h"
>>
>> @@ -267,9 +269,129 @@ static int check_unaligned_access_speed_all_cpus(void)
>> }
>> #endif
>>
>> +#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
>> +static void check_vector_unaligned_access(struct work_struct *unused)
>> +{
>> + int cpu = smp_processor_id();
>> + u64 start_cycles, end_cycles;
>> + u64 word_cycles;
>> + u64 byte_cycles;
>> + int ratio;
>> + unsigned long start_jiffies, now;
>> + struct page *page;
>> + void *dst;
>> + void *src;
>> + long speed = RISCV_HWPROBE_VEC_MISALIGNED_SLOW;
>> +
>> + if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_VEC_MISALIGNED_SLOW)
>> + return;
>> +
>> + page = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
>> + if (!page) {
>> + pr_warn("Allocation failure, not measuring vector misaligned performance\n");
>> + return;
>> + }
>> +
>> + /* Make an unaligned destination buffer. */
>> + dst = (void *)((unsigned long)page_address(page) | 0x1);
>> + /* Unalign src as well, but differently (off by 1 + 2 = 3). */
>> + src = dst + (MISALIGNED_BUFFER_SIZE / 2);
>> + src += 2;
>> + word_cycles = -1ULL;
>> +
>> + /* Do a warmup. */
>> + kernel_vector_begin();
>
> Should there be a preempt_disable() in here too, or is that implied
> already by schedule_on_each_cpu? Mainly we want to minimize the amount
> of things that might soak up our jiffy and steal iterations from the
> loop.
To test, I put a schedule() here and got a "scheduling while atomic" BUG,
so this context is already atomic. I added preempt_disable() and
preempt_enable() anyway, to make sure preemption stays disabled.
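With that, the measured section looks roughly like this (a sketch, not the
exact diff):

	preempt_disable();
	kernel_vector_begin();

	/* ... warmup and the two timed measurement loops ... */

	kernel_vector_end();
	preempt_enable();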
>
>> + __riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
>> +
>> + start_jiffies = jiffies;
>> + while ((now = jiffies) == start_jiffies)
>> + cpu_relax();
>> +
>> + /*
>> + * For a fixed amount of time, repeatedly try the function, and take
>> + * the best time in cycles as the measurement.
>> + */
>> + while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
>> + start_cycles = get_cycles64();
>> + /* Ensure the CSR read can't reorder WRT to the copy. */
>> + mb();
>> + __riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
>> + /* Ensure the copy ends before the end time is snapped. */
>> + mb();
>> + end_cycles = get_cycles64();
>> + if ((end_cycles - start_cycles) < word_cycles)
>> + word_cycles = end_cycles - start_cycles;
>> + }
>> +
>> + byte_cycles = -1ULL;
>> + __riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
>> + start_jiffies = jiffies;
>> + while ((now = jiffies) == start_jiffies)
>> + cpu_relax();
>> +
>> + while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
>> + start_cycles = get_cycles64();
>> + mb();
>> + __riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
>> + mb();
>> + end_cycles = get_cycles64();
>> + if ((end_cycles - start_cycles) < byte_cycles)
>> + byte_cycles = end_cycles - start_cycles;
>> + }
>> +
>> + kernel_vector_end();
>> +
>> + /* Don't divide by zero. */
>> + if (!word_cycles || !byte_cycles) {
>> + pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned vector access speed\n",
>> + cpu);
>> +
>> + return;
>> + }
>> +
>> + if (word_cycles < byte_cycles)
>> + speed = RISCV_HWPROBE_VEC_MISALIGNED_FAST;
>> +
>> + ratio = div_u64((byte_cycles * 100), word_cycles);
>> + pr_info("cpu%d: Ratio of vector byte access time to vector unaligned word access is %d.%02d, unaligned accesses are %s\n",
>> + cpu,
>> + ratio / 100,
>> + ratio % 100,
>> + (speed == RISCV_HWPROBE_VEC_MISALIGNED_FAST) ? "fast" : "slow");
>> +
>> + per_cpu(vector_misaligned_access, cpu) = speed;
>> +}
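(Worked example of the reporting math, with made-up numbers: byte_cycles =
45000 and word_cycles = 30000 gives ratio = 4500000 / 30000 = 150, which the
%d.%02d format prints as "1.50"; since word_cycles < byte_cycles, the CPU is
reported as fast.)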
>> +
>> +static int riscv_online_cpu_vec(unsigned int cpu)
>> +{
>> + check_vector_unaligned_access(NULL);
>> + return 0;
>> +}
>> +
>> +/* Measure unaligned access speed on all CPUs present at boot in parallel. */
>> +static int vec_check_unaligned_access_speed_all_cpus(void *unused)
>> +{
>> + schedule_on_each_cpu(check_vector_unaligned_access);
>> +
>> + /*
>> + * Setup hotplug callbacks for any new CPUs that come online or go
>> + * offline.
>> + */
>> + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
>> + riscv_online_cpu_vec, NULL);
>> +
>> + return 0;
>> +}
>> +#else /* CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS */
>> +static int vec_check_unaligned_access_speed_all_cpus(void *unused)
>> +{
>> + return 0;
>> +}
>> +#endif
>> +
>> static int check_unaligned_access_all_cpus(void)
>> {
>> - bool all_cpus_emulated;
>> + bool all_cpus_emulated, all_cpus_vec_supported;
>> int cpu;
>>
>> if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICCLSM)) {
>> @@ -285,7 +407,13 @@ static int check_unaligned_access_all_cpus(void)
>> }
>>
>> all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
>> - check_vector_unaligned_access_emulated_all_cpus();
>> + all_cpus_vec_supported = check_vector_unaligned_access_emulated_all_cpus();
>> +
>> + if (all_cpus_vec_supported &&
>
> Note that with this here, if a heterogeneous system ever shows up with
> a mix of supported/unsupported, the supported cpus will end up with a
> misaligned vector speed of UNKNOWN. It might be nicer to change the
> semantics of that return value from all_cpus_vec_supported (aka
> entirely_supported) to entirely_unsupported. Then you'd only fire up
> the thread if (!entirely_unsupported). If you did that, you'd also
> need to bail early in the check on any CPUs that already had their
> value set to UNSUPPORTED.
Fortunately, check_vector_unaligned_access() already checks for this: it
bails out early unless the CPU's value is still SLOW.
> That way we can still avoid firing up this
> thread for machines that don't have misaligned V (or V at all), but
> not leave the question UNKNOWN in some cases.
That makes a lot of sense; I'm not sure why I didn't think of that.
I changed it to `all_cpus_vec_unsupported`, so the speed check will run if
at least one CPU has unaligned vector support.
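That is, the gating becomes roughly:

	all_cpus_vec_unsupported = check_vector_unaligned_access_emulated_all_cpus();

	if (!all_cpus_vec_unsupported &&
	    IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
		kthread_run(vec_check_unaligned_access_speed_all_cpus,
			    NULL, "vec_check_unaligned_access_speed_all_cpus");
	}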
Thanks,
Jesse Taube
>
>> + IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
>> + kthread_run(vec_check_unaligned_access_speed_all_cpus,
>> + NULL, "vec_check_unaligned_access_speed_all_cpus");
>> + }
>>
>> if (!all_cpus_emulated)
>> return check_unaligned_access_speed_all_cpus();
>> diff --git a/arch/riscv/kernel/vec-copy-unaligned.S b/arch/riscv/kernel/vec-copy-unaligned.S
>> new file mode 100644
>> index 000000000000..e5bc94917e60
>> --- /dev/null
>> +++ b/arch/riscv/kernel/vec-copy-unaligned.S
>> @@ -0,0 +1,58 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +/* Copyright (C) 2024 Rivos Inc. */
>> +
>> +#include <linux/linkage.h>
>> +#include <asm/asm.h>
>> +#include <linux/args.h>
>> +
>> + .text
>> +
>> +#define WORD_EEW 32
>> +
>> +#define WORD_SEW CONCATENATE(e, WORD_EEW)
>> +#define VEC_L CONCATENATE(vle, WORD_EEW).v
>> +#define VEC_S CONCATENATE(vse, WORD_EEW).v
>> +
>> +/* void __riscv_copy_vec_words_unaligned(void *, const void *, size_t) */
>> +/* Performs a memcpy without aligning buffers, using word loads and stores. */
>> +/* Note: The size is truncated to a multiple of WORD_EEW */
>> +SYM_FUNC_START(__riscv_copy_vec_words_unaligned)
>> + andi a4, a2, ~(WORD_EEW-1)
>> + beqz a4, 2f
>> + add a3, a1, a4
>> + .option push
>> + .option arch, +zve32x
>> +1:
>> + vsetivli t0, 8, WORD_SEW, m8, ta, ma
>> + VEC_L v0, (a1)
>> + VEC_S v0, (a0)
>> + addi a0, a0, WORD_EEW
>> + addi a1, a1, WORD_EEW
>> + bltu a1, a3, 1b
>> +
>> +2:
>> + .option pop
>> + ret
>> +SYM_FUNC_END(__riscv_copy_vec_words_unaligned)
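(For reference: with WORD_EEW = 32, WORD_SEW, VEC_L, and VEC_S above expand
to e32, vle32.v, and vse32.v, so each loop iteration moves 8 32-bit
elements, i.e. 32 bytes, matching the addi by WORD_EEW.)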
>> +
>> +/* void __riscv_copy_vec_bytes_unaligned(void *, const void *, size_t) */
>> +/* Performs a memcpy without aligning buffers, using only byte accesses. */
>> +/* Note: The size is truncated to a multiple of 8 */
>> +SYM_FUNC_START(__riscv_copy_vec_bytes_unaligned)
>> + andi a4, a2, ~(8-1)
>> + beqz a4, 2f
>> + add a3, a1, a4
>> + .option push
>> + .option arch, +zve32x
>> +1:
>> + vsetivli t0, 8, e8, m8, ta, ma
>> + vle8.v v0, (a1)
>> + vse8.v v0, (a0)
>> + addi a0, a0, 8
>> + addi a1, a1, 8
>> + bltu a1, a3, 1b
>> +
>> +2:
>> + .option pop
>> + ret
>> +SYM_FUNC_END(__riscv_copy_vec_bytes_unaligned)
>> --
>> 2.45.2
>>