[PATCH v3 7/8] RISC-V: Report vector unaligned access speed hwprobe
Jesse Taube
jesse at rivosinc.com
Thu Jul 11 13:35:48 PDT 2024
On 7/1/24 18:51, Evan Green wrote:
> On Mon, Jun 24, 2024 at 5:52 PM Jesse Taube <jesse at rivosinc.com> wrote:
>>
>> Detect if vector misaligned accesses are faster or slower than
>> equivalent vector byte accesses. This is useful for usermode to know
>> whether vector byte accesses or vector misaligned accesses have a better
>> bandwidth for operations like memcpy.
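For context: userspace reads this through the hwprobe syscall. A minimal
sketch, assuming the RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF key name from
earlier in this series (adjust if the final name differs); the value names
are the ones this patch reports:

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <asm/hwprobe.h>

	int main(void)
	{
		struct riscv_hwprobe pair = {
			.key = RISCV_HWPROBE_KEY_VEC_MISALIGNED_PERF,
		};

		/* cpusetsize == 0 and cpus == NULL query all online CPUs. */
		if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0))
			return 1;

		if (pair.value == RISCV_HWPROBE_VEC_MISALIGNED_FAST)
			printf("misaligned vector accesses are faster\n");
		else
			printf("prefer vector byte accesses for memcpy\n");
		return 0;
	}
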
>>
>> Signed-off-by: Jesse Taube <jesse at rivosinc.com>
>> ---
>> V1 -> V2:
>> - Add Kconfig options
>> - Add WORD_EEW to vec-copy-unaligned.S
>> V2 -> V3:
>> - Remove unnecessary comment
>> - Remove local_irq_enable
>> ---
>> arch/riscv/Kconfig | 18 +++
>> arch/riscv/kernel/Makefile | 3 +-
>> arch/riscv/kernel/copy-unaligned.h | 5 +
>> arch/riscv/kernel/sys_hwprobe.c | 6 +
>> arch/riscv/kernel/unaligned_access_speed.c | 132 ++++++++++++++++++++-
>> arch/riscv/kernel/vec-copy-unaligned.S | 58 +++++++++
>> 6 files changed, 219 insertions(+), 3 deletions(-)
>> create mode 100644 arch/riscv/kernel/vec-copy-unaligned.S
>>
>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>> index ffbe0fdd7fb3..6f9fd3748916 100644
>> --- a/arch/riscv/Kconfig
>> +++ b/arch/riscv/Kconfig
>> @@ -807,6 +807,24 @@ config RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
>> will dynamically determine the speed of vector unaligned accesses on
>> the underlying system if they are supported.
>>
>> +config RISCV_SLOW_VEC_UNALIGNED_ACCESS
>> + bool "Assume the system supports slow vector unaligned memory accesses"
>> + depends on NONPORTABLE
>> + help
>> + Assume that the system supports slow vector unaligned memory accesses. The
>> + kernel and userspace programs may not be able to run at all on systems
>> + that do not support unaligned memory accesses.
>> +
>> +config RISCV_EFFICIENT_VEC_UNALIGNED_ACCESS
>> + bool "Assume the system supports fast vector unaligned memory accesses"
>> + depends on NONPORTABLE
>> + help
>> + Assume that the system supports fast vector unaligned memory accesses. When
>> + enabled, this option improves the performance of the kernel on such
>> + systems. However, the kernel and userspace programs will run much more
>> + slowly, or will not be able to run at all, on systems that do not
>> + support efficient unaligned memory accesses.
>> +
>> endchoice
>>
>> endmenu # "Platform type"
>> diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
>> index 5b243d46f4b1..291935a084d5 100644
>> --- a/arch/riscv/kernel/Makefile
>> +++ b/arch/riscv/kernel/Makefile
>> @@ -64,7 +64,8 @@ obj-$(CONFIG_MMU) += vdso.o vdso/
>>
>> obj-$(CONFIG_RISCV_MISALIGNED) += traps_misaligned.o
>> obj-$(CONFIG_RISCV_MISALIGNED) += unaligned_access_speed.o
>> -obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS) += copy-unaligned.o
>> +obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS) += copy-unaligned.o
>> +obj-$(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS) += vec-copy-unaligned.o
>>
>> obj-$(CONFIG_FPU) += fpu.o
>> obj-$(CONFIG_FPU) += kernel_mode_fpu.o
>> diff --git a/arch/riscv/kernel/copy-unaligned.h b/arch/riscv/kernel/copy-unaligned.h
>> index e3d70d35b708..85d4d11450cb 100644
>> --- a/arch/riscv/kernel/copy-unaligned.h
>> +++ b/arch/riscv/kernel/copy-unaligned.h
>> @@ -10,4 +10,9 @@
>> void __riscv_copy_words_unaligned(void *dst, const void *src, size_t size);
>> void __riscv_copy_bytes_unaligned(void *dst, const void *src, size_t size);
>>
>> +#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
>> +void __riscv_copy_vec_words_unaligned(void *dst, const void *src, size_t size);
>> +void __riscv_copy_vec_bytes_unaligned(void *dst, const void *src, size_t size);
>> +#endif
>> +
>> #endif /* __RISCV_KERNEL_COPY_UNALIGNED_H */
>> diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
>> index 5b78ea5a84d1..46de3f145154 100644
>> --- a/arch/riscv/kernel/sys_hwprobe.c
>> +++ b/arch/riscv/kernel/sys_hwprobe.c
>> @@ -221,6 +221,12 @@ static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
>> #else
>> static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
>> {
>> + if (IS_ENABLED(CONFIG_RISCV_EFFICIENT_VEC_UNALIGNED_ACCESS))
>> + return RISCV_HWPROBE_VEC_MISALIGNED_FAST;
>> +
>> + if (IS_ENABLED(CONFIG_RISCV_SLOW_VEC_UNALIGNED_ACCESS))
>> + return RISCV_HWPROBE_VEC_MISALIGNED_SLOW;
>> +
>> return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>> }
>> #endif
>> diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
>> index 8489e012cf23..a2a8f948500b 100644
>> --- a/arch/riscv/kernel/unaligned_access_speed.c
>> +++ b/arch/riscv/kernel/unaligned_access_speed.c
>> @@ -8,9 +8,11 @@
>> #include <linux/jump_label.h>
>> #include <linux/mm.h>
>> #include <linux/smp.h>
>> +#include <linux/kthread.h>
>> #include <linux/types.h>
>> #include <asm/cpufeature.h>
>> #include <asm/hwprobe.h>
>> +#include <asm/vector.h>
>>
>> #include "copy-unaligned.h"
>>
>> @@ -267,9 +269,129 @@ static int check_unaligned_access_speed_all_cpus(void)
>> }
>> #endif
>>
>> +#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
>> +static void check_vector_unaligned_access(struct work_struct *unused)
>> +{
>> + int cpu = smp_processor_id();
>> + u64 start_cycles, end_cycles;
>> + u64 word_cycles;
>> + u64 byte_cycles;
>> + int ratio;
>> + unsigned long start_jiffies, now;
>> + struct page *page;
>> + void *dst;
>> + void *src;
>> + long speed = RISCV_HWPROBE_VEC_MISALIGNED_SLOW;
>> +
>> + if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_VEC_MISALIGNED_SLOW)
>> + return;
>> +
>> + page = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
>> + if (!page) {
>> + pr_warn("Allocation failure, not measuring vector misaligned performance\n");
>> + return;
>> + }
>> +
>> + /* Make an unaligned destination buffer. */
>> + dst = (void *)((unsigned long)page_address(page) | 0x1);
>> + /* Unalign src as well, but differently (off by 1 + 2 = 3). */
>> + src = dst + (MISALIGNED_BUFFER_SIZE / 2);
>> + src += 2;
>> + word_cycles = -1ULL;
>> +
>> + /* Do a warmup. */
>> + kernel_vector_begin();
>
> Should there be a preempt_disable() in here too, or is that implied
> already by schedule_on_each_cpu? Mainly we want to minimize the amount
> of things that might soak up our jiffy and steal iterations from the
> loop.
To test, I put a schedule() here and got a "scheduling while atomic" BUG,
so this context is already atomic. I added preempt_disable() and
preempt_enable() anyway, to make sure preemption stays disabled.
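With that, the measured section looks roughly like this (a sketch, not the
exact diff):

	preempt_disable();
	kernel_vector_begin();

	/* ... warmup and the two timed measurement loops ... */

	kernel_vector_end();
	preempt_enable();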
>
>> + __riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
>> +
>> + start_jiffies = jiffies;
>> + while ((now = jiffies) == start_jiffies)
>> + cpu_relax();
>> +
>> + /*
>> + * For a fixed amount of time, repeatedly try the function, and take
>> + * the best time in cycles as the measurement.
>> + */
>> + while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
>> + start_cycles = get_cycles64();
>> + /* Ensure the CSR read can't reorder WRT to the copy. */
>> + mb();
>> + __riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
>> + /* Ensure the copy ends before the end time is snapped. */
>> + mb();
>> + end_cycles = get_cycles64();
>> + if ((end_cycles - start_cycles) < word_cycles)
>> + word_cycles = end_cycles - start_cycles;
>> + }
>> +
>> + byte_cycles = -1ULL;
>> + __riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
>> + start_jiffies = jiffies;
>> + while ((now = jiffies) == start_jiffies)
>> + cpu_relax();
>> +
>> + while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
>> + start_cycles = get_cycles64();
>> + mb();
>> + __riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
>> + mb();
>> + end_cycles = get_cycles64();
>> + if ((end_cycles - start_cycles) < byte_cycles)
>> + byte_cycles = end_cycles - start_cycles;
>> + }
>> +
>> + kernel_vector_end();
>> +
>> + /* Don't divide by zero. */
>> + if (!word_cycles || !byte_cycles) {
>> + pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned vector access speed\n",
>> + cpu);
>> +
>> + return;
>> + }
>> +
>> + if (word_cycles < byte_cycles)
>> + speed = RISCV_HWPROBE_VEC_MISALIGNED_FAST;
>> +
>> + ratio = div_u64((byte_cycles * 100), word_cycles);
>> + pr_info("cpu%d: Ratio of vector byte access time to vector unaligned word access is %d.%02d, unaligned accesses are %s\n",
>> + cpu,
>> + ratio / 100,
>> + ratio % 100,
>> + (speed == RISCV_HWPROBE_VEC_MISALIGNED_FAST) ? "fast" : "slow");
>> +
>> + per_cpu(vector_misaligned_access, cpu) = speed;
>> +}
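(Worked example of the reporting math, with made-up numbers: byte_cycles =
45000 and word_cycles = 30000 gives ratio = 4500000 / 30000 = 150, which the
%d.%02d format prints as "1.50"; since word_cycles < byte_cycles, the CPU is
reported as fast.)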
>> +
>> +static int riscv_online_cpu_vec(unsigned int cpu)
>> +{
>> + check_vector_unaligned_access(NULL);
>> + return 0;
>> +}
>> +
>> +/* Measure unaligned access speed on all CPUs present at boot in parallel. */
>> +static int vec_check_unaligned_access_speed_all_cpus(void *unused)
>> +{
>> + schedule_on_each_cpu(check_vector_unaligned_access);
>> +
>> + /*
>> + * Setup hotplug callbacks for any new CPUs that come online or go
>> + * offline.
>> + */
>> + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
>> + riscv_online_cpu_vec, NULL);
>> +
>> + return 0;
>> +}
>> +#else /* CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS */
>> +static int vec_check_unaligned_access_speed_all_cpus(void *unused)
>> +{
>> + return 0;
>> +}
>> +#endif
>> +
>> static int check_unaligned_access_all_cpus(void)
>> {
>> - bool all_cpus_emulated;
>> + bool all_cpus_emulated, all_cpus_vec_supported;
>> int cpu;
>>
>> if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICCLSM)) {
>> @@ -285,7 +407,13 @@ static int check_unaligned_access_all_cpus(void)
>> }
>>
>> all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
>> - check_vector_unaligned_access_emulated_all_cpus();
>> + all_cpus_vec_supported = check_vector_unaligned_access_emulated_all_cpus();
>> +
>> + if (all_cpus_vec_supported &&
>
> Note that with this here, if a heterogeneous system ever shows up with
> a mix of supported/unsupported, the supported cpus will end up with a
> misaligned vector speed of UNKNOWN. It might be nicer to change the
> semantics of that return value from all_cpus_vec_supported (aka
> entirely_supported) to entirely_unsupported. Then you'd only fire up
> the thread if (!entirely_unsupported). If you did that, you'd also
> need to bail early in the check on any CPUs that already had their
> value set to UNSUPPORTED.
Fortunately, check_vector_unaligned_access() already checks for this: it
bails out early unless the CPU's value is still SLOW.
> That way we can still avoid firing up this
> thread for machines that don't have misaligned V (or V at all), but
> not leave the question UNKNOWN in some cases.
That makes a lot of sense; I'm not sure why I didn't think of that.
I changed it to `all_cpus_vec_unsupported`, so the speed check will run if
at least one CPU has unaligned vector support.
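That is, the gating becomes roughly:

	all_cpus_vec_unsupported = check_vector_unaligned_access_emulated_all_cpus();

	if (!all_cpus_vec_unsupported &&
	    IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
		kthread_run(vec_check_unaligned_access_speed_all_cpus,
			    NULL, "vec_check_unaligned_access_speed_all_cpus");
	}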
Thanks,
Jesse Taube
>
>> + IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
>> + kthread_run(vec_check_unaligned_access_speed_all_cpus,
>> + NULL, "vec_check_unaligned_access_speed_all_cpus");
>> + }
>>
>> if (!all_cpus_emulated)
>> return check_unaligned_access_speed_all_cpus();
>> diff --git a/arch/riscv/kernel/vec-copy-unaligned.S b/arch/riscv/kernel/vec-copy-unaligned.S
>> new file mode 100644
>> index 000000000000..e5bc94917e60
>> --- /dev/null
>> +++ b/arch/riscv/kernel/vec-copy-unaligned.S
>> @@ -0,0 +1,58 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +/* Copyright (C) 2024 Rivos Inc. */
>> +
>> +#include <linux/linkage.h>
>> +#include <asm/asm.h>
>> +#include <linux/args.h>
>> +
>> + .text
>> +
>> +#define WORD_EEW 32
>> +
>> +#define WORD_SEW CONCATENATE(e, WORD_EEW)
>> +#define VEC_L CONCATENATE(vle, WORD_EEW).v
>> +#define VEC_S CONCATENATE(vse, WORD_EEW).v
>> +
>> +/* void __riscv_copy_vec_words_unaligned(void *, const void *, size_t) */
>> +/* Performs a memcpy without aligning buffers, using word loads and stores. */
>> +/* Note: The size is truncated to a multiple of WORD_EEW */
>> +SYM_FUNC_START(__riscv_copy_vec_words_unaligned)
>> + andi a4, a2, ~(WORD_EEW-1)
>> + beqz a4, 2f
>> + add a3, a1, a4
>> + .option push
>> + .option arch, +zve32x
>> +1:
>> + vsetivli t0, 8, WORD_SEW, m8, ta, ma
>> + VEC_L v0, (a1)
>> + VEC_S v0, (a0)
>> + addi a0, a0, WORD_EEW
>> + addi a1, a1, WORD_EEW
>> + bltu a1, a3, 1b
>> +
>> +2:
>> + .option pop
>> + ret
>> +SYM_FUNC_END(__riscv_copy_vec_words_unaligned)
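(For reference: with WORD_EEW = 32, WORD_SEW, VEC_L, and VEC_S above expand
to e32, vle32.v, and vse32.v, so each loop iteration moves 8 32-bit
elements, i.e. 32 bytes, matching the addi by WORD_EEW.)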
>> +
>> +/* void __riscv_copy_vec_bytes_unaligned(void *, const void *, size_t) */
>> +/* Performs a memcpy without aligning buffers, using only byte accesses. */
>> +/* Note: The size is truncated to a multiple of 8 */
>> +SYM_FUNC_START(__riscv_copy_vec_bytes_unaligned)
>> + andi a4, a2, ~(8-1)
>> + beqz a4, 2f
>> + add a3, a1, a4
>> + .option push
>> + .option arch, +zve32x
>> +1:
>> + vsetivli t0, 8, e8, m8, ta, ma
>> + vle8.v v0, (a1)
>> + vse8.v v0, (a0)
>> + addi a0, a0, 8
>> + addi a1, a1, 8
>> + bltu a1, a3, 1b
>> +
>> +2:
>> + .option pop
>> + ret
>> +SYM_FUNC_END(__riscv_copy_vec_bytes_unaligned)
>> --
>> 2.45.2
>>