[PATCH v3 2/2] riscv: Disable misaligned access probe when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS

Charlie Jenkins charlie at rivosinc.com
Fri Feb 16 11:24:19 PST 2024


On Wed, Feb 14, 2024 at 09:24:16AM -0500, Evan Green wrote:
> On Fri, Feb 2, 2024 at 6:54 PM Charlie Jenkins <charlie at rivosinc.com> wrote:
> >
> > When CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is selected, the cpus can be
> > set to have fast misaligned access without needing to probe.
> >
> > To avoid some ifdefs, move unalignment probing code into its own file
> > and make CONFIG_RISCV_MISALIGNED depend on
> > CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
> >
> > Signed-off-by: Charlie Jenkins <charlie at rivosinc.com>
> 
> One nit below, but feel free to add my tag on the next spin:
> 
> Reviewed-by: Evan Green <evan at rivosinc.com>

Thanks! I will apply your suggestions. I am also going to add support
for selecting the other alignment speeds so I will leave off the
reviewed-by for now in case you have further suggestions for that.

- Charlie

> 
> > ---
> >  arch/riscv/Kconfig                          |   1 +
> >  arch/riscv/include/asm/cpufeature.h         |   7 +
> >  arch/riscv/kernel/Makefile                  |   3 +
> >  arch/riscv/kernel/cpufeature.c              | 255 --------------------------
> >  arch/riscv/kernel/misaligned_access_speed.c | 265 ++++++++++++++++++++++++++++
> >  arch/riscv/kernel/sys_hwprobe.c             |   4 +
> >  6 files changed, 280 insertions(+), 255 deletions(-)
> >
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index bffbd869a068..3223d2d08f74 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -690,6 +690,7 @@ config THREAD_SIZE_ORDER
> >  config RISCV_MISALIGNED
> >         bool "Support misaligned load/store traps for kernel and userspace"
> >         select SYSCTL_ARCH_UNALIGN_ALLOW
> > +       depends on !HAVE_EFFICIENT_UNALIGNED_ACCESS
> >         default y
> >         help
> >           Say Y here if you want the kernel to embed support for misaligned
> > diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
> > index eb3ac304fc42..44734e5169b1 100644
> > --- a/arch/riscv/include/asm/cpufeature.h
> > +++ b/arch/riscv/include/asm/cpufeature.h
> > @@ -51,6 +51,12 @@ static inline bool check_unaligned_access_emulated(int cpu)
> >  static inline void unaligned_emulation_finish(void) {}
> >  #endif
> >
> > +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> > +static __always_inline bool has_fast_misaligned_accesses(void)
> > +{
> > +       return true;
> > +}
> > +#else
> >  DECLARE_PER_CPU(long, misaligned_access_speed);
> >
> >  DECLARE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key);
> > @@ -59,6 +65,7 @@ static __always_inline bool has_fast_misaligned_accesses(void)
> >  {
> >         return static_branch_likely(&fast_misaligned_access_speed_key);
> >  }
> > +#endif
> >
> >  unsigned long riscv_get_elf_hwcap(void);
> >
> > diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> > index f71910718053..ffba5ecf12c2 100644
> > --- a/arch/riscv/kernel/Makefile
> > +++ b/arch/riscv/kernel/Makefile
> > @@ -62,6 +62,9 @@ obj-y += tests/
> >  obj-$(CONFIG_MMU) += vdso.o vdso/
> >
> >  obj-$(CONFIG_RISCV_MISALIGNED) += traps_misaligned.o
> > +ifneq ($(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS), y)
> > +obj-y  += misaligned_access_speed.o
> 
> copy-unaligned.o can also go in here (those are the assembly copy
> routines used only by the probing code).
> 
> > +endif
> >  obj-$(CONFIG_FPU)              += fpu.o
> >  obj-$(CONFIG_RISCV_ISA_V)      += vector.o
> >  obj-$(CONFIG_RISCV_ISA_V)      += kernel_mode_vector.o
> > diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> > index 89920f84d0a3..319670af5704 100644
> > --- a/arch/riscv/kernel/cpufeature.c
> > +++ b/arch/riscv/kernel/cpufeature.c
> > @@ -11,7 +11,6 @@
> >  #include <linux/cpu.h>
> >  #include <linux/cpuhotplug.h>
> >  #include <linux/ctype.h>
> > -#include <linux/jump_label.h>
> >  #include <linux/log2.h>
> >  #include <linux/memory.h>
> >  #include <linux/module.h>
> > @@ -21,20 +20,12 @@
> >  #include <asm/cacheflush.h>
> >  #include <asm/cpufeature.h>
> >  #include <asm/hwcap.h>
> > -#include <asm/hwprobe.h>
> >  #include <asm/patch.h>
> >  #include <asm/processor.h>
> >  #include <asm/vector.h>
> >
> > -#include "copy-unaligned.h"
> > -
> >  #define NUM_ALPHA_EXTS ('z' - 'a' + 1)
> >
> > -#define MISALIGNED_ACCESS_JIFFIES_LG2 1
> > -#define MISALIGNED_BUFFER_SIZE 0x4000
> > -#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
> > -#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
> > -
> >  unsigned long elf_hwcap __read_mostly;
> >
> >  /* Host ISA bitmap */
> > @@ -43,11 +34,6 @@ static DECLARE_BITMAP(riscv_isa, RISCV_ISA_EXT_MAX) __read_mostly;
> >  /* Per-cpu ISA extensions. */
> >  struct riscv_isainfo hart_isa[NR_CPUS];
> >
> > -/* Performance information */
> > -DEFINE_PER_CPU(long, misaligned_access_speed);
> > -
> > -static cpumask_t fast_misaligned_access;
> > -
> >  /**
> >   * riscv_isa_extension_base() - Get base extension word
> >   *
> > @@ -706,247 +692,6 @@ unsigned long riscv_get_elf_hwcap(void)
> >         return hwcap;
> >  }
> >
> > -static int check_unaligned_access(void *param)
> > -{
> > -       int cpu = smp_processor_id();
> > -       u64 start_cycles, end_cycles;
> > -       u64 word_cycles;
> > -       u64 byte_cycles;
> > -       int ratio;
> > -       unsigned long start_jiffies, now;
> > -       struct page *page = param;
> > -       void *dst;
> > -       void *src;
> > -       long speed = RISCV_HWPROBE_MISALIGNED_SLOW;
> > -
> > -       if (check_unaligned_access_emulated(cpu))
> > -               return 0;
> > -
> > -       /* Make an unaligned destination buffer. */
> > -       dst = (void *)((unsigned long)page_address(page) | 0x1);
> > -       /* Unalign src as well, but differently (off by 1 + 2 = 3). */
> > -       src = dst + (MISALIGNED_BUFFER_SIZE / 2);
> > -       src += 2;
> > -       word_cycles = -1ULL;
> > -       /* Do a warmup. */
> > -       __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
> > -       preempt_disable();
> > -       start_jiffies = jiffies;
> > -       while ((now = jiffies) == start_jiffies)
> > -               cpu_relax();
> > -
> > -       /*
> > -        * For a fixed amount of time, repeatedly try the function, and take
> > -        * the best time in cycles as the measurement.
> > -        */
> > -       while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
> > -               start_cycles = get_cycles64();
> > -               /* Ensure the CSR read can't reorder WRT to the copy. */
> > -               mb();
> > -               __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
> > -               /* Ensure the copy ends before the end time is snapped. */
> > -               mb();
> > -               end_cycles = get_cycles64();
> > -               if ((end_cycles - start_cycles) < word_cycles)
> > -                       word_cycles = end_cycles - start_cycles;
> > -       }
> > -
> > -       byte_cycles = -1ULL;
> > -       __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
> > -       start_jiffies = jiffies;
> > -       while ((now = jiffies) == start_jiffies)
> > -               cpu_relax();
> > -
> > -       while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
> > -               start_cycles = get_cycles64();
> > -               mb();
> > -               __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
> > -               mb();
> > -               end_cycles = get_cycles64();
> > -               if ((end_cycles - start_cycles) < byte_cycles)
> > -                       byte_cycles = end_cycles - start_cycles;
> > -       }
> > -
> > -       preempt_enable();
> > -
> > -       /* Don't divide by zero. */
> > -       if (!word_cycles || !byte_cycles) {
> > -               pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
> > -                       cpu);
> > -
> > -               return 0;
> > -       }
> > -
> > -       if (word_cycles < byte_cycles)
> > -               speed = RISCV_HWPROBE_MISALIGNED_FAST;
> > -
> > -       ratio = div_u64((byte_cycles * 100), word_cycles);
> > -       pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
> > -               cpu,
> > -               ratio / 100,
> > -               ratio % 100,
> > -               (speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow");
> > -
> > -       per_cpu(misaligned_access_speed, cpu) = speed;
> > -
> > -       /*
> > -        * Set the value of fast_misaligned_access of a CPU. These operations
> > -        * are atomic to avoid race conditions.
> > -        */
> > -       if (speed == RISCV_HWPROBE_MISALIGNED_FAST)
> > -               cpumask_set_cpu(cpu, &fast_misaligned_access);
> > -       else
> > -               cpumask_clear_cpu(cpu, &fast_misaligned_access);
> > -
> > -       return 0;
> > -}
> > -
> > -static void check_unaligned_access_nonboot_cpu(void *param)
> > -{
> > -       unsigned int cpu = smp_processor_id();
> > -       struct page **pages = param;
> > -
> > -       if (smp_processor_id() != 0)
> > -               check_unaligned_access(pages[cpu]);
> > -}
> > -
> > -DEFINE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key);
> > -
> > -static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
> > -{
> > -       if (cpumask_weight(mask) == weight)
> > -               static_branch_enable_cpuslocked(&fast_misaligned_access_speed_key);
> > -       else
> > -               static_branch_disable_cpuslocked(&fast_misaligned_access_speed_key);
> > -}
> > -
> > -static void set_unaligned_access_static_branches_except_cpu(int cpu)
> > -{
> > -       /*
> > -        * Same as set_unaligned_access_static_branches, except excludes the
> > -        * given CPU from the result. When a CPU is hotplugged into an offline
> > -        * state, this function is called before the CPU is set to offline in
> > -        * the cpumask, and thus the CPU needs to be explicitly excluded.
> > -        */
> > -
> > -       cpumask_t fast_except_me;
> > -
> > -       cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
> > -       cpumask_clear_cpu(cpu, &fast_except_me);
> > -
> > -       modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
> > -}
> > -
> > -static void set_unaligned_access_static_branches(void)
> > -{
> > -       /*
> > -        * This will be called after check_unaligned_access_all_cpus so the
> > -        * result of unaligned access speed for all CPUs will be available.
> > -        *
> > -        * To avoid the number of online cpus changing between reading
> > -        * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
> > -        * held before calling this function.
> > -        */
> > -
> > -       cpumask_t fast_and_online;
> > -
> > -       cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);
> > -
> > -       modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
> > -}
> > -
> > -static int lock_and_set_unaligned_access_static_branch(void)
> > -{
> > -       cpus_read_lock();
> > -       set_unaligned_access_static_branches();
> > -       cpus_read_unlock();
> > -
> > -       return 0;
> > -}
> > -
> > -arch_initcall_sync(lock_and_set_unaligned_access_static_branch);
> > -
> > -static int riscv_online_cpu(unsigned int cpu)
> > -{
> > -       static struct page *buf;
> > -
> > -       /* We are already set since the last check */
> > -       if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN)
> > -               goto exit;
> > -
> > -       buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
> > -       if (!buf) {
> > -               pr_warn("Allocation failure, not measuring misaligned performance\n");
> > -               return -ENOMEM;
> > -       }
> > -
> > -       check_unaligned_access(buf);
> > -       __free_pages(buf, MISALIGNED_BUFFER_ORDER);
> > -
> > -exit:
> > -       set_unaligned_access_static_branches();
> > -
> > -       return 0;
> > -}
> > -
> > -static int riscv_offline_cpu(unsigned int cpu)
> > -{
> > -       set_unaligned_access_static_branches_except_cpu(cpu);
> > -
> > -       return 0;
> > -}
> > -
> > -/* Measure unaligned access on all CPUs present at boot in parallel. */
> > -static int check_unaligned_access_all_cpus(void)
> > -{
> > -       unsigned int cpu;
> > -       unsigned int cpu_count = num_possible_cpus();
> > -       struct page **bufs = kzalloc(cpu_count * sizeof(struct page *),
> > -                                    GFP_KERNEL);
> > -
> > -       if (!bufs) {
> > -               pr_warn("Allocation failure, not measuring misaligned performance\n");
> > -               return 0;
> > -       }
> > -
> > -       /*
> > -        * Allocate separate buffers for each CPU so there's no fighting over
> > -        * cache lines.
> > -        */
> > -       for_each_cpu(cpu, cpu_online_mask) {
> > -               bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
> > -               if (!bufs[cpu]) {
> > -                       pr_warn("Allocation failure, not measuring misaligned performance\n");
> > -                       goto out;
> > -               }
> > -       }
> > -
> > -       /* Check everybody except 0, who stays behind to tend jiffies. */
> > -       on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);
> > -
> > -       /* Check core 0. */
> > -       smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);
> > -
> > -       /*
> > -        * Setup hotplug callbacks for any new CPUs that come online or go
> > -        * offline.
> > -        */
> > -       cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
> > -                                 riscv_online_cpu, riscv_offline_cpu);
> > -
> > -out:
> > -       unaligned_emulation_finish();
> > -       for_each_cpu(cpu, cpu_online_mask) {
> > -               if (bufs[cpu])
> > -                       __free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
> > -       }
> > -
> > -       kfree(bufs);
> > -       return 0;
> > -}
> > -
> > -arch_initcall(check_unaligned_access_all_cpus);
> > -
> >  void riscv_user_isa_enable(void)
> >  {
> >         if (riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_ZICBOZ))
> > diff --git a/arch/riscv/kernel/misaligned_access_speed.c b/arch/riscv/kernel/misaligned_access_speed.c
> > new file mode 100644
> > index 000000000000..b725c07dd1af
> > --- /dev/null
> > +++ b/arch/riscv/kernel/misaligned_access_speed.c
> > @@ -0,0 +1,265 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * Copyright 2024 Rivos Inc.
> > + */
> > +
> > +#include <linux/cpu.h>
> > +#include <linux/cpumask.h>
> > +#include <linux/jump_label.h>
> > +#include <linux/mm.h>
> > +#include <linux/smp.h>
> > +#include <linux/types.h>
> > +#include <asm/cpufeature.h>
> > +#include <asm/hwprobe.h>
> > +
> > +#include "copy-unaligned.h"
> > +
> > +#define MISALIGNED_ACCESS_JIFFIES_LG2 1
> > +#define MISALIGNED_BUFFER_SIZE 0x4000
> > +#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
> > +#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
> > +
> > +DEFINE_PER_CPU(long, misaligned_access_speed);
> > +
> > +static cpumask_t fast_misaligned_access;
> > +
> > +static int check_unaligned_access(void *param)
> > +{
> > +       int cpu = smp_processor_id();
> > +       u64 start_cycles, end_cycles;
> > +       u64 word_cycles;
> > +       u64 byte_cycles;
> > +       int ratio;
> > +       unsigned long start_jiffies, now;
> > +       struct page *page = param;
> > +       void *dst;
> > +       void *src;
> > +       long speed = RISCV_HWPROBE_MISALIGNED_SLOW;
> > +
> > +       if (check_unaligned_access_emulated(cpu))
> > +               return 0;
> > +
> > +       /* Make an unaligned destination buffer. */
> > +       dst = (void *)((unsigned long)page_address(page) | 0x1);
> > +       /* Unalign src as well, but differently (off by 1 + 2 = 3). */
> > +       src = dst + (MISALIGNED_BUFFER_SIZE / 2);
> > +       src += 2;
> > +       word_cycles = -1ULL;
> > +       /* Do a warmup. */
> > +       __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
> > +       preempt_disable();
> > +       start_jiffies = jiffies;
> > +       while ((now = jiffies) == start_jiffies)
> > +               cpu_relax();
> > +
> > +       /*
> > +        * For a fixed amount of time, repeatedly try the function, and take
> > +        * the best time in cycles as the measurement.
> > +        */
> > +       while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
> > +               start_cycles = get_cycles64();
> > +               /* Ensure the CSR read can't reorder WRT to the copy. */
> > +               mb();
> > +               __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
> > +               /* Ensure the copy ends before the end time is snapped. */
> > +               mb();
> > +               end_cycles = get_cycles64();
> > +               if ((end_cycles - start_cycles) < word_cycles)
> > +                       word_cycles = end_cycles - start_cycles;
> > +       }
> > +
> > +       byte_cycles = -1ULL;
> > +       __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
> > +       start_jiffies = jiffies;
> > +       while ((now = jiffies) == start_jiffies)
> > +               cpu_relax();
> > +
> > +       while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
> > +               start_cycles = get_cycles64();
> > +               mb();
> > +               __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
> > +               mb();
> > +               end_cycles = get_cycles64();
> > +               if ((end_cycles - start_cycles) < byte_cycles)
> > +                       byte_cycles = end_cycles - start_cycles;
> > +       }
> > +
> > +       preempt_enable();
> > +
> > +       /* Don't divide by zero. */
> > +       if (!word_cycles || !byte_cycles) {
> > +               pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
> > +                       cpu);
> > +
> > +               return 0;
> > +       }
> > +
> > +       if (word_cycles < byte_cycles)
> > +               speed = RISCV_HWPROBE_MISALIGNED_FAST;
> > +
> > +       ratio = div_u64((byte_cycles * 100), word_cycles);
> > +       pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
> > +               cpu,
> > +               ratio / 100,
> > +               ratio % 100,
> > +               (speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow");
> > +
> > +       per_cpu(misaligned_access_speed, cpu) = speed;
> > +
> > +       /*
> > +        * Set the value of fast_misaligned_access of a CPU. These operations
> > +        * are atomic to avoid race conditions.
> > +        */
> > +       if (speed == RISCV_HWPROBE_MISALIGNED_FAST)
> > +               cpumask_set_cpu(cpu, &fast_misaligned_access);
> > +       else
> > +               cpumask_clear_cpu(cpu, &fast_misaligned_access);
> > +
> > +       return 0;
> > +}
> > +
> > +static void check_unaligned_access_nonboot_cpu(void *param)
> > +{
> > +       unsigned int cpu = smp_processor_id();
> > +       struct page **pages = param;
> > +
> > +       if (smp_processor_id() != 0)
> > +               check_unaligned_access(pages[cpu]);
> > +}
> > +
> > +DEFINE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key);
> > +
> > +static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
> > +{
> > +       if (cpumask_weight(mask) == weight)
> > +               static_branch_enable_cpuslocked(&fast_misaligned_access_speed_key);
> > +       else
> > +               static_branch_disable_cpuslocked(&fast_misaligned_access_speed_key);
> > +}
> > +
> > +static void set_unaligned_access_static_branches_except_cpu(int cpu)
> > +{
> > +       /*
> > +        * Same as set_unaligned_access_static_branches, except excludes the
> > +        * given CPU from the result. When a CPU is hotplugged into an offline
> > +        * state, this function is called before the CPU is set to offline in
> > +        * the cpumask, and thus the CPU needs to be explicitly excluded.
> > +        */
> > +
> > +       cpumask_t fast_except_me;
> > +
> > +       cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
> > +       cpumask_clear_cpu(cpu, &fast_except_me);
> > +
> > +       modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
> > +}
> > +
> > +static void set_unaligned_access_static_branches(void)
> > +{
> > +       /*
> > +        * This will be called after check_unaligned_access_all_cpus so the
> > +        * result of unaligned access speed for all CPUs will be available.
> > +        *
> > +        * To avoid the number of online cpus changing between reading
> > +        * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
> > +        * held before calling this function.
> > +        */
> > +
> > +       cpumask_t fast_and_online;
> > +
> > +       cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);
> > +
> > +       modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
> > +}
> > +
> > +static int lock_and_set_unaligned_access_static_branch(void)
> > +{
> > +       cpus_read_lock();
> > +       set_unaligned_access_static_branches();
> > +       cpus_read_unlock();
> > +
> > +       return 0;
> > +}
> > +
> > +arch_initcall_sync(lock_and_set_unaligned_access_static_branch);
> > +
> > +static int riscv_online_cpu(unsigned int cpu)
> > +{
> > +       static struct page *buf;
> > +
> > +       /* We are already set since the last check */
> > +       if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN)
> > +               goto exit;
> > +
> > +       buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
> > +       if (!buf) {
> > +               pr_warn("Allocation failure, not measuring misaligned performance\n");
> > +               return -ENOMEM;
> > +       }
> > +
> > +       check_unaligned_access(buf);
> > +       __free_pages(buf, MISALIGNED_BUFFER_ORDER);
> > +
> > +exit:
> > +       set_unaligned_access_static_branches();
> > +
> > +       return 0;
> > +}
> > +
> > +static int riscv_offline_cpu(unsigned int cpu)
> > +{
> > +       set_unaligned_access_static_branches_except_cpu(cpu);
> > +
> > +       return 0;
> > +}
> > +
> > +/* Measure unaligned access on all CPUs present at boot in parallel. */
> > +static int check_unaligned_access_all_cpus(void)
> > +{
> > +       unsigned int cpu;
> > +       unsigned int cpu_count = num_possible_cpus();
> > +       struct page **bufs = kzalloc(cpu_count * sizeof(struct page *),
> > +                                    GFP_KERNEL);
> > +
> > +       if (!bufs) {
> > +               pr_warn("Allocation failure, not measuring misaligned performance\n");
> > +               return 0;
> > +       }
> > +
> > +       /*
> > +        * Allocate separate buffers for each CPU so there's no fighting over
> > +        * cache lines.
> > +        */
> > +       for_each_cpu(cpu, cpu_online_mask) {
> > +               bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
> > +               if (!bufs[cpu]) {
> > +                       pr_warn("Allocation failure, not measuring misaligned performance\n");
> > +                       goto out;
> > +               }
> > +       }
> > +
> > +       /* Check everybody except 0, who stays behind to tend jiffies. */
> > +       on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);
> > +
> > +       /* Check core 0. */
> > +       smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);
> > +
> > +       /*
> > +        * Setup hotplug callbacks for any new CPUs that come online or go
> > +        * offline.
> > +        */
> > +       cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
> > +                                 riscv_online_cpu, riscv_offline_cpu);
> > +
> > +out:
> > +       unaligned_emulation_finish();
> > +       for_each_cpu(cpu, cpu_online_mask) {
> > +               if (bufs[cpu])
> > +                       __free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
> > +       }
> > +
> > +       kfree(bufs);
> > +       return 0;
> > +}
> > +
> > +arch_initcall(check_unaligned_access_all_cpus);
> > diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
> > index a7c56b41efd2..d9bd24776a3e 100644
> > --- a/arch/riscv/kernel/sys_hwprobe.c
> > +++ b/arch/riscv/kernel/sys_hwprobe.c
> > @@ -149,6 +149,9 @@ static bool hwprobe_ext0_has(const struct cpumask *cpus, unsigned long ext)
> >
> >  static u64 hwprobe_misaligned(const struct cpumask *cpus)
> >  {
> > +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> > +       return RISCV_HWPROBE_MISALIGNED_FAST;
> > +#else
> 
> Ok, two nits, this is a little clunky with the ifdef inside the
> function body. The pattern I see more often is the ifdef/else each
> containing a separate function definition. That would work nicely
> here.
> 
> >         int cpu;
> >         u64 perf = -1ULL;
> >
> > @@ -168,6 +171,7 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus)
> >                 return RISCV_HWPROBE_MISALIGNED_UNKNOWN;
> >
> >         return perf;
> > +#endif
> >  }
> >
> >  static void hwprobe_one_pair(struct riscv_hwprobe *pair,
> >
> > --
> > 2.43.0
> >



More information about the linux-riscv mailing list