[RFC PATCH v0] RISCV: Report vector unaligned accesses hwprobe

Jesse Taube jesse at rivosinc.com
Wed Jun 5 09:19:38 PDT 2024



On 6/5/24 11:54, Evan Green wrote:
> On Tue, Jun 4, 2024 at 9:25 AM Jesse Taube <jesse at rivosinc.com> wrote:
>>
>> Detected if a system traps into the kernel on an vector unaligned access.
>> Add the result to a new key in hwprobe.
>>
>> Signed-off-by: Jesse Taube <jesse at rivosinc.com>
> 
> For the scalar world, we wanted to know whether misaligned accesses
> were faster or slower than equivalent byte accesses, so usermode could
> know for something like memcpy which option had better bandwidth. 

Yes, that will come in another patch, which will set the value to either
fast or slow. This patch only detects whether misaligned vector accesses
can run at all.

> Is
> the motivation here the same, where we're going to use vector
> registers for memcpy and we want to know which size load to use? Or
> will usermode be consuming this info for a different purpose as well?
> I know this is a basic question, but having the motivation helps me
> get the right lens for reviewing it. Perhaps that should be added to
> the commit message as well.
> 
>> ---
>>   arch/riscv/include/asm/cpufeature.h        |  3 ++
>>   arch/riscv/include/asm/hwprobe.h           |  2 +-
>>   arch/riscv/include/uapi/asm/hwprobe.h      |  6 +++
>>   arch/riscv/kernel/sys_hwprobe.c            | 34 ++++++++++++
>>   arch/riscv/kernel/traps_misaligned.c       | 60 ++++++++++++++++++++++
>>   arch/riscv/kernel/unaligned_access_speed.c |  4 ++
>>   6 files changed, 108 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
>> index 347805446151..5ad69cf25b25 100644
>> --- a/arch/riscv/include/asm/cpufeature.h
>> +++ b/arch/riscv/include/asm/cpufeature.h
>> @@ -35,9 +35,12 @@ void riscv_user_isa_enable(void);
>>
>>   #if defined(CONFIG_RISCV_MISALIGNED)
>>   bool check_unaligned_access_emulated_all_cpus(void);
>> +bool check_vector_unaligned_access_all_cpus(void);
>> +
>>   void unaligned_emulation_finish(void);
>>   bool unaligned_ctl_available(void);
>>   DECLARE_PER_CPU(long, misaligned_access_speed);
>> +DECLARE_PER_CPU(long, vector_misaligned_access);
>>   #else
>>   static inline bool unaligned_ctl_available(void)
>>   {
>> diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
>> index 630507dff5ea..150a9877b0af 100644
>> --- a/arch/riscv/include/asm/hwprobe.h
>> +++ b/arch/riscv/include/asm/hwprobe.h
>> @@ -8,7 +8,7 @@
>>
>>   #include <uapi/asm/hwprobe.h>
>>
>> -#define RISCV_HWPROBE_MAX_KEY 6
>> +#define RISCV_HWPROBE_MAX_KEY 7
>>
>>   static inline bool riscv_hwprobe_key_is_valid(__s64 key)
>>   {
>> diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h
>> index 060212331a03..4474e98d17bd 100644
>> --- a/arch/riscv/include/uapi/asm/hwprobe.h
>> +++ b/arch/riscv/include/uapi/asm/hwprobe.h
>> @@ -68,6 +68,12 @@ struct riscv_hwprobe {
>>   #define                RISCV_HWPROBE_MISALIGNED_UNSUPPORTED    (4 << 0)
>>   #define                RISCV_HWPROBE_MISALIGNED_MASK           (7 << 0)
>>   #define RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE    6
>> +#define RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF  7
>> +#define                RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN            0
>> +#define                RISCV_HWPROBE_VEC_MISALIGNED_EMULATED           1
>> +#define                RISCV_HWPROBE_VEC_MISALIGNED_SLOW               2
>> +#define                RISCV_HWPROBE_VEC_MISALIGNED_FAST               3
>> +#define                RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED        4
> 
> This needs to be added to the documentation as well.
> 
> What value should be returned when V is not enabled in the kernel, or
> V is not supported in the hardware? Currently in the code it would be
> UNKNOWN, right?

Yes. It would be trivial to set it to unsupported if V is not enabled.

> Is that what we want, or is it worth differentiating
> "no support for V" from "I don't know the speed of misaligned loads"?
> Maybe UNKNOWN is the right value, as there are other values to tell
> you V is not enabled.
> 
> 
>>   /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
>>
>>   /* Flags */
>> diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
>> index b286b73e763e..ce641cc6e47a 100644
>> --- a/arch/riscv/kernel/sys_hwprobe.c
>> +++ b/arch/riscv/kernel/sys_hwprobe.c
>> @@ -184,6 +184,36 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus)
>>   }
>>   #endif
>>
>> +#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
>> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
>> +{
>> +       int cpu;
>> +       u64 perf = -1ULL;
>> +
>> +       for_each_cpu(cpu, cpus) {
>> +               int this_perf = per_cpu(vector_misaligned_access, cpu);
>> +
>> +               if (perf == -1ULL)
>> +                       perf = this_perf;
>> +
>> +               if (perf != this_perf) {
>> +                       perf = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>> +                       break;
>> +               }
>> +       }
>> +
>> +       if (perf == -1ULL)
>> +               return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>> +
>> +       return perf;
>> +}
>> +#else
>> +static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
>> +{
>> +       return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>> +}
>> +#endif
>> +
>>   static void hwprobe_one_pair(struct riscv_hwprobe *pair,
>>                               const struct cpumask *cpus)
>>   {
>> @@ -211,6 +241,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair,
>>                  pair->value = hwprobe_misaligned(cpus);
>>                  break;
>>
>> +       case RISCV_HWPROBE_VEC_KEY_MISALIGNED_PERF:
>> +               pair->value = hwprobe_vec_misaligned(cpus);
>> +               break;
>> +
>>          case RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE:
>>                  pair->value = 0;
>>                  if (hwprobe_ext0_has(cpus, RISCV_HWPROBE_EXT_ZICBOZ))
>> diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
>> index 2adb7c3e4dd5..0c07e990e9c5 100644
>> --- a/arch/riscv/kernel/traps_misaligned.c
>> +++ b/arch/riscv/kernel/traps_misaligned.c
>> @@ -16,6 +16,7 @@
>>   #include <asm/entry-common.h>
>>   #include <asm/hwprobe.h>
>>   #include <asm/cpufeature.h>
>> +#include <asm/vector.h>
>>
>>   #define INSN_MATCH_LB                  0x3
>>   #define INSN_MASK_LB                   0x707f
>> @@ -426,6 +427,14 @@ int handle_misaligned_load(struct pt_regs *regs)
>>          if (get_insn(regs, epc, &insn))
>>                  return -1;
>>
>> +#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>> +       if (*this_cpu_ptr(&vector_misaligned_access) == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN) {
>> +               *this_cpu_ptr(&vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNSUPPORTED;
> 
> Shouldn't this be EMULATED, given we were just delegated this trap? I
> guess it depends on whether you're going to add support for actually
> handling the misaligned vector trap, as Clément mentioned.

Yeah, I accidentally forgot to check if the trap was caused by a vector
instruction. I was not planning on emulating them; this is just a
one-time check.

> 
> Scalar misaligned loads had a history to lean on since the specs were
> always explicit that misaligned loads/store had to be supported one
> way or another. So UNSUPPORTED was a future theoretical value. I
> haven't dug through the specs yet, do you know what the story is for V
> and misaligned loads?

"
8. Vector Memory Alignment Constraints
  If an element accessed by a vector memory instruction is not naturally 
aligned to the size of the element, either the element is transferred 
successfully or an address misaligned exception is raised on that element.
Support for misaligned vector memory accesses is independent of an 
implementation’s support for misaligned scalar memory accesses.
"

So an access is only guaranteed to be supported if the load/store is
naturally aligned to the size of the element.

> My sub-question is what the rules are for
> detecting the difference between EMULATED and UNSUPPORTED.

For vector, EMULATED isn't used, though it can be added if I add emulation.
Ideally that would be done in the SBI.

> 
>> +               regs->epc = epc + INSN_LEN(insn);
>> +               return 0;
>> +       }
>> +#endif
>> +
>>          regs->epc = 0;
>>
>>          if ((insn & INSN_MASK_LW) == INSN_MATCH_LW) {
>> @@ -625,6 +634,57 @@ static bool check_unaligned_access_emulated(int cpu)
>>          return misaligned_emu_detected;
>>   }
>>
>> +#ifdef CONFIG_RISCV_ISA_V
>> +static bool check_vector_unaligned_access(int cpu)
>> +{
>> +       long *mas_ptr = per_cpu_ptr(&vector_misaligned_access, cpu);
>> +       struct riscv_isainfo *isainfo = &hart_isa[cpu];
>> +       unsigned long tmp_var;
>> +       bool misaligned_vec_suported;
>> +
>> +       if (!riscv_isa_extension_available(isainfo->isa, v))
>> +               return false;
>> +
>> +       /* This case will only happen if a unaligned vector load
>> +        * was called by the kernel before this check
>> +        */
>> +       if (*mas_ptr != RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN)
>> +               return false;
>> +
>> +       kernel_vector_begin();
>> +       __asm__ __volatile__ (
>> +               ".option push\n\t"
>> +               ".option arch, +v\n\t"
>> +               "       li t1, 0x1\n"                           //size
>> +               "       vsetvli t0, t1, e16, m2, ta, ma\n\t"    // Vectors of 16b
>> +               "       addi t0, %[ptr], 1\n\t"                 // Misalign address
>> +               "       vle16.v v0, (t0)\n\t"                   // Load bytes
>> +               ".option pop\n\t"
>> +               : : [ptr] "r" (&tmp_var) : "v0", "t0", "t1", "memory");
>> +       kernel_vector_end();
>> +
>> +       misaligned_vec_suported = (*mas_ptr == RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN);
>> +
>> +       return misaligned_vec_suported;
>> +}
>> +#else
>> +static bool check_vector_unaligned_access(int cpu)
>> +{
>> +       return false;
>> +}
>> +#endif
>> +
>> +bool check_vector_unaligned_access_all_cpus(void)
>> +{
>> +       int cpu;
>> +
>> +       for_each_online_cpu(cpu)
>> +               if (!check_vector_unaligned_access(cpu))
>> +                       return false;
>> +
>> +       return true;
>> +}
> 
> These functions return a bool, but the bool is never checked. I'm
> guessing that's because you're going to check it in a future patch
> where you decide whether or not to probe for fast/slow?

Yes. Relating to your first question: the values should have the same
meaning as for scalar. So UNKNOWN would mean it's supported, but we don't
know the speed.

> 
> 
>> +
>>   bool check_unaligned_access_emulated_all_cpus(void)
>>   {
>>          int cpu;
>> diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
>> index a9a6bcb02acf..92a84239beaa 100644
>> --- a/arch/riscv/kernel/unaligned_access_speed.c
>> +++ b/arch/riscv/kernel/unaligned_access_speed.c
>> @@ -20,6 +20,7 @@
>>   #define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
>>
>>   DEFINE_PER_CPU(long, misaligned_access_speed);
>> +DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
>>
>>   #ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>>   static cpumask_t fast_misaligned_access;
>> @@ -264,6 +265,8 @@ static int check_unaligned_access_all_cpus(void)
>>   {
>>          bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
>>
>> +       check_vector_unaligned_access_all_cpus();
>> +
>>          if (!all_cpus_emulated)
>>                  return check_unaligned_access_speed_all_cpus();
>>
>> @@ -273,6 +276,7 @@ static int check_unaligned_access_all_cpus(void)
>>   static int check_unaligned_access_all_cpus(void)
>>   {
>>          check_unaligned_access_emulated_all_cpus();
>> +       check_vector_unaligned_access_all_cpus();
>>
>>          return 0;
>>   }
>> --
>> 2.43.0
>>



More information about the linux-riscv mailing list