[PATCH 3/5] riscv: Vector checksum header

Mon Aug 28 11:22:46 PDT 2023

On 2023-08-26 8:26 PM, Charlie Jenkins wrote:
> This patch is not ready for merge as vector support in the kernel is
> limited. However, the code has been tested in QEMU so the algorithms
> do work. It is written in assembly rather than using the GCC vector
> instrinsics because they did not provide optimal code.
> 
> Signed-off-by: Charlie Jenkins <charlie at rivosinc.com>
> ---
>  arch/riscv/include/asm/checksum.h | 81 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 81 insertions(+)
> 
> diff --git a/arch/riscv/include/asm/checksum.h b/arch/riscv/include/asm/checksum.h
> index af49b3409576..7e31c0ad6346 100644
> --- a/arch/riscv/include/asm/checksum.h
> +++ b/arch/riscv/include/asm/checksum.h
> @@ -10,6 +10,10 @@
>  #include <linux/in6.h>
>  #include <linux/uaccess.h>
>  
> +#ifdef CONFIG_RISCV_ISA_V
> +#include <riscv_vector.h>
> +#endif
> +
>  /* Default version is sufficient for 32 bit */
>  #ifdef CONFIG_64BIT
>  #define _HAVE_ARCH_IPV6_CSUM
> @@ -36,6 +40,46 @@ static inline __sum16 csum_fold(__wsum sum)
>   *	without the bitmanip extensions zba/zbb.
>   */
>  #ifdef CONFIG_32BIT
> +#ifdef CONFIG_RISCV_ISA_V
> +static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
> +{
> +	vuint64m1_t prev_buffer;
> +	vuint32m1_t curr_buffer;
> +	unsigned int vl;
> +	unsigned int high_result;
> +	unsigned int low_result;
> +
> +	asm("vsetivli x0, 1, e64, ta, ma				\n\t\

The same concerns from patch 1 apply here as well. Vector assembly must be gated
behind an alternative section or a call to has_vector(), so the kernel can fall
back to a non-vector implementation at runtime.

You are also missing calls to kernel_vector_begin()/kernel_vector_end(), as
added by [1], which are required to avoid corrupting the user-mode vector
register context.

Regards,
Samuel

[1]: https://lore.kernel.org/linux-riscv/20230721112855.1006-3-andy.chiu@sifive.com/

> +	vmv.v.i %[prev_buffer], 0					\n\t\
> +	1:								\n\t\
> +	vsetvli %[vl], %[ihl], e32, m1, ta, ma				\n\t\
> +	vle32.v %[curr_buffer], (%[iph])				\n\t\
> +	vwredsumu.vs %[prev_buffer], %[curr_buffer], %[prev_buffer]	\n\t\
> +	sub %[ihl], %[ihl], %[vl]					\n\t"
> +#ifdef CONFIG_RISCV_ISA_ZBA
> +	"sh2add %[iph], %[vl], %[iph]					\n\t"
> +#else
> +	"slli %[vl], %[vl], 2						\n\
> +	add %[iph], %[vl], %[iph]					\n\t"
> +#endif
> +	"bnez %[ihl], 1b						\n\
> +	vsetivli x0, 1, e64, m1, ta, ma					\n\
> +	vmv.x.s %[low_result], %[prev_buffer]				\n\
> +	addi %[vl], x0, 32						\n\
> +	vsrl.vx %[prev_buffer], %[prev_buffer], %[vl]			\n\
> +	vmv.x.s %[high_result], %[prev_buffer]"
> +	: [vl] "=&r" (vl), [prev_buffer] "=&vd" (prev_buffer),
> +		[curr_buffer] "=&vd" (curr_buffer),
> +		[high_result] "=&r" (high_result),
> +		[low_result] "=&r" (low_result)
> +	: [iph] "r" (iph), [ihl] "r" (ihl));
> +
> +	high_result += low_result;
> +	high_result += high_result < low_result;
> +	return csum_fold((__force __wsum)(high_result));
> +}
> +
> +#else
>  static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
>  {
>  	__wsum csum = 0;
> @@ -47,8 +91,44 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
>  	} while (++pos < ihl);
>  	return csum_fold(csum);
>  }
> +#endif
> +#else
> +
> +#ifdef CONFIG_RISCV_ISA_V
> +static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
> +{
> +	vuint64m1_t prev_buffer;
> +	vuint32m1_t curr_buffer;
> +	unsigned long vl;
> +	unsigned long result;
> +
> +	asm("vsetivli x0, 1, e64, ta, ma				\n\
> +	vmv.v.i %[prev_buffer], 0					\n\
> +	1:								\n\
> +	# Setup 32-bit sum of iph					\n\
> +	vsetvli %[vl], %[ihl], e32, m1, ta, ma				\n\
> +	vle32.v %[curr_buffer], (%[iph])				\n\
> +	# Sum each 32-bit segment of iph that can fit into a vector reg	\n\
> +	vwredsumu.vs %[prev_buffer], %[curr_buffer], %[prev_buffer]     \n\
> +	subw %[ihl], %[ihl], %[vl]					\n\t"
> +#ifdef CONFIG_RISCV_ISA_ZBA
> +	"sh2add %[iph], %[vl], %[iph]					\n\t"
>  #else
> +	"slli %[vl], %[vl], 2						\n\
> +	addw %[iph], %[vl], %[iph]					\n\t"
> +#endif
> +	"# If not all of iph could fit into vector reg, do another sum	\n\
> +	bnez %[ihl], 1b							\n\
> +	vsetvli x0, x0, e64, m1, ta, ma					\n\
> +	vmv.x.s %[result], %[prev_buffer]"
> +	: [vl] "=&r" (vl), [prev_buffer] "=&vd" (prev_buffer),
> +		[curr_buffer] "=&vd" (curr_buffer), [result] "=&r" (result)
> +	: [iph] "r" (iph), [ihl] "r" (ihl));
>  
> +	result += (result >> 32) | (result << 32);
> +	return csum_fold((__force __wsum)(result >> 32));
> +}
> +#else
>  /*
>   * Quickly compute an IP checksum with the assumption that IPv4 headers will
>   * always be in multiples of 32-bits, and have an ihl of at least 5.
> @@ -74,6 +154,7 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
>  	return csum_fold((__force __wsum)(csum >> 32));
>  }
>  #endif
> +#endif
>  #define ip_fast_csum ip_fast_csum
>  
>  extern unsigned int do_csum(const unsigned char *buff, int len);
>