[PATCH v4 2/3] riscv: introduce percpu.h into include/asm
bot+bpf-ci at kernel.org
Tue May 5 00:05:57 PDT 2026
> diff --git a/arch/riscv/include/asm/percpu.h b/arch/riscv/include/asm/percpu.h
> new file mode 100644
> index 000000000000..3b26fe45e70f
> --- /dev/null
> +++ b/arch/riscv/include/asm/percpu.h
[ ... ]
> @@ -0,0 +1,280 @@
[ ... ]
> +#ifdef CONFIG_64BIT
> +#define PERCPU_OP(name, amo_insn) \
> + __PERCPU_AMO_OP_CASE(.w, name, 32, amo_insn) \
> + __PERCPU_AMO_OP_CASE(.d, name, 64, amo_insn)
> +#else
> +#define PERCPU_OP(name, amo_insn) \
> + __PERCPU_AMO_OP_CASE(.w, name, 32, amo_insn)
> +#endif
> +
> +PERCPU_OP(add, add)
> +PERCPU_OP(andnot, and)
> +PERCPU_OP(or, or)
[ ... ]
> +#define PERCPU_8_16_OP(name, amo_insn, sz, sfx, val_type, new_val_expr, asm_op) \
> +static inline void __percpu_##name##_amo_case_##sz(void *ptr, unsigned long val) \
> +{ \
> + if (IS_ENABLED(CONFIG_RISCV_ISA_ZABHA) && \
> + riscv_has_extension_unlikely(RISCV_ISA_EXT_ZABHA)) { \
> + asm volatile ("amo" #amo_insn #sfx " zero, %[val], %[ptr]" \
> + : [ptr] "+A"(*(val_type *)ptr) \
> + : [val] "r"((val_type)((new_val_expr) & PERCPU_8_16_GET_MASK(sz))) \
> + : "memory"); \
> + } else { \
> + u32 *ptr32 = PERCPU_8_16_GET_PTR32(ptr); \
> + const unsigned long shift = PERCPU_8_16_GET_SHIFT(ptr); \
> + const u32 mask = PERCPU_8_16_GET_MASK(sz) << shift; \
> + const val_type val_trunc = (val_type)((new_val_expr) \
> + & PERCPU_8_16_GET_MASK(sz)); \
> + u32 retx, rc; \
> + val_type new_val_type; \
> + \
> + asm volatile ( \
> + "0: lr.w %0, %2\n" \
> + "and %3, %0, %4\n" \
> + "srl %3, %3, %5\n" \
> + #asm_op " %3, %3, %6\n" \
> + "sll %3, %3, %5\n" \
> + "and %1, %0, %7\n" \
> + "or %1, %1, %3\n" \
> + "sc.w %1, %1, %2\n" \
> + "bnez %1, 0b\n" \
> + : "=&r"(retx), "=&r"(rc), "+A"(*ptr32), "=&r"(new_val_type) \
> + : "r"(mask), "r"(shift), "r"(val_trunc), "r"(~mask) \
> + : "memory"); \
> + } \
> +}
Can the 8/16-bit LR/SC fallback for add operations corrupt adjacent
percpu data?
In the non-ZABHA path above, after isolating the subword (the and/srl
pair), the code applies #asm_op and immediately shifts the result back
with sll. For the add instantiation:
srl %3, %3, %5 ; new_val >>= shift
add %3, %3, %6 ; new_val += val_trunc (may overflow 8/16 bits)
sll %3, %3, %5 ; new_val <<= shift
There's no masking between add and sll, so a carry-out from u8 (bit 8)
or u16 (bit 16) survives the shift. The result is then ORed into the
read-modify-write word:
and %1, %0, %7 ; rc = retx & ~mask (clears target subword only)
or %1, %1, %3 ; rc |= new_val (overflow bit enters adjacent byte)
Example: a u8 percpu variable at offset 0 holds 0x80; calling
this_cpu_add() with 0x80 wraps the u8 to 0 (correct), but the carry into
bit 8 corrupts the adjacent byte at offset 1 of the same u32.
The sibling add_return helper (PERCPU_8_16_RET_OP) includes the missing
truncation with "and %1, %1, %7" using PERCPU_8_16_GET_MASK(sz) before
the shift-back. Why doesn't PERCPU_8_16_OP do the same?
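
For illustration only, here is a small host-side model (plain C, not the
kernel code; rmw_add8_no_mask() is a made-up name) of the non-ZABHA
sequence for an 8-bit add, showing the carry escaping into the
neighbouring byte when the sum is not masked before the shift-back:

	#include <stdint.h>
	#include <stdio.h>

	/* Models the LR/SC body for an 8-bit add at byte offset 0 of a u32. */
	static uint32_t rmw_add8_no_mask(uint32_t word, uint8_t addend)
	{
		const unsigned long shift = 0;
		const uint32_t mask = 0xffu << shift;

		uint32_t sub = (word & mask) >> shift;	/* and + srl */
		sub += addend;				/* add: may carry into bit 8 */
		/* missing here: sub &= 0xff; */
		sub <<= shift;				/* sll */
		return (word & ~mask) | sub;		/* and + or */
	}

	int main(void)
	{
		/* byte 0 holds 0x80, byte 1 holds 0x00 */
		uint32_t word = 0x00000080u;

		word = rmw_add8_no_mask(word, 0x80);
		/* prints 0x00000100: byte 0 wrapped to 0, byte 1 corrupted to 0x01 */
		printf("0x%08x\n", word);
		return 0;
	}

Run on a host, this prints 0x00000100: the target byte wrapped to 0 as
expected, but the adjacent byte now reads 0x01.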
> +
> +#define PERCPU_OP_8_16(op_name, op, expr, final_op) \
> + PERCPU_8_16_OP(op_name, op, 8, .b, u8, expr, final_op); \
> + PERCPU_8_16_OP(op_name, op, 16, .h, u16, expr, final_op)
> +
> +PERCPU_OP_8_16(add, add, val, add)
> +PERCPU_OP_8_16(andnot, and, ~(val), and)
> +PERCPU_OP_8_16(or, or, val, or)
[ ... ]
> +#define PERCPU_8_16_RET_OP(name, amo_insn, sz, sfx, val_type, new_val_expr) \
> +static inline val_type __percpu_##name##_return_amo_case_##sz(void *ptr, unsigned long val) \
> +{ \
> + if (IS_ENABLED(CONFIG_RISCV_ISA_ZABHA) && \
> + riscv_has_extension_unlikely(RISCV_ISA_EXT_ZABHA)) { \
> + register val_type ret; \
> + asm volatile ("amo" #amo_insn #sfx " %[ret], %[val], %[ptr]" \
> + : [ptr] "+A"(*(val_type *)ptr), [ret] "=r"(ret) \
> + : [val] "r"((val_type)((new_val_expr) & PERCPU_8_16_GET_MASK(sz))) \
> + : "memory"); \
> + return ret + (val_type)((new_val_expr) & PERCPU_8_16_GET_MASK(sz)); \
> + } else { \
> + u32 *ptr32 = PERCPU_8_16_GET_PTR32(ptr); \
> + const unsigned long shift = PERCPU_8_16_GET_SHIFT(ptr); \
> + const u32 mask = (PERCPU_8_16_GET_MASK(sz) << shift); \
> + const u32 inv_mask = ~mask; \
> + const val_type val_trunc = (val_type)((new_val_expr) \
> + & PERCPU_8_16_GET_MASK(sz)); \
> + u32 old, new, tmp; \
> + \
> + asm volatile ( \
> + "0: lr.w %0, %3\n" \
> + "and %1, %0, %4\n" \
> + "srl %1, %1, %5\n" \
> + "add %1, %1, %6\n" \
> + "and %1, %1, %7\n" \
> + "sll %1, %1, %5\n" \
> + "and %2, %0, %8\n" \
> + "or %2, %2, %1\n" \
> + "sc.w %2, %2, %3\n" \
> + "bnez %2, 0b\n" \
> + : "=r"(old), "=r"(tmp), "=&r"(new), "+A"(*ptr32) \
> + : "r"(mask), "r"(shift), "r"(val_trunc), "r"(PERCPU_8_16_GET_MASK(sz)), \
> + "r"(inv_mask) \
> + : "memory"); \
> + return (val_type)(tmp >> shift); \
> + } \
> +}
Does the LR/SC fallback need early-clobber qualifiers on the output
registers?
The inline asm writes "old" (via lr.w) and "tmp" (via the first and)
before reading all input registers (%4..%8: mask, shift, val_trunc,
PERCPU_8_16_GET_MASK(sz),
inv_mask). The GCC inline-asm contract requires "=&r" (early-clobber)
when an output is written before all inputs are consumed. Without it,
the compiler may allocate the same physical register for "old" or "tmp"
and one of the inputs, clobbering that input.
The sibling PERCPU_8_16_OP uses "=&r"(retx), "=&r"(rc),
"=&r"(new_val_type). The equivalent __arch_xchg_masked helper in
arch/riscv/include/asm/cmpxchg.h also uses "=&r" for outputs written
early.
Should "old" and "tmp" be declared "=&r" instead of "=r"?
> +
> +PERCPU_8_16_RET_OP(add, add, 8, .b, u8, val)
> +PERCPU_8_16_RET_OP(add, add, 16, .h, u16, val)
[ ... ]
> +#define this_cpu_and_1(pcp, val) _pcp_protect(__percpu_andnot_amo_case_8, pcp, ~(val))
> +#define this_cpu_and_2(pcp, val) _pcp_protect(__percpu_andnot_amo_case_16, pcp, ~(val))
> +#define this_cpu_and_4(pcp, val) _pcp_protect(__percpu_andnot_amo_case_32, pcp, ~(val))
> +
> +#ifdef CONFIG_64BIT
> +#define this_cpu_and_8(pcp, val) _pcp_protect(__percpu_andnot_amo_case_64, pcp, ~(val))
> +#endif
Do this_cpu_and_4() and this_cpu_and_8() compute the wrong result?
The macros pass ~(val) to __percpu_andnot_amo_case_32/64, which
are generated by PERCPU_OP(andnot, and). That expands to
__PERCPU_AMO_OP_CASE(.w/.d, andnot, 32/64, and) — the asm body is
"amoand.w/d zero, %[val], %[ptr]", which performs *ptr &= val (plain
AND).
With the caller passing ~val, the final operation becomes *ptr & ~val,
which masks with the complement of val instead of computing the expected
*ptr & val. Looking at the arm64
pattern (arch/arm64/include/asm/percpu.h), PERCPU_OP(andnot, bic, ldclr)
uses the BIC instruction that provides the inversion: bic performs
*ptr & ~val, so calling with ~val yields *ptr & ~(~val) = *ptr & val.
RISC-V's amoand has no equivalent inversion.
Example: a u32 percpu initialized to 0xFFFFFFFF with
this_cpu_and(var, 0x1) should produce 0x1 but yields 0xFFFFFFFE instead.
The 8/16-bit helpers happen to be correct because PERCPU_OP_8_16(andnot,
and, ~(val), and) feeds new_val_expr = ~(val), producing a second
inversion in the macro body that cancels out. Only the 32/64-bit paths
are broken.
Should PERCPU_OP(andnot, and) be replaced with a generator that does
explicit inversion, or should the this_cpu_and_4/8 callers pass val
without negation?
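
A quick host-side sanity check (plain C, not kernel code; amoand_w()
below is just a stand-in model for the AMO, not a real helper)
reproduces the mismatch when ~val is fed to a plain AND:

	#include <stdint.h>
	#include <stdio.h>

	/* Stand-in for "amoand.w zero, val, (ptr)": *ptr &= val. */
	static void amoand_w(uint32_t *ptr, uint32_t val)
	{
		*ptr &= val;	/* no built-in inversion, unlike arm64 BIC */
	}

	int main(void)
	{
		uint32_t pcp = 0xffffffffu;
		uint32_t val = 0x1u;

		amoand_w(&pcp, ~val);	/* what this_cpu_and_4(pcp, 0x1) does today */
		printf("got 0x%08x, expected 0x%08x\n", pcp, 0xffffffffu & val);
		return 0;
	}

This prints "got 0xfffffffe, expected 0x00000001", matching the example
above.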
[ ... ]
---
AI reviewed your patch. Please fix the bug or reply by email explaining why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
CI run summary: https://github.com/kernel-patches/bpf/actions/runs/25361792504