[PATCH 2/9] lib/crypto: polyval: Add POLYVAL library
Ard Biesheuvel
ardb at kernel.org
Mon Nov 10 23:42:29 PST 2025
On Mon, 10 Nov 2025 at 16:21, Ard Biesheuvel <ardb at kernel.org> wrote:
>
> Hi,
>
> On Mon, 10 Nov 2025 at 00:49, Eric Biggers <ebiggers at kernel.org> wrote:
> >
> > Add support for POLYVAL to lib/crypto/.
> >
> > This will replace the polyval crypto_shash algorithm and its use in the
> > hctr2 template, simplifying the code and reducing overhead.
> >
> > Specifically, this commit introduces the POLYVAL library API and a
> > generic implementation of it. Later commits will migrate the existing
> > architecture-optimized implementations of POLYVAL into lib/crypto/ and
> > add a KUnit test suite.
> >
> > I've also rewritten the generic implementation completely, using a more
> > modern approach instead of the traditional table-based approach. It's
> > now constant-time, requires no precomputation or dynamic memory
> > allocations, decreases the per-key memory usage from 4096 bytes to 16
> > bytes, and is faster than the old polyval-generic even on bulk data
> > reusing the same key (at least on x86_64, where I measured 15% faster).
> > We should do this for GHASH too, but for now just do it for POLYVAL.
> >
>
> Very nice.
>
> GHASH might suffer on 32-bit, I suppose, but taking this approach at
> least on 64-bit also for GHASH would be a huge improvement.
>
> I had a stab at replacing the int128 arithmetic with
> __builtin_bitreverse64(), but it seems to make little difference (and
> GCC does not support it [yet]). I've tried both arm64 and x86, and the
> perf delta (using your kunit benchmark) is negligible in either case.
Sigh. I intended to apply only the generic patch and the KUnit test, but
ended up applying the whole series, which explains perfectly why the
x86_64 and arm64 performance was unchanged: the generic code wasn't even
being used.
So, trying this again on a Cortex-A72 without the Crypto Extensions, I do
get a ~30% performance improvement with the change below. I haven't
re-tested x86, but given that it does not appear to have a native scalar
bit-reverse instruction (or __builtin_bitreverse64() is broken for it),
there is probably no point in finding out.
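For reference, on arm64 the builtin presumably lowers to a single RBIT
instruction; a GCC-compatible stand-in (untested userspace sketch,
rbit64() being a made-up name, not anything in the tree) would be roughly:

#include <stdint.h>

/* AArch64-only sketch: bit-reverse a 64-bit value using the RBIT instruction */
static inline uint64_t rbit64(uint64_t x)
{
	uint64_t r;

	asm("rbit %0, %1" : "=r" (r) : "r" (x));
	return r;
}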
I'm not saying we should do this for POLYVAL, but it is perhaps
something to keep in mind for gf128mul.c.
--- a/lib/crypto/polyval.c
+++ b/lib/crypto/polyval.c
@@ -42,11 +42,48 @@
* 256-bit => 128-bit reduction algorithm.
*/
-#ifdef CONFIG_ARCH_SUPPORTS_INT128
+#if defined(CONFIG_ARCH_SUPPORTS_INT128) || __has_builtin(__builtin_bitreverse64)
/* Do a 64 x 64 => 128 bit carryless multiplication. */
static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
{
+ u64 a0 = a & 0x1111111111111111;
+ u64 a1 = a & 0x2222222222222222;
+ u64 a2 = a & 0x4444444444444444;
+ u64 a3 = a & 0x8888888888888888;
+
+ u64 b0 = b & 0x1111111111111111;
+ u64 b1 = b & 0x2222222222222222;
+ u64 b2 = b & 0x4444444444444444;
+ u64 b3 = b & 0x8888888888888888;
+
+#if __has_builtin(__builtin_bitreverse64)
+#define brev64 __builtin_bitreverse64
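+
+ /*
+  * Low half: same per-nibble column trick as the int128 path.  The low
+  * 4 bits of @a need no special handling here, since a full column of
+  * 16 one bits can only occur in the top nibble, where the carry
+  * escapes past bit 63 and is discarded by the truncating 64-bit
+  * multiplications.
+  * High half: bit-reverse both operands, take the low 64 bits of their
+  * carryless product, bit-reverse that, and shift right by 1.
+  */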
+ u64 c0 = (a0 * b0) ^ (a1 * b3) ^ (a2 * b2) ^ (a3 * b1);
+ u64 c1 = (a0 * b1) ^ (a1 * b0) ^ (a2 * b3) ^ (a3 * b2);
+ u64 c2 = (a0 * b2) ^ (a1 * b1) ^ (a2 * b0) ^ (a3 * b3);
+ u64 c3 = (a0 * b3) ^ (a1 * b2) ^ (a2 * b1) ^ (a3 * b0);
+
+ a0 = brev64(a0);
+ a1 = brev64(a1);
+ a2 = brev64(a2);
+ a3 = brev64(a3);
+
+ b0 = brev64(b0);
+ b1 = brev64(b1);
+ b2 = brev64(b2);
+ b3 = brev64(b3);
+
+ u64 d0 = (a0 * b0) ^ (a1 * b3) ^ (a2 * b2) ^ (a3 * b1);
+ u64 d1 = (a0 * b1) ^ (a1 * b0) ^ (a2 * b3) ^ (a3 * b2);
+ u64 d2 = (a0 * b2) ^ (a1 * b1) ^ (a2 * b0) ^ (a3 * b3);
+ u64 d3 = (a0 * b3) ^ (a1 * b2) ^ (a2 * b1) ^ (a3 * b0);
+
+ *out_hi = ((brev64(d0) >> 1) & 0x1111111111111111) ^
+ ((brev64(d1) >> 1) & 0x2222222222222222) ^
+ ((brev64(d2) >> 1) & 0x4444444444444444) ^
+ ((brev64(d3) >> 1) & 0x8888888888888888);
+
+ /* the low half comes entirely from the shared c0..c3 terms after the #endif */
+ *out_lo = 0;
+#else
/*
* With 64-bit multiplicands and one term every 4 bits, there would be
* up to 64 / 4 = 16 one bits per column when each multiplication is
@@ -60,15 +97,10 @@ static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
* Instead, mask off 4 bits from one multiplicand, giving a max of 15
* one bits per column. Then handle those 4 bits separately.
*/
- u64 a0 = a & 0x1111111111111110;
- u64 a1 = a & 0x2222222222222220;
- u64 a2 = a & 0x4444444444444440;
- u64 a3 = a & 0x8888888888888880;
-
- u64 b0 = b & 0x1111111111111111;
- u64 b1 = b & 0x2222222222222222;
- u64 b2 = b & 0x4444444444444444;
- u64 b3 = b & 0x8888888888888888;
+ a0 &= ~0xfULL;
+ a1 &= ~0xfULL;
+ a2 &= ~0xfULL;
+ a3 &= ~0xfULL;
/* Multiply the high 60 bits of @a by @b. */
u128 c0 = (a0 * (u128)b0) ^ (a1 * (u128)b3) ^
@@ -85,18 +117,20 @@ static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
u64 e1 = -((a >> 1) & 1) & b;
u64 e2 = -((a >> 2) & 1) & b;
u64 e3 = -((a >> 3) & 1) & b;
- u64 extra_lo = e0 ^ (e1 << 1) ^ (e2 << 2) ^ (e3 << 3);
- u64 extra_hi = (e1 >> 63) ^ (e2 >> 62) ^ (e3 >> 61);
/* Add all the intermediate products together. */
- *out_lo = (((u64)c0) & 0x1111111111111111) ^
- (((u64)c1) & 0x2222222222222222) ^
- (((u64)c2) & 0x4444444444444444) ^
- (((u64)c3) & 0x8888888888888888) ^ extra_lo;
*out_hi = (((u64)(c0 >> 64)) & 0x1111111111111111) ^
(((u64)(c1 >> 64)) & 0x2222222222222222) ^
(((u64)(c2 >> 64)) & 0x4444444444444444) ^
- (((u64)(c3 >> 64)) & 0x8888888888888888) ^ extra_hi;
+ (((u64)(c3 >> 64)) & 0x8888888888888888) ^
+ (e1 >> 63) ^ (e2 >> 62) ^ (e3 >> 61);
+
+ *out_lo = e0 ^ (e1 << 1) ^ (e2 << 2) ^ (e3 << 3);
+#endif
+ *out_lo ^= (((u64)c0) & 0x1111111111111111) ^
+ (((u64)c1) & 0x2222222222222222) ^
+ (((u64)c2) & 0x4444444444444444) ^
+ (((u64)c3) & 0x8888888888888888);
}
#else /* CONFIG_ARCH_SUPPORTS_INT128 */
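
For completeness, here is a small standalone userspace sketch (not kernel
code; brev64() and clmul64_ref() are made-up reference helpers) of the
identity the new branch relies on, namely that the high half of a
64 x 64 carryless multiply equals brev64(low half of
brev64(a) x brev64(b)) >> 1:

#include <stdint.h>
#include <stdio.h>

/* Portable stand-in for __builtin_bitreverse64() */
static uint64_t brev64(uint64_t x)
{
	uint64_t r = 0;

	for (int i = 0; i < 64; i++)
		r |= ((x >> i) & 1) << (63 - i);
	return r;
}

/* Reference bit-by-bit 64 x 64 => 128 bit carryless multiply */
static void clmul64_ref(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
{
	*lo = *hi = 0;
	for (int i = 0; i < 64; i++) {
		if (!((a >> i) & 1))
			continue;
		*lo ^= b << i;
		if (i)
			*hi ^= b >> (64 - i);
	}
}

int main(void)
{
	uint64_t a = 0x0123456789abcdefULL, b = 0xfedcba9876543210ULL;
	uint64_t lo, hi, rlo, rhi;

	clmul64_ref(a, b, &lo, &hi);
	clmul64_ref(brev64(a), brev64(b), &rlo, &rhi);

	/* Both lines should print the same value */
	printf("direct hi:   %016llx\n", (unsigned long long)hi);
	printf("via reverse: %016llx\n",
	       (unsigned long long)(brev64(rlo) >> 1));
	return 0;
}

Compiling this with any hosted C compiler and checking that the two
printed values match should be enough to convince oneself.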