[PATCH 2/9] lib/crypto: polyval: Add POLYVAL library
Ard Biesheuvel
ardb at kernel.org
Mon Nov 10 23:42:29 PST 2025
On Mon, 10 Nov 2025 at 16:21, Ard Biesheuvel <ardb at kernel.org> wrote:
>
> Hi,
>
> On Mon, 10 Nov 2025 at 00:49, Eric Biggers <ebiggers at kernel.org> wrote:
> >
> > Add support for POLYVAL to lib/crypto/.
> >
> > This will replace the polyval crypto_shash algorithm and its use in the
> > hctr2 template, simplifying the code and reducing overhead.
> >
> > Specifically, this commit introduces the POLYVAL library API and a
> > generic implementation of it. Later commits will migrate the existing
> > architecture-optimized implementations of POLYVAL into lib/crypto/ and
> > add a KUnit test suite.
> >
> > I've also rewritten the generic implementation completely, using a more
> > modern approach instead of the traditional table-based approach. It's
> > now constant-time, requires no precomputation or dynamic memory
> > allocations, decreases the per-key memory usage from 4096 bytes to 16
> > bytes, and is faster than the old polyval-generic even on bulk data
> > reusing the same key (at least on x86_64, where I measured 15% faster).
> > We should do this for GHASH too, but for now just do it for POLYVAL.
> >
>
> Very nice.
>
> GHASH might suffer on 32-bit, I suppose, but taking this approach at
> least on 64-bit also for GHASH would be a huge improvement.
>
> I had a stab at replacing the int128 arithmetic with
> __builtin_bitreverse64(), but it seems to make little difference (and
> GCC does not support it [yet]). I've tried both arm64 and x86, and the
> perf delta (using your kunit benchmark) is negligible in either case.
Sigh. I intended to apply only the generic patch and the KUnit test, but
ended up applying the whole series, which explains perfectly why the
x86_64 and arm64 performance was unchanged: the generic code wasn't even
being used.
So, trying this again on a Cortex-A72 without the Crypto Extensions, I do
get a ~30% performance improvement with the change below. I haven't
re-tested x86, but given that it does not appear to have a native scalar
bit-reverse instruction (or __builtin_bitreverse64() is broken for it),
there is probably no point in finding out.
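For reference, on arm64 the builtin presumably lowers to a single RBIT
instruction; a GCC-compatible stand-in (untested userspace sketch,
rbit64() being a made-up name, not anything in the tree) would be roughly:

#include <stdint.h>

/* AArch64-only sketch: bit-reverse a 64-bit value using the RBIT instruction */
static inline uint64_t rbit64(uint64_t x)
{
	uint64_t r;

	asm("rbit %0, %1" : "=r" (r) : "r" (x));
	return r;
}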
I'm not saying we should do this for POLYVAL, but it is perhaps
something to keep in mind for gf128mul.c.
--- a/lib/crypto/polyval.c
+++ b/lib/crypto/polyval.c
@@ -42,11 +42,48 @@
* 256-bit => 128-bit reduction algorithm.
*/
-#ifdef CONFIG_ARCH_SUPPORTS_INT128
+#if defined(CONFIG_ARCH_SUPPORTS_INT128) || __has_builtin(__builtin_bitreverse64)
/* Do a 64 x 64 => 128 bit carryless multiplication. */
static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
{
+ u64 a0 = a & 0x1111111111111111;
+ u64 a1 = a & 0x2222222222222222;
+ u64 a2 = a & 0x4444444444444444;
+ u64 a3 = a & 0x8888888888888888;
+
+ u64 b0 = b & 0x1111111111111111;
+ u64 b1 = b & 0x2222222222222222;
+ u64 b2 = b & 0x4444444444444444;
+ u64 b3 = b & 0x8888888888888888;
+
+#if __has_builtin(__builtin_bitreverse64)
+#define brev64 __builtin_bitreverse64
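+
+ /*
+  * Low half: same per-nibble column trick as the int128 path.  The low
+  * 4 bits of @a need no special handling here, since a full column of
+  * 16 one bits can only occur in the top nibble, where the carry
+  * escapes past bit 63 and is discarded by the truncating 64-bit
+  * multiplications.
+  * High half: bit-reverse both operands, take the low 64 bits of their
+  * carryless product, bit-reverse that, and shift right by 1.
+  */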
+ u64 c0 = (a0 * b0) ^ (a1 * b3) ^ (a2 * b2) ^ (a3 * b1);
+ u64 c1 = (a0 * b1) ^ (a1 * b0) ^ (a2 * b3) ^ (a3 * b2);
+ u64 c2 = (a0 * b2) ^ (a1 * b1) ^ (a2 * b0) ^ (a3 * b3);
+ u64 c3 = (a0 * b3) ^ (a1 * b2) ^ (a2 * b1) ^ (a3 * b0);
+
+ a0 = brev64(a0);
+ a1 = brev64(a1);
+ a2 = brev64(a2);
+ a3 = brev64(a3);
+
+ b0 = brev64(b0);
+ b1 = brev64(b1);
+ b2 = brev64(b2);
+ b3 = brev64(b3);
+
+ u64 d0 = (a0 * b0) ^ (a1 * b3) ^ (a2 * b2) ^ (a3 * b1);
+ u64 d1 = (a0 * b1) ^ (a1 * b0) ^ (a2 * b3) ^ (a3 * b2);
+ u64 d2 = (a0 * b2) ^ (a1 * b1) ^ (a2 * b0) ^ (a3 * b3);
+ u64 d3 = (a0 * b3) ^ (a1 * b2) ^ (a2 * b1) ^ (a3 * b0);
+
+ *out_hi = ((brev64(d0) >> 1) & 0x1111111111111111) ^
+ ((brev64(d1) >> 1) & 0x2222222222222222) ^
+ ((brev64(d2) >> 1) & 0x4444444444444444) ^
+ ((brev64(d3) >> 1) & 0x8888888888888888);
+
+ /* the low half comes entirely from the shared c0..c3 terms after the #endif */
+ *out_lo = 0;
+#else
/*
* With 64-bit multiplicands and one term every 4 bits, there would be
* up to 64 / 4 = 16 one bits per column when each multiplication is
@@ -60,15 +97,10 @@ static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
* Instead, mask off 4 bits from one multiplicand, giving a max of 15
* one bits per column. Then handle those 4 bits separately.
*/
- u64 a0 = a & 0x1111111111111110;
- u64 a1 = a & 0x2222222222222220;
- u64 a2 = a & 0x4444444444444440;
- u64 a3 = a & 0x8888888888888880;
-
- u64 b0 = b & 0x1111111111111111;
- u64 b1 = b & 0x2222222222222222;
- u64 b2 = b & 0x4444444444444444;
- u64 b3 = b & 0x8888888888888888;
+ a0 &= ~0xfULL;
+ a1 &= ~0xfULL;
+ a2 &= ~0xfULL;
+ a3 &= ~0xfULL;
/* Multiply the high 60 bits of @a by @b. */
u128 c0 = (a0 * (u128)b0) ^ (a1 * (u128)b3) ^
@@ -85,18 +117,20 @@ static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
u64 e1 = -((a >> 1) & 1) & b;
u64 e2 = -((a >> 2) & 1) & b;
u64 e3 = -((a >> 3) & 1) & b;
- u64 extra_lo = e0 ^ (e1 << 1) ^ (e2 << 2) ^ (e3 << 3);
- u64 extra_hi = (e1 >> 63) ^ (e2 >> 62) ^ (e3 >> 61);
/* Add all the intermediate products together. */
- *out_lo = (((u64)c0) & 0x1111111111111111) ^
- (((u64)c1) & 0x2222222222222222) ^
- (((u64)c2) & 0x4444444444444444) ^
- (((u64)c3) & 0x8888888888888888) ^ extra_lo;
*out_hi = (((u64)(c0 >> 64)) & 0x1111111111111111) ^
(((u64)(c1 >> 64)) & 0x2222222222222222) ^
(((u64)(c2 >> 64)) & 0x4444444444444444) ^
- (((u64)(c3 >> 64)) & 0x8888888888888888) ^ extra_hi;
+ (((u64)(c3 >> 64)) & 0x8888888888888888) ^
+ (e1 >> 63) ^ (e2 >> 62) ^ (e3 >> 61);
+
+ *out_lo = e0 ^ (e1 << 1) ^ (e2 << 2) ^ (e3 << 3);
+#endif
+ *out_lo ^= (((u64)c0) & 0x1111111111111111) ^
+ (((u64)c1) & 0x2222222222222222) ^
+ (((u64)c2) & 0x4444444444444444) ^
+ (((u64)c3) & 0x8888888888888888);
}
#else /* CONFIG_ARCH_SUPPORTS_INT128 */
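
For completeness, here is a small standalone userspace sketch (not kernel
code; brev64() and clmul64_ref() are made-up reference helpers) of the
identity the new branch relies on, namely that the high half of a
64 x 64 carryless multiply equals brev64(low half of
brev64(a) x brev64(b)) >> 1:

#include <stdint.h>
#include <stdio.h>

/* Portable stand-in for __builtin_bitreverse64() */
static uint64_t brev64(uint64_t x)
{
	uint64_t r = 0;

	for (int i = 0; i < 64; i++)
		r |= ((x >> i) & 1) << (63 - i);
	return r;
}

/* Reference bit-by-bit 64 x 64 => 128 bit carryless multiply */
static void clmul64_ref(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
{
	*lo = *hi = 0;
	for (int i = 0; i < 64; i++) {
		if (!((a >> i) & 1))
			continue;
		*lo ^= b << i;
		if (i)
			*hi ^= b >> (64 - i);
	}
}

int main(void)
{
	uint64_t a = 0x0123456789abcdefULL, b = 0xfedcba9876543210ULL;
	uint64_t lo, hi, rlo, rhi;

	clmul64_ref(a, b, &lo, &hi);
	clmul64_ref(brev64(a), brev64(b), &rlo, &rhi);

	/* Both lines should print the same value */
	printf("direct hi:   %016llx\n", (unsigned long long)hi);
	printf("via reverse: %016llx\n",
	       (unsigned long long)(brev64(rlo) >> 1));
	return 0;
}

Compiling this with any hosted C compiler and checking that the two
printed values match should be enough to convince oneself.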