[PATCH v6 6/9] crypto: arm64/aes-xctr: Improve readability of XCTR and CTR modes
Nathan Huckleberry
nhuck at google.com
Tue May 3 17:18:20 PDT 2022
Add some clarifying comments, change the register allocations to make
the code clearer, and add register aliases.
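
For reference, the two modes differ only in how the per-block counter is
combined with the IV: CTR adds it to the IV as a big-endian integer, while
XCTR XORs it into the IV's low 64 bits as a little-endian integer. A rough,
illustrative C sketch of one counter block per mode follows; the
ctr_block()/xctr_block() helpers are not part of this patch, a little-endian
host is assumed, and the carry into the IV's upper 64 bits (which the
assembly handles explicitly) is ignored here:

  #include <stdint.h>
  #include <string.h>

  /*
   * CTR: counter block i is the IV plus i, treated as a big-endian
   * integer.  Only the low 64 bits are touched here; the assembly
   * handles the carry into the upper half separately.  Assumes a
   * little-endian host, as on typical arm64 Linux.
   */
  static void ctr_block(uint8_t blk[16], const uint8_t iv[16], uint64_t i)
  {
          uint64_t lo;

          memcpy(blk, iv, 16);
          memcpy(&lo, blk + 8, 8);
          lo = __builtin_bswap64(__builtin_bswap64(lo) + i);
          memcpy(blk + 8, &lo, 8);
  }

  /*
   * XCTR: counter block i is the IV XORed with i encoded little-endian
   * (the first keystream block uses i == 1).  Since i fits in 64 bits,
   * only the low 64 bits of the IV change.
   */
  static void xctr_block(uint8_t blk[16], const uint8_t iv[16], uint64_t i)
  {
          uint64_t lo;

          memcpy(blk, iv, 16);
          memcpy(&lo, blk, 8);
          lo ^= i;
          memcpy(blk, &lo, 8);
  }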
Signed-off-by: Nathan Huckleberry <nhuck at google.com>
---
arch/arm64/crypto/aes-modes.S | 193 ++++++++++++++++++++++------------
1 file changed, 128 insertions(+), 65 deletions(-)
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index 55df157fce3a..da7c9f3380f8 100644
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
@@ -322,32 +322,60 @@ AES_FUNC_END(aes_cbc_cts_decrypt)
* This macro generates the code for CTR and XCTR mode.
*/
.macro ctr_encrypt xctr
+ // Arguments
+ OUT .req x0
+ IN .req x1
+ KEY .req x2
+ ROUNDS_W .req w3
+ BYTES_W .req w4
+ IV .req x5
+ BYTE_CTR_W .req w6 // XCTR only
+ // Intermediate values
+ CTR_W .req w11 // XCTR only
+ CTR .req x11 // XCTR only
+ IV_PART .req x12
+ BLOCKS .req x13
+ BLOCKS_W .req w13
+
stp x29, x30, [sp, #-16]!
mov x29, sp
- enc_prepare w3, x2, x12
- ld1 {vctr.16b}, [x5]
+ enc_prepare ROUNDS_W, KEY, IV_PART
+ ld1 {vctr.16b}, [IV]
+ /*
+ * Keep 64 bits of the IV in a register. For CTR mode this lets us
+ * easily increment the IV. For XCTR mode this lets us efficiently XOR
+ * the 64-bit counter with the IV.
+ */
.if \xctr
- umov x12, vctr.d[0]
- lsr w11, w6, #4
+ umov IV_PART, vctr.d[0]
+ lsr CTR_W, BYTE_CTR_W, #4
.else
- umov x12, vctr.d[1] /* keep swabbed ctr in reg */
- rev x12, x12
+ umov IV_PART, vctr.d[1]
+ rev IV_PART, IV_PART
.endif
.LctrloopNx\xctr:
- add w7, w4, #15
- sub w4, w4, #MAX_STRIDE << 4
- lsr w7, w7, #4
+ add BLOCKS_W, BYTES_W, #15
+ sub BYTES_W, BYTES_W, #MAX_STRIDE << 4
+ lsr BLOCKS_W, BLOCKS_W, #4
mov w8, #MAX_STRIDE
- cmp w7, w8
- csel w7, w7, w8, lt
+ cmp BLOCKS_W, w8
+ csel BLOCKS_W, BLOCKS_W, w8, lt
+ /*
+ * Set up the counter values in v0-v4.
+ *
+ * If we are encrypting less than MAX_STRIDE blocks, the tail block
+ * handling code expects the last keystream block to be in v4. For
+ * example: if encrypting two blocks with MAX_STRIDE=5, then v3 and v4
+ * should have the next two counter blocks.
+ */
.if \xctr
- add x11, x11, x7
+ add CTR, CTR, BLOCKS
.else
- adds x12, x12, x7
+ adds IV_PART, IV_PART, BLOCKS
.endif
mov v0.16b, vctr.16b
mov v1.16b, vctr.16b
@@ -355,16 +383,16 @@ AES_FUNC_END(aes_cbc_cts_decrypt)
mov v3.16b, vctr.16b
ST5( mov v4.16b, vctr.16b )
.if \xctr
- sub x6, x11, #MAX_STRIDE - 1
- sub x7, x11, #MAX_STRIDE - 2
- sub x8, x11, #MAX_STRIDE - 3
- sub x9, x11, #MAX_STRIDE - 4
-ST5( sub x10, x11, #MAX_STRIDE - 5 )
- eor x6, x6, x12
- eor x7, x7, x12
- eor x8, x8, x12
- eor x9, x9, x12
- eor x10, x10, x12
+ sub x6, CTR, #MAX_STRIDE - 1
+ sub x7, CTR, #MAX_STRIDE - 2
+ sub x8, CTR, #MAX_STRIDE - 3
+ sub x9, CTR, #MAX_STRIDE - 4
+ST5( sub x10, CTR, #MAX_STRIDE - 5 )
+ eor x6, x6, IV_PART
+ eor x7, x7, IV_PART
+ eor x8, x8, IV_PART
+ eor x9, x9, IV_PART
+ eor x10, x10, IV_PART
mov v0.d[0], x6
mov v1.d[0], x7
mov v2.d[0], x8
@@ -381,9 +409,9 @@ ST5( mov v4.d[0], x10 )
ins vctr.d[0], x8
/* apply carry to N counter blocks for N := x12 */
- cbz x12, 2f
+ cbz IV_PART, 2f
adr x16, 1f
- sub x16, x16, x12, lsl #3
+ sub x16, x16, IV_PART, lsl #3
br x16
bti c
mov v0.d[0], vctr.d[0]
@@ -398,71 +426,88 @@ ST5( mov v4.d[0], vctr.d[0] )
1: b 2f
.previous
-2: rev x7, x12
+2: rev x7, IV_PART
ins vctr.d[1], x7
- sub x7, x12, #MAX_STRIDE - 1
- sub x8, x12, #MAX_STRIDE - 2
- sub x9, x12, #MAX_STRIDE - 3
+ sub x7, IV_PART, #MAX_STRIDE - 1
+ sub x8, IV_PART, #MAX_STRIDE - 2
+ sub x9, IV_PART, #MAX_STRIDE - 3
rev x7, x7
rev x8, x8
mov v1.d[1], x7
rev x9, x9
-ST5( sub x10, x12, #MAX_STRIDE - 4 )
+ST5( sub x10, IV_PART, #MAX_STRIDE - 4 )
mov v2.d[1], x8
ST5( rev x10, x10 )
mov v3.d[1], x9
ST5( mov v4.d[1], x10 )
.endif
- tbnz w4, #31, .Lctrtail\xctr
- ld1 {v5.16b-v7.16b}, [x1], #48
+
+ /*
+ * If there are at least MAX_STRIDE blocks left, XOR the plaintext with
+ * keystream and store. Otherwise jump to tail handling.
+ */
+ tbnz BYTES_W, #31, .Lctrtail\xctr
+ ld1 {v5.16b-v7.16b}, [IN], #48
ST4( bl aes_encrypt_block4x )
ST5( bl aes_encrypt_block5x )
eor v0.16b, v5.16b, v0.16b
-ST4( ld1 {v5.16b}, [x1], #16 )
+ST4( ld1 {v5.16b}, [IN], #16 )
eor v1.16b, v6.16b, v1.16b
-ST5( ld1 {v5.16b-v6.16b}, [x1], #32 )
+ST5( ld1 {v5.16b-v6.16b}, [IN], #32 )
eor v2.16b, v7.16b, v2.16b
eor v3.16b, v5.16b, v3.16b
ST5( eor v4.16b, v6.16b, v4.16b )
- st1 {v0.16b-v3.16b}, [x0], #64
-ST5( st1 {v4.16b}, [x0], #16 )
- cbz w4, .Lctrout\xctr
+ st1 {v0.16b-v3.16b}, [OUT], #64
+ST5( st1 {v4.16b}, [OUT], #16 )
+ cbz BYTES_W, .Lctrout\xctr
b .LctrloopNx\xctr
.Lctrout\xctr:
.if !\xctr
- st1 {vctr.16b}, [x5] /* return next CTR value */
+ st1 {vctr.16b}, [IV] /* return next CTR value */
.endif
ldp x29, x30, [sp], #16
ret
.Lctrtail\xctr:
+ /*
+ * Handle up to MAX_STRIDE * 16 - 1 bytes of plaintext
+ *
+ * This code expects the last keystream block to be in v4. For example:
+ * if encrypting two blocks with MAX_STRIDE=5, then v3 and v4 should
+ * have the next two counter blocks.
+ *
+ * This allows us to store the ciphertext by writing to overlapping
+ * regions of memory. Any invalid ciphertext blocks get overwritten by
+ * correctly computed blocks. This approach avoids extra conditional
+ * branches.
+ */
mov x16, #16
- ands x6, x4, #0xf
- csel x13, x6, x16, ne
+ ands w7, BYTES_W, #0xf
+ csel x13, x7, x16, ne
-ST5( cmp w4, #64 - (MAX_STRIDE << 4) )
+ST5( cmp BYTES_W, #64 - (MAX_STRIDE << 4))
ST5( csel x14, x16, xzr, gt )
- cmp w4, #48 - (MAX_STRIDE << 4)
+ cmp BYTES_W, #48 - (MAX_STRIDE << 4)
csel x15, x16, xzr, gt
- cmp w4, #32 - (MAX_STRIDE << 4)
+ cmp BYTES_W, #32 - (MAX_STRIDE << 4)
csel x16, x16, xzr, gt
- cmp w4, #16 - (MAX_STRIDE << 4)
+ cmp BYTES_W, #16 - (MAX_STRIDE << 4)
- adr_l x12, .Lcts_permute_table
- add x12, x12, x13
+ adr_l x9, .Lcts_permute_table
+ add x9, x9, x13
ble .Lctrtail1x\xctr
-ST5( ld1 {v5.16b}, [x1], x14 )
- ld1 {v6.16b}, [x1], x15
- ld1 {v7.16b}, [x1], x16
+ST5( ld1 {v5.16b}, [IN], x14 )
+ ld1 {v6.16b}, [IN], x15
+ ld1 {v7.16b}, [IN], x16
ST4( bl aes_encrypt_block4x )
ST5( bl aes_encrypt_block5x )
- ld1 {v8.16b}, [x1], x13
- ld1 {v9.16b}, [x1]
- ld1 {v10.16b}, [x12]
+ ld1 {v8.16b}, [IN], x13
+ ld1 {v9.16b}, [IN]
+ ld1 {v10.16b}, [x9]
ST4( eor v6.16b, v6.16b, v0.16b )
ST4( eor v7.16b, v7.16b, v1.16b )
@@ -477,30 +522,48 @@ ST5( eor v7.16b, v7.16b, v2.16b )
ST5( eor v8.16b, v8.16b, v3.16b )
ST5( eor v9.16b, v9.16b, v4.16b )
-ST5( st1 {v5.16b}, [x0], x14 )
- st1 {v6.16b}, [x0], x15
- st1 {v7.16b}, [x0], x16
- add x13, x13, x0
+ST5( st1 {v5.16b}, [OUT], x14 )
+ st1 {v6.16b}, [OUT], x15
+ st1 {v7.16b}, [OUT], x16
+ add x13, x13, OUT
st1 {v9.16b}, [x13] // overlapping stores
- st1 {v8.16b}, [x0]
+ st1 {v8.16b}, [OUT]
b .Lctrout\xctr
.Lctrtail1x\xctr:
- sub x7, x6, #16
- csel x6, x6, x7, eq
- add x1, x1, x6
- add x0, x0, x6
- ld1 {v5.16b}, [x1]
- ld1 {v6.16b}, [x0]
+ /*
+ * Handle <= 16 bytes of plaintext
+ */
+ sub x8, x7, #16
+ csel x7, x7, x8, eq
+ add IN, IN, x7
+ add OUT, OUT, x7
+ ld1 {v5.16b}, [IN]
+ ld1 {v6.16b}, [OUT]
ST5( mov v3.16b, v4.16b )
encrypt_block v3, w3, x2, x8, w7
- ld1 {v10.16b-v11.16b}, [x12]
+ ld1 {v10.16b-v11.16b}, [x9]
tbl v3.16b, {v3.16b}, v10.16b
sshr v11.16b, v11.16b, #7
eor v5.16b, v5.16b, v3.16b
bif v5.16b, v6.16b, v11.16b
- st1 {v5.16b}, [x0]
+ st1 {v5.16b}, [OUT]
b .Lctrout\xctr
+
+ // Arguments
+ .unreq OUT
+ .unreq IN
+ .unreq KEY
+ .unreq ROUNDS_W
+ .unreq BYTES_W
+ .unreq IV
+ .unreq BYTE_CTR_W // XCTR only
+ // Intermediate values
+ .unreq CTR_W // XCTR only
+ .unreq CTR // XCTR only
+ .unreq IV_PART
+ .unreq BLOCKS
+ .unreq BLOCKS_W
.endm
/*
--
2.36.0.464.gb9c8b46e94-goog