crypto: x86/aes-xts - additional optimizations

Reduce latency by taking advantage of the property vaesenclast(key, a) ^
b == vaesenclast(key ^ b, a), like I did in the AES-GCM code.
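
The identity holds because vaesenclast computes ShiftRows(SubBytes(a)) ^ key,
so a final XOR with b can be folded into the round key ahead of time. A
minimal standalone check of the identity using the AES-NI intrinsics
(arbitrary test constants, build with gcc -maes):

/* check.c: verify vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a) */
#include <immintrin.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Arbitrary test values. */
	__m128i a   = _mm_set_epi32(0x01234567, 0x89abcdef, 0x0f1e2d3c, 0x4b5a6978);
	__m128i key = _mm_set_epi32(0xdeadbeef, 0xfeedface, 0xcafebabe, 0x00112233);
	__m128i b   = _mm_set_epi32(0x55555555, 0xaaaaaaaa, 0x33333333, 0xcccccccc);

	/* vaesenclast(key, a) ^ b */
	__m128i lhs = _mm_xor_si128(_mm_aesenclast_si128(a, key), b);

	/* vaesenclast(key ^ b, a) */
	__m128i rhs = _mm_aesenclast_si128(a, _mm_xor_si128(key, b));

	printf("identity holds: %s\n",
	       memcmp(&lhs, &rhs, 16) == 0 ? "yes" : "no");
	return 0;
}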

Also replace a vpand and vpxor with a vpternlogd.
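
vpternlogd evaluates an arbitrary three-input bit function given by its 8-bit
truth-table immediate; 0x78 encodes a ^ (b & c) with the destination operand
as a, which is exactly the vpand + vpxor pair in a single instruction. A quick
C check of that immediate (assuming the usual dest/src1/src2 bit-index
convention from the SDM):

#include <stdio.h>

/* Bitwise model of vpternlog: for one bit from each operand, the result
 * is bit (a<<2 | b<<1 | c) of the immediate, where a is the destination
 * operand's bit. */
static unsigned ternlog(unsigned imm, unsigned a, unsigned b, unsigned c)
{
	return (imm >> ((a << 2) | (b << 1) | c)) & 1;
}

int main(void)
{
	/* Confirm that immediate 0x78 computes a ^ (b & c), i.e. that one
	 * vpternlogd can replace the vpand + vpxor pair. */
	for (unsigned a = 0; a < 2; a++)
		for (unsigned b = 0; b < 2; b++)
			for (unsigned c = 0; c < 2; c++)
				if (ternlog(0x78, a, b, c) != (a ^ (b & c)))
					return 1;
	printf("0x78 == a ^ (b & c)\n");
	return 0;
}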

On AMD Zen 5 this improves performance by about 3%.  Intel performance
remains about the same, with a 0.1% improvement being seen on Icelake.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
commit 3cd46a78ee (parent 68e95f5c64)
Author:    Eric Biggers
Date:      2024-12-12 13:28:45 -08:00
Committer: Herbert Xu

@@ -235,8 +235,12 @@
 	vpshufd		$0x13, \src, \tmp
 	vpaddq		\src, \src, \dst
 	vpsrad		$31, \tmp, \tmp
+.if USE_AVX10
+	vpternlogd	$0x78, GF_POLY_XMM, \tmp, \dst
+.else
 	vpand		GF_POLY_XMM, \tmp, \tmp
 	vpxor		\tmp, \dst, \dst
+.endif
 .endm
 
 // Given the XTS tweak(s) in the vector \src, compute the next vector of
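
The hunk above is the tweak update: multiplying each 128-bit tweak by x in
GF(2^128) with the reduction polynomial x^128 + x^7 + x^2 + x + 1. A scalar C
sketch of the same computation (the vector code does this per 128-bit lane,
using the vpshufd/vpsrad trick to build the carry masks):

#include <stdint.h>

/* Multiply a 128-bit XTS tweak by x in GF(2^128): shift left by one bit
 * and, if bit 127 was set, XOR the low byte with 0x87.  Scalar
 * equivalent of the vpshufd/vpaddq/vpsrad + mask-XOR sequence above. */
static void gf128_mul_x(uint64_t t[2])
{
	uint64_t carry = t[1] >> 63;		/* old bit 127 */

	t[1] = (t[1] << 1) | (t[0] >> 63);
	t[0] = (t[0] << 1) ^ (carry * 0x87);
}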
@@ -454,84 +458,94 @@
 .endif
 .endm
 
-// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0)
-// on the block(s) in \data using the round key(s) in \key.  The register length
-// determines the number of AES blocks en/decrypted.
-.macro	_vaes	enc, last, key, data
+// Do a single non-last round of AES encryption (if \enc==1) or decryption (if
+// \enc==0) on the block(s) in \data using the round key(s) in \key.  The
+// register length determines the number of AES blocks en/decrypted.
+.macro	_vaes	enc, key, data
 .if \enc
-.if \last
-	vaesenclast	\key, \data, \data
-.else
 	vaesenc		\key, \data, \data
-.endif
 .else
-.if \last
-	vaesdeclast	\key, \data, \data
-.else
 	vaesdec		\key, \data, \data
 .endif
-.endif
 .endm
 
-// Do a single round of AES en/decryption on the block(s) in \data, using the
-// same key for all block(s).  The round key is loaded from the appropriate
-// register or memory location for round \i.  May clobber V4.
-.macro _vaes_1x	enc, last, i, xmm_suffix, data
+// Same as _vaes, but does the last round.
+.macro	_vaeslast	enc, key, data
+.if \enc
+	vaesenclast	\key, \data, \data
+.else
+	vaesdeclast	\key, \data, \data
+.endif
+.endm
+
+// Do a single non-last round of AES en/decryption on the block(s) in \data,
+// using the same key for all block(s).  The round key is loaded from the
+// appropriate register or memory location for round \i.  May clobber \tmp.
+.macro _vaes_1x	enc, i, xmm_suffix, data, tmp
 .if USE_AVX10
-	_vaes	\enc, \last, KEY\i\xmm_suffix, \data
+	_vaes	\enc, KEY\i\xmm_suffix, \data
 .else
 .ifnb \xmm_suffix
-	_vaes	\enc, \last, (\i-7)*16(KEY), \data
+	_vaes	\enc, (\i-7)*16(KEY), \data
 .else
-	_vbroadcast128	(\i-7)*16(KEY), V4
-	_vaes	\enc, \last, V4, \data
+	_vbroadcast128	(\i-7)*16(KEY), \tmp
+	_vaes	\enc, \tmp, \data
 .endif
 .endif
 .endm
 
-// Do a single round of AES en/decryption on the blocks in registers V0-V3,
-// using the same key for all blocks.  The round key is loaded from the
+// Do a single non-last round of AES en/decryption on the blocks in registers
+// V0-V3, using the same key for all blocks.  The round key is loaded from the
 // appropriate register or memory location for round \i.  In addition, does two
 // steps of the computation of the next set of tweaks.  May clobber V4 and V5.
-.macro	_vaes_4x	enc, last, i
+.macro	_vaes_4x	enc, i
 .if USE_AVX10
 	_tweak_step	(2*(\i-5))
-	_vaes	\enc, \last, KEY\i, V0
-	_vaes	\enc, \last, KEY\i, V1
+	_vaes	\enc, KEY\i, V0
+	_vaes	\enc, KEY\i, V1
 	_tweak_step	(2*(\i-5) + 1)
-	_vaes	\enc, \last, KEY\i, V2
-	_vaes	\enc, \last, KEY\i, V3
+	_vaes	\enc, KEY\i, V2
+	_vaes	\enc, KEY\i, V3
 .else
 	_vbroadcast128	(\i-7)*16(KEY), V4
 	_tweak_step	(2*(\i-5))
-	_vaes	\enc, \last, V4, V0
-	_vaes	\enc, \last, V4, V1
+	_vaes	\enc, V4, V0
+	_vaes	\enc, V4, V1
 	_tweak_step	(2*(\i-5) + 1)
-	_vaes	\enc, \last, V4, V2
-	_vaes	\enc, \last, V4, V3
+	_vaes	\enc, V4, V2
+	_vaes	\enc, V4, V3
 .endif
 .endm
 
 // Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt,
 // then XOR with \tweak again) of the block(s) in \data.  To process a single
 // block, use xmm registers and set \xmm_suffix=_XMM.  To process a vector of
-// length VL, use V* registers and leave \xmm_suffix empty.  May clobber V4.
-.macro	_aes_crypt	enc, xmm_suffix, tweak, data
+// length VL, use V* registers and leave \xmm_suffix empty.  Clobbers \tmp.
+.macro	_aes_crypt	enc, xmm_suffix, tweak, data, tmp
 	_xor3		KEY0\xmm_suffix, \tweak, \data
 	cmp		$24, KEYLEN
 	jl		.Laes128\@
 	je		.Laes192\@
-	_vaes_1x	\enc, 0, 1, \xmm_suffix, \data
-	_vaes_1x	\enc, 0, 2, \xmm_suffix, \data
+	_vaes_1x	\enc, 1, \xmm_suffix, \data, tmp=\tmp
+	_vaes_1x	\enc, 2, \xmm_suffix, \data, tmp=\tmp
 .Laes192\@:
-	_vaes_1x	\enc, 0, 3, \xmm_suffix, \data
-	_vaes_1x	\enc, 0, 4, \xmm_suffix, \data
+	_vaes_1x	\enc, 3, \xmm_suffix, \data, tmp=\tmp
+	_vaes_1x	\enc, 4, \xmm_suffix, \data, tmp=\tmp
 .Laes128\@:
 .irp i, 5,6,7,8,9,10,11,12,13
-	_vaes_1x	\enc, 0, \i, \xmm_suffix, \data
+	_vaes_1x	\enc, \i, \xmm_suffix, \data, tmp=\tmp
 .endr
-	_vaes_1x	\enc, 1, 14, \xmm_suffix, \data
-	_vpxor		\tweak, \data, \data
+.if USE_AVX10
+	vpxord		KEY14\xmm_suffix, \tweak, \tmp
+.else
+.ifnb \xmm_suffix
+	vpxor		7*16(KEY), \tweak, \tmp
+.else
+	_vbroadcast128	7*16(KEY), \tmp
+	vpxor		\tweak, \tmp, \tmp
+.endif
+.endif
+	_vaeslast	\enc, \tmp, \data
 .endm
 
 .macro	_aes_xts_crypt	enc
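
The _aes_crypt macro above is the usual XTS xor-encrypt-xor shape. A
self-contained C outline of that shape (the block "cipher" here is a toy
placeholder permutation, not AES, just so the sketch runs standalone):

#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for the AES round sequence (NOT AES -- a placeholder
 * byte permutation so this sketch compiles and runs on its own). */
static void toy_cipher(uint8_t block[16])
{
	uint8_t first = block[0];

	for (int i = 0; i < 15; i++)
		block[i] = (uint8_t)(block[i + 1] + 1);
	block[15] = (uint8_t)(first + 1);
}

/* One block of xor-encrypt-xor: XOR with the tweak, run the cipher,
 * XOR with the tweak again -- the structure _aes_crypt implements. */
static void xex_one_block(const uint8_t tweak[16], uint8_t block[16])
{
	for (int i = 0; i < 16; i++)
		block[i] ^= tweak[i];
	toy_cipher(block);
	for (int i = 0; i < 16; i++)
		block[i] ^= tweak[i];
}

int main(void)
{
	uint8_t tweak[16] = { 1, 2, 3 };
	uint8_t block[16] = { 0 };

	xex_one_block(tweak, block);
	for (int i = 0; i < 16; i++)
		printf("%02x", block[i]);
	printf("\n");
	return 0;
}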
@@ -588,22 +602,43 @@
 	je		.Laes192\@
 	// Do all the AES rounds on the data blocks, interleaved with
 	// the computation of the next set of tweaks.
-	_vaes_4x	\enc, 0, 1
-	_vaes_4x	\enc, 0, 2
+	_vaes_4x	\enc, 1
+	_vaes_4x	\enc, 2
 .Laes192\@:
-	_vaes_4x	\enc, 0, 3
-	_vaes_4x	\enc, 0, 4
+	_vaes_4x	\enc, 3
+	_vaes_4x	\enc, 4
 .Laes128\@:
 .irp i, 5,6,7,8,9,10,11,12,13
-	_vaes_4x	\enc, 0, \i
+	_vaes_4x	\enc, \i
 .endr
-	_vaes_4x	\enc, 1, 14
-
-	// XOR in the tweaks again.
-	_vpxor		TWEAK0, V0, V0
-	_vpxor		TWEAK1, V1, V1
-	_vpxor		TWEAK2, V2, V2
-	_vpxor		TWEAK3, V3, V3
+	// Do the last AES round, then XOR the results with the tweaks again.
+	// Reduce latency by doing the XOR before the vaesenclast, utilizing the
+	// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a)
+	// (and likewise for vaesdeclast).
+.if USE_AVX10
+	_tweak_step	18
+	_tweak_step	19
+	vpxord		TWEAK0, KEY14, V4
+	vpxord		TWEAK1, KEY14, V5
+	_vaeslast	\enc, V4, V0
+	_vaeslast	\enc, V5, V1
+	vpxord		TWEAK2, KEY14, V4
+	vpxord		TWEAK3, KEY14, V5
+	_vaeslast	\enc, V4, V2
+	_vaeslast	\enc, V5, V3
+.else
+	_vbroadcast128	7*16(KEY), V4
+	_tweak_step	18	// uses V5
+	_tweak_step	19	// uses V5
+	vpxor		TWEAK0, V4, V5
+	_vaeslast	\enc, V5, V0
+	vpxor		TWEAK1, V4, V5
+	_vaeslast	\enc, V5, V1
+	vpxor		TWEAK2, V4, V5
+	vpxor		TWEAK3, V4, V4
+	_vaeslast	\enc, V5, V2
+	_vaeslast	\enc, V4, V3
+.endif
 
 	// Store the destination blocks.
 	_vmovdqu	V0, 0*VL(DST)
@@ -640,7 +675,7 @@
 	jl		.Lvec_at_a_time_done\@
 .Lvec_at_a_time\@:
 	_vmovdqu	(SRC), V0
-	_aes_crypt	\enc, , TWEAK0, V0
+	_aes_crypt	\enc, , TWEAK0, V0, tmp=V1
 	_vmovdqu	V0, (DST)
 	_next_tweakvec	TWEAK0, V0, V1, TWEAK0
 	add		$VL, SRC
@@ -657,7 +692,7 @@
 	jl		.Lblock_at_a_time_done\@
 .Lblock_at_a_time\@:
 	vmovdqu		(SRC), %xmm0
-	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0
+	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1
 	vmovdqu		%xmm0, (DST)
 	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK0_XMM
 	add		$16, SRC
@@ -685,7 +720,7 @@
 	// Do it now by advancing the tweak and decrypting the last full block.
 	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK1_XMM
 	vmovdqu		(SRC), %xmm0
-	_aes_crypt	\enc, _XMM, TWEAK1_XMM, %xmm0
+	_aes_crypt	\enc, _XMM, TWEAK1_XMM, %xmm0, tmp=%xmm1
 .endif
 
 .if USE_AVX10
@@ -728,7 +763,7 @@
 	vpblendvb	%xmm3, %xmm0, %xmm1, %xmm0
 .endif
 	// En/decrypt again and store the last full block.
-	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0
+	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1
 	vmovdqu		%xmm0, (DST)
 	jmp		.Ldone\@
 .endm