Merge tag 'libcrypto-updates-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux

Pull crypto library updates from Eric Biggers:
 "This is the main crypto library pull request for 6.19. It includes:

   - Add SHA-3 support to lib/crypto/, including support for both the
     hash functions and the extendable-output functions. Reimplement the
     existing SHA-3 crypto_shash support on top of the library.

     This is motivated mainly by the upcoming support for the ML-DSA
     signature algorithm, which needs the SHAKE128 and SHAKE256
     functions. But even on its own it's a useful cleanup.

     This also fixes the longstanding issue where the
     architecture-optimized SHA-3 code was disabled by default.

   - Add BLAKE2b support to lib/crypto/, and reimplement the existing
     BLAKE2b crypto_shash support on top of the library.

     This is motivated mainly by btrfs, which supports BLAKE2b
     checksums. With this change, all btrfs checksum algorithms now have
     library APIs. btrfs is planned to start using the library directly
     (a usage sketch follows this summary).

     This refactor also improves consistency between the BLAKE2b code
     and BLAKE2s code. And as usual, it also fixes the issue where the
     architecture-optimized BLAKE2b code was disabled by default.

   - Add POLYVAL support to lib/crypto/, replacing the existing POLYVAL
     support in crypto_shash. Reimplement HCTR2 on top of the library.

     This simplifies the code and improves HCTR2 performance. As usual,
     it also enables the architecture-optimized code by default. The
     generic implementation of POLYVAL is greatly improved as well.

   - Clean up the BLAKE2s code

   - Add FIPS self-tests for SHA-1, SHA-2, and SHA-3"
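
A minimal sketch of the library-style calling convention these additions provide,
using the BLAKE2b functions introduced in lib/crypto/blake2b.c further down.
blake2b_update() and blake2b_final() appear in this diff; the blake2b_init() call
and the 32-byte digest length are assumptions made here for illustration only.

	#include <crypto/blake2b.h>

	/* Hash a buffer with unkeyed BLAKE2b-256 via the new library API. */
	static void example_blake2b_256(const u8 *data, size_t len, u8 digest[32])
	{
		struct blake2b_ctx ctx;

		blake2b_init(&ctx, 32);          /* assumed initializer, not shown in this diff */
		blake2b_update(&ctx, data, len);
		blake2b_final(&ctx, digest);     /* final also wipes the ctx */
	}

The SHA-3/SHAKE and POLYVAL libraries follow the same ctx/update/final shape (see
lib/crypto/polyval.c below), which is what lets HCTR2 now, and btrfs and ML-DSA later,
call the code directly instead of going through crypto_shash.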

* tag 'libcrypto-updates-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux: (37 commits)
  fscrypt: Drop obsolete recommendation to enable optimized POLYVAL
  crypto: polyval - Remove the polyval crypto_shash
  crypto: hctr2 - Convert to use POLYVAL library
  lib/crypto: x86/polyval: Migrate optimized code into library
  lib/crypto: arm64/polyval: Migrate optimized code into library
  lib/crypto: polyval: Add POLYVAL library
  crypto: polyval - Rename conflicting functions
  lib/crypto: x86/blake2s: Use vpternlogd for 3-input XORs
  lib/crypto: x86/blake2s: Avoid writing back unchanged 'f' value
  lib/crypto: x86/blake2s: Improve readability
  lib/crypto: x86/blake2s: Use local labels for data
  lib/crypto: x86/blake2s: Drop check for nblocks == 0
  lib/crypto: x86/blake2s: Fix 32-bit arg treated as 64-bit
  lib/crypto: arm, arm64: Drop filenames from file comments
  lib/crypto: arm/blake2s: Fix some comments
  crypto: s390/sha3 - Remove superseded SHA-3 code
  crypto: sha3 - Reimplement using library API
  crypto: jitterentropy - Use default sha3 implementation
  lib/crypto: s390/sha3: Add optimized one-shot SHA-3 digest functions
  lib/crypto: sha3: Support arch overrides of one-shot digest functions
  ...
Merged by Linus Torvalds on 2025-12-02 18:01:03 -08:00
72 changed files with 3074 additions and 2526 deletions


@@ -28,6 +28,17 @@ config CRYPTO_LIB_ARC4
config CRYPTO_LIB_GF128MUL
tristate
config CRYPTO_LIB_BLAKE2B
tristate
help
The BLAKE2b library functions. Select this if your module uses any of
the functions from <crypto/blake2b.h>.
config CRYPTO_LIB_BLAKE2B_ARCH
bool
depends on CRYPTO_LIB_BLAKE2B && !UML
default y if ARM && KERNEL_MODE_NEON
# BLAKE2s support is always built-in, so there's no CRYPTO_LIB_BLAKE2S option.
config CRYPTO_LIB_BLAKE2S_ARCH
@@ -124,6 +135,18 @@ config CRYPTO_LIB_POLY1305_RSIZE
default 9 if ARM || ARM64
default 1
config CRYPTO_LIB_POLYVAL
tristate
help
The POLYVAL library functions. Select this if your module uses any of
the functions from <crypto/polyval.h>.
config CRYPTO_LIB_POLYVAL_ARCH
bool
depends on CRYPTO_LIB_POLYVAL && !UML
default y if ARM64 && KERNEL_MODE_NEON
default y if X86_64
config CRYPTO_LIB_CHACHA20POLY1305
tristate
select CRYPTO_LIB_CHACHA
@@ -184,6 +207,19 @@ config CRYPTO_LIB_SHA512_ARCH
default y if SPARC64
default y if X86_64
config CRYPTO_LIB_SHA3
tristate
select CRYPTO_LIB_UTILS
help
The SHA3 library functions. Select this if your module uses any of
the functions from <crypto/sha3.h>.
config CRYPTO_LIB_SHA3_ARCH
bool
depends on CRYPTO_LIB_SHA3 && !UML
default y if ARM64 && KERNEL_MODE_NEON
default y if S390
config CRYPTO_LIB_SM3
tristate


@@ -31,6 +31,16 @@ obj-$(CONFIG_CRYPTO_LIB_GF128MUL) += gf128mul.o
################################################################################
obj-$(CONFIG_CRYPTO_LIB_BLAKE2B) += libblake2b.o
libblake2b-y := blake2b.o
CFLAGS_blake2b.o := -Wframe-larger-than=4096 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105930
ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2B_ARCH),y)
CFLAGS_blake2b.o += -I$(src)/$(SRCARCH)
libblake2b-$(CONFIG_ARM) += arm/blake2b-neon-core.o
endif # CONFIG_CRYPTO_LIB_BLAKE2B_ARCH
################################################################################
# blake2s is used by the /dev/random driver which is always builtin
obj-y += blake2s.o
ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2S_ARCH),y)
@@ -188,6 +198,16 @@ clean-files += arm/poly1305-core.S \
################################################################################
obj-$(CONFIG_CRYPTO_LIB_POLYVAL) += libpolyval.o
libpolyval-y := polyval.o
ifeq ($(CONFIG_CRYPTO_LIB_POLYVAL_ARCH),y)
CFLAGS_polyval.o += -I$(src)/$(SRCARCH)
libpolyval-$(CONFIG_ARM64) += arm64/polyval-ce-core.o
libpolyval-$(CONFIG_X86) += x86/polyval-pclmul-avx.o
endif
################################################################################
obj-$(CONFIG_CRYPTO_LIB_SHA1) += libsha1.o
libsha1-y := sha1.o
ifeq ($(CONFIG_CRYPTO_LIB_SHA1_ARCH),y)
@@ -268,6 +288,16 @@ endif # CONFIG_CRYPTO_LIB_SHA512_ARCH
################################################################################
obj-$(CONFIG_CRYPTO_LIB_SHA3) += libsha3.o
libsha3-y := sha3.o
ifeq ($(CONFIG_CRYPTO_LIB_SHA3_ARCH),y)
CFLAGS_sha3.o += -I$(src)/$(SRCARCH)
libsha3-$(CONFIG_ARM64) += arm64/sha3-ce-core.o
endif # CONFIG_CRYPTO_LIB_SHA3_ARCH
################################################################################
obj-$(CONFIG_MPILIB) += mpi/
obj-$(CONFIG_CRYPTO_SELFTESTS_FULL) += simd.o


@@ -0,0 +1,350 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* BLAKE2b digest algorithm optimized with ARM NEON instructions. On ARM
* processors that have NEON support but not the ARMv8 Crypto Extensions,
* typically this BLAKE2b implementation is much faster than the SHA-2 family
* and slightly faster than SHA-1.
*
* Copyright 2020 Google LLC
*
* Author: Eric Biggers <ebiggers@google.com>
*/
#include <linux/linkage.h>
.text
.fpu neon
// The arguments to blake2b_compress_neon()
CTX .req r0
DATA .req r1
NBLOCKS .req r2
INC .req r3
// Pointers to the rotation tables
ROR24_TABLE .req r4
ROR16_TABLE .req r5
// The original stack pointer
ORIG_SP .req r6
// NEON registers which contain the message words of the current block.
// M_0-M_3 are occasionally used for other purposes too.
M_0 .req d16
M_1 .req d17
M_2 .req d18
M_3 .req d19
M_4 .req d20
M_5 .req d21
M_6 .req d22
M_7 .req d23
M_8 .req d24
M_9 .req d25
M_10 .req d26
M_11 .req d27
M_12 .req d28
M_13 .req d29
M_14 .req d30
M_15 .req d31
.align 4
// Tables for computing ror64(x, 24) and ror64(x, 16) using the vtbl.8
// instruction. This is the most efficient way to implement these
// rotation amounts with NEON. (On Cortex-A53 it's the same speed as
// vshr.u64 + vsli.u64, while on Cortex-A7 it's faster.)
.Lror24_table:
.byte 3, 4, 5, 6, 7, 0, 1, 2
.Lror16_table:
.byte 2, 3, 4, 5, 6, 7, 0, 1
// The BLAKE2b initialization vector
.Lblake2b_IV:
.quad 0x6a09e667f3bcc908, 0xbb67ae8584caa73b
.quad 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1
.quad 0x510e527fade682d1, 0x9b05688c2b3e6c1f
.quad 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
// Execute one round of BLAKE2b by updating the state matrix v[0..15] in the
// NEON registers q0-q7. The message block is in q8..q15 (M_0-M_15). The stack
// pointer points to a 32-byte aligned buffer containing a copy of q8 and q9
// (M_0-M_3), so that they can be reloaded if they are used as temporary
// registers. The macro arguments s0-s15 give the order in which the message
// words are used in this round. 'final' is 1 if this is the final round.
.macro _blake2b_round s0, s1, s2, s3, s4, s5, s6, s7, \
s8, s9, s10, s11, s12, s13, s14, s15, final=0
// Mix the columns:
// (v[0], v[4], v[8], v[12]), (v[1], v[5], v[9], v[13]),
// (v[2], v[6], v[10], v[14]), and (v[3], v[7], v[11], v[15]).
// a += b + m[blake2b_sigma[r][2*i + 0]];
vadd.u64 q0, q0, q2
vadd.u64 q1, q1, q3
vadd.u64 d0, d0, M_\s0
vadd.u64 d1, d1, M_\s2
vadd.u64 d2, d2, M_\s4
vadd.u64 d3, d3, M_\s6
// d = ror64(d ^ a, 32);
veor q6, q6, q0
veor q7, q7, q1
vrev64.32 q6, q6
vrev64.32 q7, q7
// c += d;
vadd.u64 q4, q4, q6
vadd.u64 q5, q5, q7
// b = ror64(b ^ c, 24);
vld1.8 {M_0}, [ROR24_TABLE, :64]
veor q2, q2, q4
veor q3, q3, q5
vtbl.8 d4, {d4}, M_0
vtbl.8 d5, {d5}, M_0
vtbl.8 d6, {d6}, M_0
vtbl.8 d7, {d7}, M_0
// a += b + m[blake2b_sigma[r][2*i + 1]];
//
// M_0 got clobbered above, so we have to reload it if any of the four
// message words this step needs happens to be M_0. Otherwise we don't
// need to reload it here, as it will just get clobbered again below.
.if \s1 == 0 || \s3 == 0 || \s5 == 0 || \s7 == 0
vld1.8 {M_0}, [sp, :64]
.endif
vadd.u64 q0, q0, q2
vadd.u64 q1, q1, q3
vadd.u64 d0, d0, M_\s1
vadd.u64 d1, d1, M_\s3
vadd.u64 d2, d2, M_\s5
vadd.u64 d3, d3, M_\s7
// d = ror64(d ^ a, 16);
vld1.8 {M_0}, [ROR16_TABLE, :64]
veor q6, q6, q0
veor q7, q7, q1
vtbl.8 d12, {d12}, M_0
vtbl.8 d13, {d13}, M_0
vtbl.8 d14, {d14}, M_0
vtbl.8 d15, {d15}, M_0
// c += d;
vadd.u64 q4, q4, q6
vadd.u64 q5, q5, q7
// b = ror64(b ^ c, 63);
//
// This rotation amount isn't a multiple of 8, so it has to be
// implemented using a pair of shifts, which requires temporary
// registers. Use q8-q9 (M_0-M_3) for this, and reload them afterwards.
veor q8, q2, q4
veor q9, q3, q5
vshr.u64 q2, q8, #63
vshr.u64 q3, q9, #63
vsli.u64 q2, q8, #1
vsli.u64 q3, q9, #1
vld1.8 {q8-q9}, [sp, :256]
// Mix the diagonals:
// (v[0], v[5], v[10], v[15]), (v[1], v[6], v[11], v[12]),
// (v[2], v[7], v[8], v[13]), and (v[3], v[4], v[9], v[14]).
//
// There are two possible ways to do this: use 'vext' instructions to
// shift the rows of the matrix so that the diagonals become columns,
// and undo it afterwards; or just use 64-bit operations on 'd'
// registers instead of 128-bit operations on 'q' registers. We use the
// latter approach, as it performs much better on Cortex-A7.
// a += b + m[blake2b_sigma[r][2*i + 0]];
vadd.u64 d0, d0, d5
vadd.u64 d1, d1, d6
vadd.u64 d2, d2, d7
vadd.u64 d3, d3, d4
vadd.u64 d0, d0, M_\s8
vadd.u64 d1, d1, M_\s10
vadd.u64 d2, d2, M_\s12
vadd.u64 d3, d3, M_\s14
// d = ror64(d ^ a, 32);
veor d15, d15, d0
veor d12, d12, d1
veor d13, d13, d2
veor d14, d14, d3
vrev64.32 d15, d15
vrev64.32 d12, d12
vrev64.32 d13, d13
vrev64.32 d14, d14
// c += d;
vadd.u64 d10, d10, d15
vadd.u64 d11, d11, d12
vadd.u64 d8, d8, d13
vadd.u64 d9, d9, d14
// b = ror64(b ^ c, 24);
vld1.8 {M_0}, [ROR24_TABLE, :64]
veor d5, d5, d10
veor d6, d6, d11
veor d7, d7, d8
veor d4, d4, d9
vtbl.8 d5, {d5}, M_0
vtbl.8 d6, {d6}, M_0
vtbl.8 d7, {d7}, M_0
vtbl.8 d4, {d4}, M_0
// a += b + m[blake2b_sigma[r][2*i + 1]];
.if \s9 == 0 || \s11 == 0 || \s13 == 0 || \s15 == 0
vld1.8 {M_0}, [sp, :64]
.endif
vadd.u64 d0, d0, d5
vadd.u64 d1, d1, d6
vadd.u64 d2, d2, d7
vadd.u64 d3, d3, d4
vadd.u64 d0, d0, M_\s9
vadd.u64 d1, d1, M_\s11
vadd.u64 d2, d2, M_\s13
vadd.u64 d3, d3, M_\s15
// d = ror64(d ^ a, 16);
vld1.8 {M_0}, [ROR16_TABLE, :64]
veor d15, d15, d0
veor d12, d12, d1
veor d13, d13, d2
veor d14, d14, d3
vtbl.8 d12, {d12}, M_0
vtbl.8 d13, {d13}, M_0
vtbl.8 d14, {d14}, M_0
vtbl.8 d15, {d15}, M_0
// c += d;
vadd.u64 d10, d10, d15
vadd.u64 d11, d11, d12
vadd.u64 d8, d8, d13
vadd.u64 d9, d9, d14
// b = ror64(b ^ c, 63);
veor d16, d4, d9
veor d17, d5, d10
veor d18, d6, d11
veor d19, d7, d8
vshr.u64 q2, q8, #63
vshr.u64 q3, q9, #63
vsli.u64 q2, q8, #1
vsli.u64 q3, q9, #1
// Reloading q8-q9 can be skipped on the final round.
.if ! \final
vld1.8 {q8-q9}, [sp, :256]
.endif
.endm
//
// void blake2b_compress_neon(struct blake2b_ctx *ctx,
// const u8 *data, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2b_ctx are used:
// u64 h[8]; (inout)
// u64 t[2]; (inout)
// u64 f[2]; (in)
//
.align 5
ENTRY(blake2b_compress_neon)
push {r4-r10}
// Allocate a 32-byte stack buffer that is 32-byte aligned.
mov ORIG_SP, sp
sub ip, sp, #32
bic ip, ip, #31
mov sp, ip
adr ROR24_TABLE, .Lror24_table
adr ROR16_TABLE, .Lror16_table
mov ip, CTX
vld1.64 {q0-q1}, [ip]! // Load h[0..3]
vld1.64 {q2-q3}, [ip]! // Load h[4..7]
.Lnext_block:
adr r10, .Lblake2b_IV
vld1.64 {q14-q15}, [ip] // Load t[0..1] and f[0..1]
vld1.64 {q4-q5}, [r10]! // Load IV[0..3]
vmov r7, r8, d28 // Copy t[0] to (r7, r8)
vld1.64 {q6-q7}, [r10] // Load IV[4..7]
adds r7, r7, INC // Increment counter
bcs .Lslow_inc_ctr
vmov.i32 d28[0], r7
vst1.64 {d28}, [ip] // Update t[0]
.Linc_ctr_done:
// Load the next message block and finish initializing the state matrix
// 'v'. Fortunately, there are exactly enough NEON registers to fit the
// entire state matrix in q0-q7 and the entire message block in q8-q15.
//
// However, _blake2b_round also needs some extra registers for rotates,
// so we have to spill some registers. It's better to spill the message
// registers than the state registers, as the message doesn't change.
// Therefore we store a copy of the first 32 bytes of the message block
// (q8-q9) in an aligned buffer on the stack so that they can be
// reloaded when needed. (We could just reload directly from the
// message buffer, but it's faster to use aligned loads.)
vld1.8 {q8-q9}, [DATA]!
veor q6, q6, q14 // v[12..13] = IV[4..5] ^ t[0..1]
vld1.8 {q10-q11}, [DATA]!
veor q7, q7, q15 // v[14..15] = IV[6..7] ^ f[0..1]
vld1.8 {q12-q13}, [DATA]!
vst1.8 {q8-q9}, [sp, :256]
mov ip, CTX
vld1.8 {q14-q15}, [DATA]!
// Execute the rounds. Each round is provided the order in which it
// needs to use the message words.
_blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
_blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
_blake2b_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
_blake2b_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
_blake2b_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
_blake2b_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
_blake2b_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
_blake2b_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
_blake2b_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
_blake2b_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
_blake2b_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
_blake2b_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 \
final=1
// Fold the final state matrix into the hash chaining value:
//
// for (i = 0; i < 8; i++)
// h[i] ^= v[i] ^ v[i + 8];
//
vld1.64 {q8-q9}, [ip]! // Load old h[0..3]
veor q0, q0, q4 // v[0..1] ^= v[8..9]
veor q1, q1, q5 // v[2..3] ^= v[10..11]
vld1.64 {q10-q11}, [ip] // Load old h[4..7]
veor q2, q2, q6 // v[4..5] ^= v[12..13]
veor q3, q3, q7 // v[6..7] ^= v[14..15]
veor q0, q0, q8 // v[0..1] ^= h[0..1]
veor q1, q1, q9 // v[2..3] ^= h[2..3]
mov ip, CTX
subs NBLOCKS, NBLOCKS, #1 // nblocks--
vst1.64 {q0-q1}, [ip]! // Store new h[0..3]
veor q2, q2, q10 // v[4..5] ^= h[4..5]
veor q3, q3, q11 // v[6..7] ^= h[6..7]
vst1.64 {q2-q3}, [ip]! // Store new h[4..7]
// Advance to the next block, if there is one.
bne .Lnext_block // nblocks != 0?
mov sp, ORIG_SP
pop {r4-r10}
mov pc, lr
.Lslow_inc_ctr:
// Handle the case where the counter overflowed its low 32 bits, by
// carrying the overflow bit into the full 128-bit counter.
vmov r9, r10, d29
adcs r8, r8, #0
adcs r9, r9, #0
adc r10, r10, #0
vmov d28, r7, r8
vmov d29, r9, r10
vst1.64 {q14}, [ip] // Update t[0] and t[1]
b .Linc_ctr_done
ENDPROC(blake2b_compress_neon)

lib/crypto/arm/blake2b.h (new file, 41 lines)

@@ -0,0 +1,41 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* BLAKE2b digest algorithm, NEON accelerated
*
* Copyright 2020 Google LLC
*/
#include <asm/neon.h>
#include <asm/simd.h>
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
asmlinkage void blake2b_compress_neon(struct blake2b_ctx *ctx,
const u8 *data, size_t nblocks, u32 inc);
static void blake2b_compress(struct blake2b_ctx *ctx,
const u8 *data, size_t nblocks, u32 inc)
{
if (!static_branch_likely(&have_neon) || !may_use_simd()) {
blake2b_compress_generic(ctx, data, nblocks, inc);
return;
}
do {
const size_t blocks = min_t(size_t, nblocks,
SZ_4K / BLAKE2B_BLOCK_SIZE);
kernel_neon_begin();
blake2b_compress_neon(ctx, data, blocks, inc);
kernel_neon_end();
data += blocks * BLAKE2B_BLOCK_SIZE;
nblocks -= blocks;
} while (nblocks);
}
#define blake2b_mod_init_arch blake2b_mod_init_arch
static void blake2b_mod_init_arch(void)
{
if (elf_hwcap & HWCAP_NEON)
static_branch_enable(&have_neon);
}


@@ -115,7 +115,7 @@
// Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9]
// are in r0..r9. The stack pointer points to 8 bytes of scratch space for
-// spilling v[8..9], then to v[9..15], then to the message block. r10-r12 and
+// spilling v[8..9], then to v[10..15], then to the message block. r10-r12 and
// r14 are free to use. The macro arguments s0-s15 give the order in which the
// message words are used in this round.
//
@@ -170,10 +170,10 @@
.endm
//
-// void blake2s_compress(struct blake2s_state *state,
-// const u8 *block, size_t nblocks, u32 inc);
+// void blake2s_compress(struct blake2s_ctx *ctx,
+// const u8 *data, size_t nblocks, u32 inc);
//
-// Only the first three fields of struct blake2s_state are used:
+// Only the first three fields of struct blake2s_ctx are used:
// u32 h[8]; (inout)
// u32 t[2]; (inout)
// u32 f[2]; (in)
@@ -183,8 +183,8 @@ ENTRY(blake2s_compress)
push {r0-r2,r4-r11,lr} // keep this an even number
.Lnext_block:
-// r0 is 'state'
-// r1 is 'block'
+// r0 is 'ctx'
+// r1 is 'data'
// r3 is 'inc'
// Load and increment the counter t[0..1].
@@ -209,18 +209,18 @@ ENTRY(blake2s_compress)
.Lcopy_block_done:
str r1, [sp, #68] // Update message pointer
-// Calculate v[8..15]. Push v[9..15] onto the stack, and leave space
+// Calculate v[8..15]. Push v[10..15] onto the stack, and leave space
// for spilling v[8..9]. Leave v[8..9] in r8-r9.
-mov r14, r0 // r14 = state
+mov r14, r0 // r14 = ctx
adr r12, .Lblake2s_IV
ldmia r12!, {r8-r9} // load IV[0..1]
__ldrd r0, r1, r14, 40 // load f[0..1]
-ldm r12, {r2-r7} // load IV[3..7]
+ldm r12, {r2-r7} // load IV[2..7]
eor r4, r4, r10 // v[12] = IV[4] ^ t[0]
eor r5, r5, r11 // v[13] = IV[5] ^ t[1]
eor r6, r6, r0 // v[14] = IV[6] ^ f[0]
eor r7, r7, r1 // v[15] = IV[7] ^ f[1]
-push {r2-r7} // push v[9..15]
+push {r2-r7} // push v[10..15]
sub sp, sp, #8 // leave space for v[8..9]
// Load h[0..7] == v[0..7].
@@ -275,7 +275,7 @@ ENTRY(blake2s_compress)
// Advance to the next block, if there is one. Note that if there are
// multiple blocks, then 'inc' (the counter increment amount) must be
// 64. So we can simply set it to 64 without re-loading it.
-ldm sp, {r0, r1, r2} // load (state, block, nblocks)
+ldm sp, {r0, r1, r2} // load (ctx, data, nblocks)
mov r3, #64 // set 'inc'
subs r2, r2, #1 // nblocks--
str r2, [sp, #8]


@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* defined in blake2s-core.S */
-void blake2s_compress(struct blake2s_state *state, const u8 *block,
-size_t nblocks, u32 inc);
+void blake2s_compress(struct blake2s_ctx *ctx,
+const u8 *data, size_t nblocks, u32 inc);


@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
-/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
+/* ARM/NEON accelerated SHA-1 transform function
*
* Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*/


@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
-* sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions
+* SHA-1 secure hash using ARMv8 Crypto Extensions
*
* Copyright (C) 2015 Linaro Ltd.
* Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>


@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
-* sha256-ce.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions
+* SHA-224/256 secure hash using ARMv8 Crypto Extensions
*
* Copyright (C) 2015 Linaro Ltd.
* Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>


@@ -0,0 +1,359 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Implementation of POLYVAL using ARMv8 Crypto Extensions.
*
* Copyright 2021 Google LLC
*/
/*
* This is an efficient implementation of POLYVAL using ARMv8 Crypto Extensions.
* It works on 8 blocks at a time, by precomputing the first 8 key powers h^8,
* ..., h^1 in the POLYVAL finite field. This precomputation allows us to split
* finite field multiplication into two steps.
*
* In the first step, we consider h^i, m_i as normal polynomials of degree less
* than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
* is simply polynomial multiplication.
*
* In the second step, we compute the reduction of p(x) modulo the finite field
* modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
*
* This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
* multiplication is finite field multiplication. The advantage is that the
* two-step process only requires 1 finite field reduction for every 8
* polynomial multiplications. Further parallelism is gained by interleaving the
* multiplications and polynomial reductions.
*/
#include <linux/linkage.h>
#define STRIDE_BLOCKS 8
ACCUMULATOR .req x0
KEY_POWERS .req x1
MSG .req x2
BLOCKS_LEFT .req x3
KEY_START .req x10
EXTRA_BYTES .req x11
TMP .req x13
M0 .req v0
M1 .req v1
M2 .req v2
M3 .req v3
M4 .req v4
M5 .req v5
M6 .req v6
M7 .req v7
KEY8 .req v8
KEY7 .req v9
KEY6 .req v10
KEY5 .req v11
KEY4 .req v12
KEY3 .req v13
KEY2 .req v14
KEY1 .req v15
PL .req v16
PH .req v17
TMP_V .req v18
LO .req v20
MI .req v21
HI .req v22
SUM .req v23
GSTAR .req v24
.text
.arch armv8-a+crypto
.align 4
.Lgstar:
.quad 0xc200000000000000, 0xc200000000000000
/*
* Computes the product of two 128-bit polynomials in X and Y and XORs the
* components of the 256-bit product into LO, MI, HI.
*
* Given:
* X = [X_1 : X_0]
* Y = [Y_1 : Y_0]
*
* We compute:
* LO += X_0 * Y_0
* MI += (X_0 + X_1) * (Y_0 + Y_1)
* HI += X_1 * Y_1
*
* Later, the 256-bit result can be extracted as:
* [HI_1 : HI_0 + HI_1 + MI_1 + LO_1 : LO_1 + HI_0 + MI_0 + LO_0 : LO_0]
* This step is done when computing the polynomial reduction for efficiency
* reasons.
*
* Karatsuba multiplication is used instead of Schoolbook multiplication because
* it was found to be slightly faster on ARM64 CPUs.
*
*/
.macro karatsuba1 X Y
X .req \X
Y .req \Y
ext v25.16b, X.16b, X.16b, #8
ext v26.16b, Y.16b, Y.16b, #8
eor v25.16b, v25.16b, X.16b
eor v26.16b, v26.16b, Y.16b
pmull2 v28.1q, X.2d, Y.2d
pmull v29.1q, X.1d, Y.1d
pmull v27.1q, v25.1d, v26.1d
eor HI.16b, HI.16b, v28.16b
eor LO.16b, LO.16b, v29.16b
eor MI.16b, MI.16b, v27.16b
.unreq X
.unreq Y
.endm
/*
* Same as karatsuba1, except overwrites HI, LO, MI rather than XORing into
* them.
*/
.macro karatsuba1_store X Y
X .req \X
Y .req \Y
ext v25.16b, X.16b, X.16b, #8
ext v26.16b, Y.16b, Y.16b, #8
eor v25.16b, v25.16b, X.16b
eor v26.16b, v26.16b, Y.16b
pmull2 HI.1q, X.2d, Y.2d
pmull LO.1q, X.1d, Y.1d
pmull MI.1q, v25.1d, v26.1d
.unreq X
.unreq Y
.endm
/*
* Computes the 256-bit polynomial represented by LO, HI, MI. Stores
* the result in PL, PH.
* [PH : PL] =
* [HI_1 : HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
*/
.macro karatsuba2
// v4 = [HI_1 + MI_1 : HI_0 + MI_0]
eor v4.16b, HI.16b, MI.16b
// v4 = [HI_1 + MI_1 + LO_1 : HI_0 + MI_0 + LO_0]
eor v4.16b, v4.16b, LO.16b
// v5 = [HI_0 : LO_1]
ext v5.16b, LO.16b, HI.16b, #8
// v4 = [HI_1 + HI_0 + MI_1 + LO_1 : HI_0 + MI_0 + LO_1 + LO_0]
eor v4.16b, v4.16b, v5.16b
// HI = [HI_0 : HI_1]
ext HI.16b, HI.16b, HI.16b, #8
// LO = [LO_0 : LO_1]
ext LO.16b, LO.16b, LO.16b, #8
// PH = [HI_1 : HI_1 + HI_0 + MI_1 + LO_1]
ext PH.16b, v4.16b, HI.16b, #8
// PL = [HI_0 + MI_0 + LO_1 + LO_0 : LO_0]
ext PL.16b, LO.16b, v4.16b, #8
.endm
/*
* Computes the 128-bit reduction of PH : PL. Stores the result in dest.
*
* This macro computes p(x) mod g(x) where p(x) is in montgomery form and g(x) =
* x^128 + x^127 + x^126 + x^121 + 1.
*
* We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
* product of two 128-bit polynomials in Montgomery form. We need to reduce it
* mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
* of x^128, this product has two extra factors of x^128. To get it back into
* Montgomery form, we need to remove one of these factors by dividing by x^128.
*
* To accomplish both of these goals, we add multiples of g(x) that cancel out
* the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
* bits are zero, the polynomial division by x^128 can be done by right
* shifting.
*
* Since the only nonzero term in the low 64 bits of g(x) is the constant term,
* the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
* only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
* x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
* the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
* = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
*
* Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
* 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
* + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
* x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
* P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
*
* So our final computation is:
* T = T_1 : T_0 = g*(x) * P_0
* V = V_1 : V_0 = g*(x) * (P_1 + T_0)
* p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
*
* The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
* + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
* T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
*/
.macro montgomery_reduction dest
DEST .req \dest
// TMP_V = T_1 : T_0 = P_0 * g*(x)
pmull TMP_V.1q, PL.1d, GSTAR.1d
// TMP_V = T_0 : T_1
ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
// TMP_V = P_1 + T_0 : P_0 + T_1
eor TMP_V.16b, PL.16b, TMP_V.16b
// PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
eor PH.16b, PH.16b, TMP_V.16b
// TMP_V = V_1 : V_0 = (P_1 + T_0) * g*(x)
pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d
eor DEST.16b, PH.16b, TMP_V.16b
.unreq DEST
.endm
/*
* Compute Polyval on 8 blocks.
*
* If reduce is set, also computes the montgomery reduction of the
* previous full_stride call and XORs with the first message block.
* (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
* I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
*
* Sets PL, PH.
*/
.macro full_stride reduce
eor LO.16b, LO.16b, LO.16b
eor MI.16b, MI.16b, MI.16b
eor HI.16b, HI.16b, HI.16b
ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
ld1 {M4.16b, M5.16b, M6.16b, M7.16b}, [MSG], #64
karatsuba1 M7 KEY1
.if \reduce
pmull TMP_V.1q, PL.1d, GSTAR.1d
.endif
karatsuba1 M6 KEY2
.if \reduce
ext TMP_V.16b, TMP_V.16b, TMP_V.16b, #8
.endif
karatsuba1 M5 KEY3
.if \reduce
eor TMP_V.16b, PL.16b, TMP_V.16b
.endif
karatsuba1 M4 KEY4
.if \reduce
eor PH.16b, PH.16b, TMP_V.16b
.endif
karatsuba1 M3 KEY5
.if \reduce
pmull2 TMP_V.1q, TMP_V.2d, GSTAR.2d
.endif
karatsuba1 M2 KEY6
.if \reduce
eor SUM.16b, PH.16b, TMP_V.16b
.endif
karatsuba1 M1 KEY7
eor M0.16b, M0.16b, SUM.16b
karatsuba1 M0 KEY8
karatsuba2
.endm
/*
* Handle any extra blocks after full_stride loop.
*/
.macro partial_stride
add KEY_POWERS, KEY_START, #(STRIDE_BLOCKS << 4)
sub KEY_POWERS, KEY_POWERS, BLOCKS_LEFT, lsl #4
ld1 {KEY1.16b}, [KEY_POWERS], #16
ld1 {TMP_V.16b}, [MSG], #16
eor SUM.16b, SUM.16b, TMP_V.16b
karatsuba1_store KEY1 SUM
sub BLOCKS_LEFT, BLOCKS_LEFT, #1
tst BLOCKS_LEFT, #4
beq .Lpartial4BlocksDone
ld1 {M0.16b, M1.16b, M2.16b, M3.16b}, [MSG], #64
ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
karatsuba1 M0 KEY8
karatsuba1 M1 KEY7
karatsuba1 M2 KEY6
karatsuba1 M3 KEY5
.Lpartial4BlocksDone:
tst BLOCKS_LEFT, #2
beq .Lpartial2BlocksDone
ld1 {M0.16b, M1.16b}, [MSG], #32
ld1 {KEY8.16b, KEY7.16b}, [KEY_POWERS], #32
karatsuba1 M0 KEY8
karatsuba1 M1 KEY7
.Lpartial2BlocksDone:
tst BLOCKS_LEFT, #1
beq .LpartialDone
ld1 {M0.16b}, [MSG], #16
ld1 {KEY8.16b}, [KEY_POWERS], #16
karatsuba1 M0 KEY8
.LpartialDone:
karatsuba2
montgomery_reduction SUM
.endm
/*
* Computes a = a * b * x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1.
*
* void polyval_mul_pmull(struct polyval_elem *a,
* const struct polyval_elem *b);
*/
SYM_FUNC_START(polyval_mul_pmull)
adr TMP, .Lgstar
ld1 {GSTAR.2d}, [TMP]
ld1 {v0.16b}, [x0]
ld1 {v1.16b}, [x1]
karatsuba1_store v0 v1
karatsuba2
montgomery_reduction SUM
st1 {SUM.16b}, [x0]
ret
SYM_FUNC_END(polyval_mul_pmull)
/*
* Perform polynomial evaluation as specified by POLYVAL. This computes:
* h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
* where n=nblocks, h is the hash key, and m_i are the message blocks.
*
* x0 - pointer to accumulator
* x1 - pointer to precomputed key powers h^8 ... h^1
* x2 - pointer to message blocks
* x3 - number of blocks to hash
*
* void polyval_blocks_pmull(struct polyval_elem *acc,
* const struct polyval_key *key,
* const u8 *data, size_t nblocks);
*/
SYM_FUNC_START(polyval_blocks_pmull)
adr TMP, .Lgstar
mov KEY_START, KEY_POWERS
ld1 {GSTAR.2d}, [TMP]
ld1 {SUM.16b}, [ACCUMULATOR]
subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
blt .LstrideLoopExit
ld1 {KEY8.16b, KEY7.16b, KEY6.16b, KEY5.16b}, [KEY_POWERS], #64
ld1 {KEY4.16b, KEY3.16b, KEY2.16b, KEY1.16b}, [KEY_POWERS], #64
full_stride 0
subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
blt .LstrideLoopExitReduce
.LstrideLoop:
full_stride 1
subs BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
bge .LstrideLoop
.LstrideLoopExitReduce:
montgomery_reduction SUM
.LstrideLoopExit:
adds BLOCKS_LEFT, BLOCKS_LEFT, #STRIDE_BLOCKS
beq .LskipPartial
partial_stride
.LskipPartial:
st1 {SUM.16b}, [ACCUMULATOR]
ret
SYM_FUNC_END(polyval_blocks_pmull)


@@ -0,0 +1,82 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* POLYVAL library functions, arm64 optimized
*
* Copyright 2025 Google LLC
*/
#include <asm/neon.h>
#include <asm/simd.h>
#include <linux/cpufeature.h>
#define NUM_H_POWERS 8
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull);
asmlinkage void polyval_mul_pmull(struct polyval_elem *a,
const struct polyval_elem *b);
asmlinkage void polyval_blocks_pmull(struct polyval_elem *acc,
const struct polyval_key *key,
const u8 *data, size_t nblocks);
static void polyval_preparekey_arch(struct polyval_key *key,
const u8 raw_key[POLYVAL_BLOCK_SIZE])
{
static_assert(ARRAY_SIZE(key->h_powers) == NUM_H_POWERS);
memcpy(&key->h_powers[NUM_H_POWERS - 1], raw_key, POLYVAL_BLOCK_SIZE);
if (static_branch_likely(&have_pmull) && may_use_simd()) {
kernel_neon_begin();
for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
key->h_powers[i] = key->h_powers[i + 1];
polyval_mul_pmull(&key->h_powers[i],
&key->h_powers[NUM_H_POWERS - 1]);
}
kernel_neon_end();
} else {
for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
key->h_powers[i] = key->h_powers[i + 1];
polyval_mul_generic(&key->h_powers[i],
&key->h_powers[NUM_H_POWERS - 1]);
}
}
}
static void polyval_mul_arch(struct polyval_elem *acc,
const struct polyval_key *key)
{
if (static_branch_likely(&have_pmull) && may_use_simd()) {
kernel_neon_begin();
polyval_mul_pmull(acc, &key->h_powers[NUM_H_POWERS - 1]);
kernel_neon_end();
} else {
polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]);
}
}
static void polyval_blocks_arch(struct polyval_elem *acc,
const struct polyval_key *key,
const u8 *data, size_t nblocks)
{
if (static_branch_likely(&have_pmull) && may_use_simd()) {
do {
/* Allow rescheduling every 4 KiB. */
size_t n = min_t(size_t, nblocks,
4096 / POLYVAL_BLOCK_SIZE);
kernel_neon_begin();
polyval_blocks_pmull(acc, key, data, n);
kernel_neon_end();
data += n * POLYVAL_BLOCK_SIZE;
nblocks -= n;
} while (nblocks);
} else {
polyval_blocks_generic(acc, &key->h_powers[NUM_H_POWERS - 1],
data, nblocks);
}
}
#define polyval_mod_init_arch polyval_mod_init_arch
static void polyval_mod_init_arch(void)
{
if (cpu_have_named_feature(PMULL))
static_branch_enable(&have_pmull);
}


@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
-* sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions
+* SHA-1 secure hash using ARMv8 Crypto Extensions
*
* Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
*/


@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
-* sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions
+* Core SHA-224/SHA-256 transform using v8 Crypto Extensions
*
* Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
*/


@@ -0,0 +1,213 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Core SHA-3 transform using v8.2 Crypto Extensions
*
* Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
.irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
.set .Lv\b\().2d, \b
.set .Lv\b\().16b, \b
.endr
/*
* ARMv8.2 Crypto Extensions instructions
*/
.macro eor3, rd, rn, rm, ra
.inst 0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
.endm
.macro rax1, rd, rn, rm
.inst 0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
.endm
.macro bcax, rd, rn, rm, ra
.inst 0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
.endm
.macro xar, rd, rn, rm, imm6
.inst 0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16)
.endm
/*
* size_t sha3_ce_transform(struct sha3_state *state, const u8 *data,
* size_t nblocks, size_t block_size)
*
* block_size is assumed to be one of 72 (SHA3-512), 104 (SHA3-384), 136
* (SHA3-256 and SHAKE256), 144 (SHA3-224), or 168 (SHAKE128).
*/
.text
SYM_FUNC_START(sha3_ce_transform)
/* load state */
add x8, x0, #32
ld1 { v0.1d- v3.1d}, [x0]
ld1 { v4.1d- v7.1d}, [x8], #32
ld1 { v8.1d-v11.1d}, [x8], #32
ld1 {v12.1d-v15.1d}, [x8], #32
ld1 {v16.1d-v19.1d}, [x8], #32
ld1 {v20.1d-v23.1d}, [x8], #32
ld1 {v24.1d}, [x8]
0: sub x2, x2, #1
mov w8, #24
adr_l x9, .Lsha3_rcon
/* load input */
ld1 {v25.8b-v28.8b}, [x1], #32
ld1 {v29.8b}, [x1], #8
eor v0.8b, v0.8b, v25.8b
eor v1.8b, v1.8b, v26.8b
eor v2.8b, v2.8b, v27.8b
eor v3.8b, v3.8b, v28.8b
eor v4.8b, v4.8b, v29.8b
ld1 {v25.8b-v28.8b}, [x1], #32
eor v5.8b, v5.8b, v25.8b
eor v6.8b, v6.8b, v26.8b
eor v7.8b, v7.8b, v27.8b
eor v8.8b, v8.8b, v28.8b
cmp x3, #72
b.eq 3f /* SHA3-512 (block_size=72)? */
ld1 {v25.8b-v28.8b}, [x1], #32
eor v9.8b, v9.8b, v25.8b
eor v10.8b, v10.8b, v26.8b
eor v11.8b, v11.8b, v27.8b
eor v12.8b, v12.8b, v28.8b
cmp x3, #104
b.eq 3f /* SHA3-384 (block_size=104)? */
ld1 {v25.8b-v28.8b}, [x1], #32
eor v13.8b, v13.8b, v25.8b
eor v14.8b, v14.8b, v26.8b
eor v15.8b, v15.8b, v27.8b
eor v16.8b, v16.8b, v28.8b
cmp x3, #144
b.lt 3f /* SHA3-256 or SHAKE256 (block_size=136)? */
b.eq 2f /* SHA3-224 (block_size=144)? */
/* SHAKE128 (block_size=168) */
ld1 {v25.8b-v28.8b}, [x1], #32
eor v17.8b, v17.8b, v25.8b
eor v18.8b, v18.8b, v26.8b
eor v19.8b, v19.8b, v27.8b
eor v20.8b, v20.8b, v28.8b
b 3f
2:
/* SHA3-224 (block_size=144) */
ld1 {v25.8b}, [x1], #8
eor v17.8b, v17.8b, v25.8b
3: sub w8, w8, #1
eor3 v29.16b, v4.16b, v9.16b, v14.16b
eor3 v26.16b, v1.16b, v6.16b, v11.16b
eor3 v28.16b, v3.16b, v8.16b, v13.16b
eor3 v25.16b, v0.16b, v5.16b, v10.16b
eor3 v27.16b, v2.16b, v7.16b, v12.16b
eor3 v29.16b, v29.16b, v19.16b, v24.16b
eor3 v26.16b, v26.16b, v16.16b, v21.16b
eor3 v28.16b, v28.16b, v18.16b, v23.16b
eor3 v25.16b, v25.16b, v15.16b, v20.16b
eor3 v27.16b, v27.16b, v17.16b, v22.16b
rax1 v30.2d, v29.2d, v26.2d // bc[0]
rax1 v26.2d, v26.2d, v28.2d // bc[2]
rax1 v28.2d, v28.2d, v25.2d // bc[4]
rax1 v25.2d, v25.2d, v27.2d // bc[1]
rax1 v27.2d, v27.2d, v29.2d // bc[3]
eor v0.16b, v0.16b, v30.16b
xar v29.2d, v1.2d, v25.2d, (64 - 1)
xar v1.2d, v6.2d, v25.2d, (64 - 44)
xar v6.2d, v9.2d, v28.2d, (64 - 20)
xar v9.2d, v22.2d, v26.2d, (64 - 61)
xar v22.2d, v14.2d, v28.2d, (64 - 39)
xar v14.2d, v20.2d, v30.2d, (64 - 18)
xar v31.2d, v2.2d, v26.2d, (64 - 62)
xar v2.2d, v12.2d, v26.2d, (64 - 43)
xar v12.2d, v13.2d, v27.2d, (64 - 25)
xar v13.2d, v19.2d, v28.2d, (64 - 8)
xar v19.2d, v23.2d, v27.2d, (64 - 56)
xar v23.2d, v15.2d, v30.2d, (64 - 41)
xar v15.2d, v4.2d, v28.2d, (64 - 27)
xar v28.2d, v24.2d, v28.2d, (64 - 14)
xar v24.2d, v21.2d, v25.2d, (64 - 2)
xar v8.2d, v8.2d, v27.2d, (64 - 55)
xar v4.2d, v16.2d, v25.2d, (64 - 45)
xar v16.2d, v5.2d, v30.2d, (64 - 36)
xar v5.2d, v3.2d, v27.2d, (64 - 28)
xar v27.2d, v18.2d, v27.2d, (64 - 21)
xar v3.2d, v17.2d, v26.2d, (64 - 15)
xar v25.2d, v11.2d, v25.2d, (64 - 10)
xar v26.2d, v7.2d, v26.2d, (64 - 6)
xar v30.2d, v10.2d, v30.2d, (64 - 3)
bcax v20.16b, v31.16b, v22.16b, v8.16b
bcax v21.16b, v8.16b, v23.16b, v22.16b
bcax v22.16b, v22.16b, v24.16b, v23.16b
bcax v23.16b, v23.16b, v31.16b, v24.16b
bcax v24.16b, v24.16b, v8.16b, v31.16b
ld1r {v31.2d}, [x9], #8
bcax v17.16b, v25.16b, v19.16b, v3.16b
bcax v18.16b, v3.16b, v15.16b, v19.16b
bcax v19.16b, v19.16b, v16.16b, v15.16b
bcax v15.16b, v15.16b, v25.16b, v16.16b
bcax v16.16b, v16.16b, v3.16b, v25.16b
bcax v10.16b, v29.16b, v12.16b, v26.16b
bcax v11.16b, v26.16b, v13.16b, v12.16b
bcax v12.16b, v12.16b, v14.16b, v13.16b
bcax v13.16b, v13.16b, v29.16b, v14.16b
bcax v14.16b, v14.16b, v26.16b, v29.16b
bcax v7.16b, v30.16b, v9.16b, v4.16b
bcax v8.16b, v4.16b, v5.16b, v9.16b
bcax v9.16b, v9.16b, v6.16b, v5.16b
bcax v5.16b, v5.16b, v30.16b, v6.16b
bcax v6.16b, v6.16b, v4.16b, v30.16b
bcax v3.16b, v27.16b, v0.16b, v28.16b
bcax v4.16b, v28.16b, v1.16b, v0.16b
bcax v0.16b, v0.16b, v2.16b, v1.16b
bcax v1.16b, v1.16b, v27.16b, v2.16b
bcax v2.16b, v2.16b, v28.16b, v27.16b
eor v0.16b, v0.16b, v31.16b
cbnz w8, 3b
cond_yield 4f, x8, x9
cbnz x2, 0b
/* save state */
4: st1 { v0.1d- v3.1d}, [x0], #32
st1 { v4.1d- v7.1d}, [x0], #32
st1 { v8.1d-v11.1d}, [x0], #32
st1 {v12.1d-v15.1d}, [x0], #32
st1 {v16.1d-v19.1d}, [x0], #32
st1 {v20.1d-v23.1d}, [x0], #32
st1 {v24.1d}, [x0]
mov x0, x2
ret
SYM_FUNC_END(sha3_ce_transform)
.section ".rodata", "a"
.align 8
.Lsha3_rcon:
.quad 0x0000000000000001, 0x0000000000008082, 0x800000000000808a
.quad 0x8000000080008000, 0x000000000000808b, 0x0000000080000001
.quad 0x8000000080008081, 0x8000000000008009, 0x000000000000008a
.quad 0x0000000000000088, 0x0000000080008009, 0x000000008000000a
.quad 0x000000008000808b, 0x800000000000008b, 0x8000000000008089
.quad 0x8000000000008003, 0x8000000000008002, 0x8000000000000080
.quad 0x000000000000800a, 0x800000008000000a, 0x8000000080008081
.quad 0x8000000000008080, 0x0000000080000001, 0x8000000080008008

lib/crypto/arm64/sha3.h (new file, 62 lines)

@@ -0,0 +1,62 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <asm/neon.h>
#include <asm/simd.h>
#include <linux/cpufeature.h>
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3);
asmlinkage size_t sha3_ce_transform(struct sha3_state *state, const u8 *data,
size_t nblocks, size_t block_size);
static void sha3_absorb_blocks(struct sha3_state *state, const u8 *data,
size_t nblocks, size_t block_size)
{
if (static_branch_likely(&have_sha3) && likely(may_use_simd())) {
do {
size_t rem;
kernel_neon_begin();
rem = sha3_ce_transform(state, data, nblocks,
block_size);
kernel_neon_end();
data += (nblocks - rem) * block_size;
nblocks = rem;
} while (nblocks);
} else {
sha3_absorb_blocks_generic(state, data, nblocks, block_size);
}
}
static void sha3_keccakf(struct sha3_state *state)
{
if (static_branch_likely(&have_sha3) && likely(may_use_simd())) {
/*
* Passing zeroes into sha3_ce_transform() gives the plain
* Keccak-f permutation, which is what we want here. Any
* supported block size may be used. Use SHA3_512_BLOCK_SIZE
* since it's the shortest.
*/
static const u8 zeroes[SHA3_512_BLOCK_SIZE];
kernel_neon_begin();
sha3_ce_transform(state, zeroes, 1, sizeof(zeroes));
kernel_neon_end();
} else {
sha3_keccakf_generic(state);
}
}
#define sha3_mod_init_arch sha3_mod_init_arch
static void sha3_mod_init_arch(void)
{
if (cpu_have_named_feature(SHA3))
static_branch_enable(&have_sha3);
}


@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
-* sha512-ce-core.S - core SHA-384/SHA-512 transform using v8 Crypto Extensions
+* Core SHA-384/SHA-512 transform using v8 Crypto Extensions
*
* Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
*

lib/crypto/blake2b.c (new file, 174 lines)

@@ -0,0 +1,174 @@
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
* Copyright 2025 Google LLC
*
* This is an implementation of the BLAKE2b hash and PRF functions.
*
* Information: https://blake2.net/
*/
#include <crypto/blake2b.h>
#include <linux/bug.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/types.h>
static const u8 blake2b_sigma[12][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
static inline void blake2b_increment_counter(struct blake2b_ctx *ctx, u32 inc)
{
ctx->t[0] += inc;
ctx->t[1] += (ctx->t[0] < inc);
}
static void __maybe_unused
blake2b_compress_generic(struct blake2b_ctx *ctx,
const u8 *data, size_t nblocks, u32 inc)
{
u64 m[16];
u64 v[16];
int i;
WARN_ON(IS_ENABLED(DEBUG) &&
(nblocks > 1 && inc != BLAKE2B_BLOCK_SIZE));
while (nblocks > 0) {
blake2b_increment_counter(ctx, inc);
memcpy(m, data, BLAKE2B_BLOCK_SIZE);
le64_to_cpu_array(m, ARRAY_SIZE(m));
memcpy(v, ctx->h, 64);
v[ 8] = BLAKE2B_IV0;
v[ 9] = BLAKE2B_IV1;
v[10] = BLAKE2B_IV2;
v[11] = BLAKE2B_IV3;
v[12] = BLAKE2B_IV4 ^ ctx->t[0];
v[13] = BLAKE2B_IV5 ^ ctx->t[1];
v[14] = BLAKE2B_IV6 ^ ctx->f[0];
v[15] = BLAKE2B_IV7 ^ ctx->f[1];
#define G(r, i, a, b, c, d) do { \
a += b + m[blake2b_sigma[r][2 * i + 0]]; \
d = ror64(d ^ a, 32); \
c += d; \
b = ror64(b ^ c, 24); \
a += b + m[blake2b_sigma[r][2 * i + 1]]; \
d = ror64(d ^ a, 16); \
c += d; \
b = ror64(b ^ c, 63); \
} while (0)
#define ROUND(r) do { \
G(r, 0, v[0], v[ 4], v[ 8], v[12]); \
G(r, 1, v[1], v[ 5], v[ 9], v[13]); \
G(r, 2, v[2], v[ 6], v[10], v[14]); \
G(r, 3, v[3], v[ 7], v[11], v[15]); \
G(r, 4, v[0], v[ 5], v[10], v[15]); \
G(r, 5, v[1], v[ 6], v[11], v[12]); \
G(r, 6, v[2], v[ 7], v[ 8], v[13]); \
G(r, 7, v[3], v[ 4], v[ 9], v[14]); \
} while (0)
ROUND(0);
ROUND(1);
ROUND(2);
ROUND(3);
ROUND(4);
ROUND(5);
ROUND(6);
ROUND(7);
ROUND(8);
ROUND(9);
ROUND(10);
ROUND(11);
#undef G
#undef ROUND
for (i = 0; i < 8; ++i)
ctx->h[i] ^= v[i] ^ v[i + 8];
data += BLAKE2B_BLOCK_SIZE;
--nblocks;
}
}
#ifdef CONFIG_CRYPTO_LIB_BLAKE2B_ARCH
#include "blake2b.h" /* $(SRCARCH)/blake2b.h */
#else
#define blake2b_compress blake2b_compress_generic
#endif
static inline void blake2b_set_lastblock(struct blake2b_ctx *ctx)
{
ctx->f[0] = -1;
}
void blake2b_update(struct blake2b_ctx *ctx, const u8 *in, size_t inlen)
{
const size_t fill = BLAKE2B_BLOCK_SIZE - ctx->buflen;
if (unlikely(!inlen))
return;
if (inlen > fill) {
memcpy(ctx->buf + ctx->buflen, in, fill);
blake2b_compress(ctx, ctx->buf, 1, BLAKE2B_BLOCK_SIZE);
ctx->buflen = 0;
in += fill;
inlen -= fill;
}
if (inlen > BLAKE2B_BLOCK_SIZE) {
const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2B_BLOCK_SIZE);
blake2b_compress(ctx, in, nblocks - 1, BLAKE2B_BLOCK_SIZE);
in += BLAKE2B_BLOCK_SIZE * (nblocks - 1);
inlen -= BLAKE2B_BLOCK_SIZE * (nblocks - 1);
}
memcpy(ctx->buf + ctx->buflen, in, inlen);
ctx->buflen += inlen;
}
EXPORT_SYMBOL(blake2b_update);
void blake2b_final(struct blake2b_ctx *ctx, u8 *out)
{
WARN_ON(IS_ENABLED(DEBUG) && !out);
blake2b_set_lastblock(ctx);
memset(ctx->buf + ctx->buflen, 0,
BLAKE2B_BLOCK_SIZE - ctx->buflen); /* Padding */
blake2b_compress(ctx, ctx->buf, 1, ctx->buflen);
cpu_to_le64_array(ctx->h, ARRAY_SIZE(ctx->h));
memcpy(out, ctx->h, ctx->outlen);
memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL(blake2b_final);
#ifdef blake2b_mod_init_arch
static int __init blake2b_mod_init(void)
{
blake2b_mod_init_arch();
return 0;
}
subsys_initcall(blake2b_mod_init);
static void __exit blake2b_mod_exit(void)
{
}
module_exit(blake2b_mod_exit);
#endif
MODULE_DESCRIPTION("BLAKE2b hash function");
MODULE_LICENSE("GPL");


@@ -29,16 +29,15 @@ static const u8 blake2s_sigma[10][16] = {
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
};
-static inline void blake2s_increment_counter(struct blake2s_state *state,
-const u32 inc)
+static inline void blake2s_increment_counter(struct blake2s_ctx *ctx, u32 inc)
{
-state->t[0] += inc;
-state->t[1] += (state->t[0] < inc);
+ctx->t[0] += inc;
+ctx->t[1] += (ctx->t[0] < inc);
}
static void __maybe_unused
-blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
-size_t nblocks, const u32 inc)
+blake2s_compress_generic(struct blake2s_ctx *ctx,
+const u8 *data, size_t nblocks, u32 inc)
{
u32 m[16];
u32 v[16];
@@ -48,18 +47,18 @@ blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
(nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE));
while (nblocks > 0) {
-blake2s_increment_counter(state, inc);
-memcpy(m, block, BLAKE2S_BLOCK_SIZE);
+blake2s_increment_counter(ctx, inc);
+memcpy(m, data, BLAKE2S_BLOCK_SIZE);
le32_to_cpu_array(m, ARRAY_SIZE(m));
-memcpy(v, state->h, 32);
+memcpy(v, ctx->h, 32);
v[ 8] = BLAKE2S_IV0;
v[ 9] = BLAKE2S_IV1;
v[10] = BLAKE2S_IV2;
v[11] = BLAKE2S_IV3;
-v[12] = BLAKE2S_IV4 ^ state->t[0];
-v[13] = BLAKE2S_IV5 ^ state->t[1];
-v[14] = BLAKE2S_IV6 ^ state->f[0];
-v[15] = BLAKE2S_IV7 ^ state->f[1];
+v[12] = BLAKE2S_IV4 ^ ctx->t[0];
+v[13] = BLAKE2S_IV5 ^ ctx->t[1];
+v[14] = BLAKE2S_IV6 ^ ctx->f[0];
+v[15] = BLAKE2S_IV7 ^ ctx->f[1];
#define G(r, i, a, b, c, d) do { \
a += b + m[blake2s_sigma[r][2 * i + 0]]; \
@@ -97,9 +96,9 @@ blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
#undef ROUND
for (i = 0; i < 8; ++i)
-state->h[i] ^= v[i] ^ v[i + 8];
+ctx->h[i] ^= v[i] ^ v[i + 8];
-block += BLAKE2S_BLOCK_SIZE;
+data += BLAKE2S_BLOCK_SIZE;
--nblocks;
}
}
@@ -110,45 +109,46 @@ blake2s_compress_generic(struct blake2s_state *state, const u8 *block,
#define blake2s_compress blake2s_compress_generic
#endif
-static inline void blake2s_set_lastblock(struct blake2s_state *state)
+static inline void blake2s_set_lastblock(struct blake2s_ctx *ctx)
{
-state->f[0] = -1;
+ctx->f[0] = -1;
}
-void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen)
+void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen)
{
-const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
+const size_t fill = BLAKE2S_BLOCK_SIZE - ctx->buflen;
if (unlikely(!inlen))
return;
if (inlen > fill) {
-memcpy(state->buf + state->buflen, in, fill);
-blake2s_compress(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
-state->buflen = 0;
+memcpy(ctx->buf + ctx->buflen, in, fill);
+blake2s_compress(ctx, ctx->buf, 1, BLAKE2S_BLOCK_SIZE);
+ctx->buflen = 0;
in += fill;
inlen -= fill;
}
if (inlen > BLAKE2S_BLOCK_SIZE) {
const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
-blake2s_compress(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
+blake2s_compress(ctx, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
}
-memcpy(state->buf + state->buflen, in, inlen);
-state->buflen += inlen;
+memcpy(ctx->buf + ctx->buflen, in, inlen);
+ctx->buflen += inlen;
}
EXPORT_SYMBOL(blake2s_update);
-void blake2s_final(struct blake2s_state *state, u8 *out)
+void blake2s_final(struct blake2s_ctx *ctx, u8 *out)
{
WARN_ON(IS_ENABLED(DEBUG) && !out);
-blake2s_set_lastblock(state);
-memset(state->buf + state->buflen, 0,
-BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
-blake2s_compress(state, state->buf, 1, state->buflen);
-cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
-memcpy(out, state->h, state->outlen);
-memzero_explicit(state, sizeof(*state));
+blake2s_set_lastblock(ctx);
+memset(ctx->buf + ctx->buflen, 0,
+BLAKE2S_BLOCK_SIZE - ctx->buflen); /* Padding */
+blake2s_compress(ctx, ctx->buf, 1, ctx->buflen);
+cpu_to_le32_array(ctx->h, ARRAY_SIZE(ctx->h));
+memcpy(out, ctx->h, ctx->outlen);
+memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL(blake2s_final);

lib/crypto/fips.h (new file, 45 lines)

@@ -0,0 +1,45 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* This file was generated by: gen-fips-testvecs.py */
#include <linux/fips.h>
static const u8 fips_test_data[] __initconst __maybe_unused = {
0x66, 0x69, 0x70, 0x73, 0x20, 0x74, 0x65, 0x73,
0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00,
};
static const u8 fips_test_key[] __initconst __maybe_unused = {
0x66, 0x69, 0x70, 0x73, 0x20, 0x74, 0x65, 0x73,
0x74, 0x20, 0x6b, 0x65, 0x79, 0x00, 0x00, 0x00,
};
static const u8 fips_test_hmac_sha1_value[] __initconst __maybe_unused = {
0x29, 0xa9, 0x88, 0xb8, 0x5c, 0xb4, 0xaf, 0x4b,
0x97, 0x2a, 0xee, 0x87, 0x5b, 0x0a, 0x02, 0x55,
0x99, 0xbf, 0x86, 0x78,
};
static const u8 fips_test_hmac_sha256_value[] __initconst __maybe_unused = {
0x59, 0x25, 0x85, 0xcc, 0x40, 0xe9, 0x64, 0x2f,
0xe9, 0xbf, 0x82, 0xb7, 0xd3, 0x15, 0x3d, 0x43,
0x22, 0x0b, 0x4c, 0x00, 0x90, 0x14, 0x25, 0xcf,
0x9e, 0x13, 0x2b, 0xc2, 0x30, 0xe6, 0xe8, 0x93,
};
static const u8 fips_test_hmac_sha512_value[] __initconst __maybe_unused = {
0x6b, 0xea, 0x5d, 0x27, 0x49, 0x5b, 0x3f, 0xea,
0xde, 0x2d, 0xfa, 0x32, 0x75, 0xdb, 0x77, 0xc8,
0x26, 0xe9, 0x4e, 0x95, 0x4d, 0xad, 0x88, 0x02,
0x87, 0xf9, 0x52, 0x0a, 0xd1, 0x92, 0x80, 0x1d,
0x92, 0x7e, 0x3c, 0xbd, 0xb1, 0x3c, 0x49, 0x98,
0x44, 0x9c, 0x8f, 0xee, 0x3f, 0x02, 0x71, 0x51,
0x57, 0x0b, 0x15, 0x38, 0x95, 0xd8, 0xa3, 0x81,
0xba, 0xb3, 0x15, 0x37, 0x5c, 0x6d, 0x57, 0x2b,
};
static const u8 fips_test_sha3_256_value[] __initconst __maybe_unused = {
0x77, 0xc4, 0x8b, 0x69, 0x70, 0x5f, 0x0a, 0xb1,
0xb1, 0xa5, 0x82, 0x0a, 0x22, 0x2b, 0x49, 0x31,
0xba, 0x9b, 0xb6, 0xaa, 0x32, 0xa7, 0x97, 0x00,
0x98, 0xdb, 0xff, 0xe7, 0xc6, 0xde, 0xb5, 0x82,
};

lib/crypto/polyval.c (new file, 307 lines)

@@ -0,0 +1,307 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* POLYVAL library functions
*
* Copyright 2025 Google LLC
*/
#include <crypto/polyval.h>
#include <linux/export.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/unaligned.h>
/*
* POLYVAL is an almost-XOR-universal hash function. Similar to GHASH, POLYVAL
* interprets the message as the coefficients of a polynomial in GF(2^128) and
* evaluates that polynomial at a secret point. POLYVAL has a simple
* mathematical relationship with GHASH, but it uses a better field convention
* which makes it easier and faster to implement.
*
* POLYVAL is not a cryptographic hash function, and it should be used only by
* algorithms that are specifically designed to use it.
*
* POLYVAL is specified by "AES-GCM-SIV: Nonce Misuse-Resistant Authenticated
* Encryption" (https://datatracker.ietf.org/doc/html/rfc8452)
*
* POLYVAL is also used by HCTR2. See "Length-preserving encryption with HCTR2"
* (https://eprint.iacr.org/2021/1441.pdf).
*
* This file provides a library API for POLYVAL. This API can delegate to
* either a generic implementation or an architecture-optimized implementation.
*
* For the generic implementation, we don't use the traditional table approach
* to GF(2^128) multiplication. That approach is not constant-time and requires
* a lot of memory. Instead, we use a different approach which emulates
* carryless multiplication using standard multiplications by spreading the data
* bits apart using "holes". This allows the carries to spill harmlessly. This
* approach is borrowed from BoringSSL, which in turn credits BearSSL's
* documentation (https://bearssl.org/constanttime.html#ghash-for-gcm) for the
* "holes" trick and a presentation by Shay Gueron
* (https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf) for the
* 256-bit => 128-bit reduction algorithm.
*/
#ifdef CONFIG_ARCH_SUPPORTS_INT128
/* Do a 64 x 64 => 128 bit carryless multiplication. */
static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
{
/*
* With 64-bit multiplicands and one term every 4 bits, there would be
* up to 64 / 4 = 16 one bits per column when each multiplication is
* written out as a series of additions in the schoolbook manner.
* Unfortunately, that doesn't work since the value 16 is 1 too large to
* fit in 4 bits. Carries would sometimes overflow into the next term.
*
* Using one term every 5 bits would work. However, that would cost
* 5 x 5 = 25 multiplications instead of 4 x 4 = 16.
*
* Instead, mask off 4 bits from one multiplicand, giving a max of 15
* one bits per column. Then handle those 4 bits separately.
*/
u64 a0 = a & 0x1111111111111110;
u64 a1 = a & 0x2222222222222220;
u64 a2 = a & 0x4444444444444440;
u64 a3 = a & 0x8888888888888880;
u64 b0 = b & 0x1111111111111111;
u64 b1 = b & 0x2222222222222222;
u64 b2 = b & 0x4444444444444444;
u64 b3 = b & 0x8888888888888888;
/* Multiply the high 60 bits of @a by @b. */
u128 c0 = (a0 * (u128)b0) ^ (a1 * (u128)b3) ^
(a2 * (u128)b2) ^ (a3 * (u128)b1);
u128 c1 = (a0 * (u128)b1) ^ (a1 * (u128)b0) ^
(a2 * (u128)b3) ^ (a3 * (u128)b2);
u128 c2 = (a0 * (u128)b2) ^ (a1 * (u128)b1) ^
(a2 * (u128)b0) ^ (a3 * (u128)b3);
u128 c3 = (a0 * (u128)b3) ^ (a1 * (u128)b2) ^
(a2 * (u128)b1) ^ (a3 * (u128)b0);
/* Multiply the low 4 bits of @a by @b. */
u64 e0 = -(a & 1) & b;
u64 e1 = -((a >> 1) & 1) & b;
u64 e2 = -((a >> 2) & 1) & b;
u64 e3 = -((a >> 3) & 1) & b;
u64 extra_lo = e0 ^ (e1 << 1) ^ (e2 << 2) ^ (e3 << 3);
u64 extra_hi = (e1 >> 63) ^ (e2 >> 62) ^ (e3 >> 61);
/* Add all the intermediate products together. */
*out_lo = (((u64)c0) & 0x1111111111111111) ^
(((u64)c1) & 0x2222222222222222) ^
(((u64)c2) & 0x4444444444444444) ^
(((u64)c3) & 0x8888888888888888) ^ extra_lo;
*out_hi = (((u64)(c0 >> 64)) & 0x1111111111111111) ^
(((u64)(c1 >> 64)) & 0x2222222222222222) ^
(((u64)(c2 >> 64)) & 0x4444444444444444) ^
(((u64)(c3 >> 64)) & 0x8888888888888888) ^ extra_hi;
}
#else /* CONFIG_ARCH_SUPPORTS_INT128 */
/* Do a 32 x 32 => 64 bit carryless multiplication. */
static u64 clmul32(u32 a, u32 b)
{
/*
* With 32-bit multiplicands and one term every 4 bits, there are up to
* 32 / 4 = 8 one bits per column when each multiplication is written
* out as a series of additions in the schoolbook manner. The value 8
* fits in 4 bits, so the carries don't overflow into the next term.
*/
u32 a0 = a & 0x11111111;
u32 a1 = a & 0x22222222;
u32 a2 = a & 0x44444444;
u32 a3 = a & 0x88888888;
u32 b0 = b & 0x11111111;
u32 b1 = b & 0x22222222;
u32 b2 = b & 0x44444444;
u32 b3 = b & 0x88888888;
u64 c0 = (a0 * (u64)b0) ^ (a1 * (u64)b3) ^
(a2 * (u64)b2) ^ (a3 * (u64)b1);
u64 c1 = (a0 * (u64)b1) ^ (a1 * (u64)b0) ^
(a2 * (u64)b3) ^ (a3 * (u64)b2);
u64 c2 = (a0 * (u64)b2) ^ (a1 * (u64)b1) ^
(a2 * (u64)b0) ^ (a3 * (u64)b3);
u64 c3 = (a0 * (u64)b3) ^ (a1 * (u64)b2) ^
(a2 * (u64)b1) ^ (a3 * (u64)b0);
/* Add all the intermediate products together. */
return (c0 & 0x1111111111111111) ^
(c1 & 0x2222222222222222) ^
(c2 & 0x4444444444444444) ^
(c3 & 0x8888888888888888);
}
/* Do a 64 x 64 => 128 bit carryless multiplication. */
static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi)
{
u32 a_lo = (u32)a;
u32 a_hi = a >> 32;
u32 b_lo = (u32)b;
u32 b_hi = b >> 32;
/* Karatsuba multiplication */
u64 lo = clmul32(a_lo, b_lo);
u64 hi = clmul32(a_hi, b_hi);
u64 mi = clmul32(a_lo ^ a_hi, b_lo ^ b_hi) ^ lo ^ hi;
*out_lo = lo ^ (mi << 32);
*out_hi = hi ^ (mi >> 32);
}
#endif /* !CONFIG_ARCH_SUPPORTS_INT128 */
/* Compute @a = @a * @b * x^-128 in the POLYVAL field. */
static void __maybe_unused
polyval_mul_generic(struct polyval_elem *a, const struct polyval_elem *b)
{
u64 c0, c1, c2, c3, mi0, mi1;
/*
* Carryless-multiply @a by @b using Karatsuba multiplication. Store
* the 256-bit product in @c0 (low) through @c3 (high).
*/
clmul64(le64_to_cpu(a->lo), le64_to_cpu(b->lo), &c0, &c1);
clmul64(le64_to_cpu(a->hi), le64_to_cpu(b->hi), &c2, &c3);
clmul64(le64_to_cpu(a->lo ^ a->hi), le64_to_cpu(b->lo ^ b->hi),
&mi0, &mi1);
mi0 ^= c0 ^ c2;
mi1 ^= c1 ^ c3;
c1 ^= mi0;
c2 ^= mi1;
/*
* Cancel out the low 128 bits of the product by adding multiples of
* G(x) = x^128 + x^127 + x^126 + x^121 + 1. Do this in two steps, each
* of which cancels out 64 bits. Note that we break G(x) into three
* parts: 1, x^64 * (x^63 + x^62 + x^57), and x^128 * 1.
*/
/*
* First, add G(x) times c0 as follows:
*
* (c0, c1, c2) = (0,
* c1 + (c0 * (x^63 + x^62 + x^57) mod x^64),
* c2 + c0 + floor((c0 * (x^63 + x^62 + x^57)) / x^64))
*/
c1 ^= (c0 << 63) ^ (c0 << 62) ^ (c0 << 57);
c2 ^= c0 ^ (c0 >> 1) ^ (c0 >> 2) ^ (c0 >> 7);
/*
* Second, add G(x) times the new c1:
*
* (c1, c2, c3) = (0,
* c2 + (c1 * (x^63 + x^62 + x^57) mod x^64),
* c3 + c1 + floor((c1 * (x^63 + x^62 + x^57)) / x^64))
*/
c2 ^= (c1 << 63) ^ (c1 << 62) ^ (c1 << 57);
c3 ^= c1 ^ (c1 >> 1) ^ (c1 >> 2) ^ (c1 >> 7);
/* Return (c2, c3). This implicitly multiplies by x^-128. */
a->lo = cpu_to_le64(c2);
a->hi = cpu_to_le64(c3);
}
static void __maybe_unused
polyval_blocks_generic(struct polyval_elem *acc, const struct polyval_elem *key,
const u8 *data, size_t nblocks)
{
do {
acc->lo ^= get_unaligned((__le64 *)data);
acc->hi ^= get_unaligned((__le64 *)(data + 8));
polyval_mul_generic(acc, key);
data += POLYVAL_BLOCK_SIZE;
} while (--nblocks);
}
/* Include the arch-optimized implementation of POLYVAL, if one is available. */
#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
#include "polyval.h" /* $(SRCARCH)/polyval.h */
void polyval_preparekey(struct polyval_key *key,
const u8 raw_key[POLYVAL_BLOCK_SIZE])
{
polyval_preparekey_arch(key, raw_key);
}
EXPORT_SYMBOL_GPL(polyval_preparekey);
#endif /* Else, polyval_preparekey() is an inline function. */
/*
* polyval_mul_generic() and polyval_blocks_generic() take the key as a
* polyval_elem rather than a polyval_key, so that arch-optimized
* implementations with a different key format can use it as a fallback (if they
* have H^1 stored somewhere in their struct). Thus, the following dispatch
* code is needed to pass the appropriate key argument.
*/
static void polyval_mul(struct polyval_ctx *ctx)
{
#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
polyval_mul_arch(&ctx->acc, ctx->key);
#else
polyval_mul_generic(&ctx->acc, &ctx->key->h);
#endif
}
static void polyval_blocks(struct polyval_ctx *ctx,
const u8 *data, size_t nblocks)
{
#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH
polyval_blocks_arch(&ctx->acc, ctx->key, data, nblocks);
#else
polyval_blocks_generic(&ctx->acc, &ctx->key->h, data, nblocks);
#endif
}
void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len)
{
if (unlikely(ctx->partial)) {
size_t n = min(len, POLYVAL_BLOCK_SIZE - ctx->partial);
len -= n;
while (n--)
ctx->acc.bytes[ctx->partial++] ^= *data++;
if (ctx->partial < POLYVAL_BLOCK_SIZE)
return;
polyval_mul(ctx);
}
if (len >= POLYVAL_BLOCK_SIZE) {
size_t nblocks = len / POLYVAL_BLOCK_SIZE;
polyval_blocks(ctx, data, nblocks);
data += len & ~(POLYVAL_BLOCK_SIZE - 1);
len &= POLYVAL_BLOCK_SIZE - 1;
}
for (size_t i = 0; i < len; i++)
ctx->acc.bytes[i] ^= data[i];
ctx->partial = len;
}
EXPORT_SYMBOL_GPL(polyval_update);
void polyval_final(struct polyval_ctx *ctx, u8 out[POLYVAL_BLOCK_SIZE])
{
if (unlikely(ctx->partial))
polyval_mul(ctx);
memcpy(out, &ctx->acc, POLYVAL_BLOCK_SIZE);
memzero_explicit(ctx, sizeof(*ctx));
}
EXPORT_SYMBOL_GPL(polyval_final);
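A note on the design above: polyval_update() XORs leftover bytes straight into the accumulator instead of keeping a separate buffer, and the multiply is deferred until the block fills up or polyval_final() runs. A minimal usage sketch of the library API follows; polyval_init() is assumed here to bind the prepared key to the context with a zeroed accumulator (that helper name is an assumption, only preparekey/update/final appear above).

static void polyval_digest_sketch(const u8 raw_key[POLYVAL_BLOCK_SIZE],
				  const u8 *data, size_t len,
				  u8 out[POLYVAL_BLOCK_SIZE])
{
	struct polyval_key key;
	struct polyval_ctx ctx;

	polyval_preparekey(&key, raw_key);
	polyval_init(&ctx, &key);	/* assumed helper, not shown above */
	polyval_update(&ctx, data, len);
	polyval_final(&ctx, out);	/* also zeroizes the context */
}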
#ifdef polyval_mod_init_arch
static int __init polyval_mod_init(void)
{
polyval_mod_init_arch();
return 0;
}
subsys_initcall(polyval_mod_init);
static void __exit polyval_mod_exit(void)
{
}
module_exit(polyval_mod_exit);
#endif
MODULE_DESCRIPTION("POLYVAL almost-XOR-universal hash function");
MODULE_LICENSE("GPL");

lib/crypto/s390/sha3.h (new file, 151 lines)


@@ -0,0 +1,151 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* SHA-3 optimized using the CP Assist for Cryptographic Functions (CPACF)
*
* Copyright 2025 Google LLC
*/
#include <asm/cpacf.h>
#include <linux/cpufeature.h>
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3_init_optim);
static void sha3_absorb_blocks(struct sha3_state *state, const u8 *data,
size_t nblocks, size_t block_size)
{
if (static_branch_likely(&have_sha3)) {
/*
* Note that KIMD assumes little-endian order of the state
* words. sha3_state already uses that order, though, so
* there's no need for a byteswap.
*/
switch (block_size) {
case SHA3_224_BLOCK_SIZE:
cpacf_kimd(CPACF_KIMD_SHA3_224, state,
data, nblocks * block_size);
return;
case SHA3_256_BLOCK_SIZE:
/*
* This case handles both SHA3-256 and SHAKE256, since
* they have the same block size.
*/
cpacf_kimd(CPACF_KIMD_SHA3_256, state,
data, nblocks * block_size);
return;
case SHA3_384_BLOCK_SIZE:
cpacf_kimd(CPACF_KIMD_SHA3_384, state,
data, nblocks * block_size);
return;
case SHA3_512_BLOCK_SIZE:
cpacf_kimd(CPACF_KIMD_SHA3_512, state,
data, nblocks * block_size);
return;
}
}
sha3_absorb_blocks_generic(state, data, nblocks, block_size);
}
static void sha3_keccakf(struct sha3_state *state)
{
if (static_branch_likely(&have_sha3)) {
/*
* Passing zeroes into any of CPACF_KIMD_SHA3_* gives the plain
* Keccak-f permutation, which is what we want here. Use
* SHA3-512 since it has the smallest block size.
*/
static const u8 zeroes[SHA3_512_BLOCK_SIZE];
cpacf_kimd(CPACF_KIMD_SHA3_512, state, zeroes, sizeof(zeroes));
} else {
sha3_keccakf_generic(state);
}
}
static inline bool s390_sha3(int func, const u8 *in, size_t in_len,
u8 *out, size_t out_len)
{
struct sha3_state state;
if (!static_branch_likely(&have_sha3))
return false;
if (static_branch_likely(&have_sha3_init_optim))
func |= CPACF_KLMD_NIP | CPACF_KLMD_DUFOP;
else
memset(&state, 0, sizeof(state));
cpacf_klmd(func, &state, in, in_len);
if (static_branch_likely(&have_sha3_init_optim))
kmsan_unpoison_memory(&state, out_len);
memcpy(out, &state, out_len);
memzero_explicit(&state, sizeof(state));
return true;
}
#define sha3_224_arch sha3_224_arch
static bool sha3_224_arch(const u8 *in, size_t in_len,
u8 out[SHA3_224_DIGEST_SIZE])
{
return s390_sha3(CPACF_KLMD_SHA3_224, in, in_len,
out, SHA3_224_DIGEST_SIZE);
}
#define sha3_256_arch sha3_256_arch
static bool sha3_256_arch(const u8 *in, size_t in_len,
u8 out[SHA3_256_DIGEST_SIZE])
{
return s390_sha3(CPACF_KLMD_SHA3_256, in, in_len,
out, SHA3_256_DIGEST_SIZE);
}
#define sha3_384_arch sha3_384_arch
static bool sha3_384_arch(const u8 *in, size_t in_len,
u8 out[SHA3_384_DIGEST_SIZE])
{
return s390_sha3(CPACF_KLMD_SHA3_384, in, in_len,
out, SHA3_384_DIGEST_SIZE);
}
#define sha3_512_arch sha3_512_arch
static bool sha3_512_arch(const u8 *in, size_t in_len,
u8 out[SHA3_512_DIGEST_SIZE])
{
return s390_sha3(CPACF_KLMD_SHA3_512, in, in_len,
out, SHA3_512_DIGEST_SIZE);
}
#define sha3_mod_init_arch sha3_mod_init_arch
static void sha3_mod_init_arch(void)
{
int num_present = 0;
int num_possible = 0;
if (!cpu_have_feature(S390_CPU_FEATURE_MSA))
return;
/*
* Since all the SHA-3 functions are in Message-Security-Assist
* Extension 6, just treat them as all or nothing. This way we need
* only one static_key.
*/
#define QUERY(opcode, func) \
({ num_present += !!cpacf_query_func(opcode, func); num_possible++; })
QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_224);
QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_256);
QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_384);
QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_512);
QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_224);
QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_256);
QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_384);
QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_512);
#undef QUERY
if (num_present == num_possible) {
static_branch_enable(&have_sha3);
if (test_facility(86))
static_branch_enable(&have_sha3_init_optim);
} else if (num_present != 0) {
pr_warn("Unsupported combination of SHA-3 facilities\n");
}
}


@@ -12,6 +12,7 @@
#include <linux/string.h>
#include <linux/unaligned.h>
#include <linux/wordpart.h>
#include "fips.h"
static const struct sha1_block_state sha1_iv = {
.h = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
@@ -330,10 +331,26 @@ void hmac_sha1_usingrawkey(const u8 *raw_key, size_t raw_key_len,
}
EXPORT_SYMBOL_GPL(hmac_sha1_usingrawkey);
#ifdef sha1_mod_init_arch
#if defined(sha1_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS)
static int __init sha1_mod_init(void)
{
#ifdef sha1_mod_init_arch
sha1_mod_init_arch();
#endif
if (fips_enabled) {
/*
* FIPS cryptographic algorithm self-test. As per the FIPS
* Implementation Guidance, testing HMAC-SHA1 satisfies the test
* requirement for SHA-1 too.
*/
u8 mac[SHA1_DIGEST_SIZE];
hmac_sha1_usingrawkey(fips_test_key, sizeof(fips_test_key),
fips_test_data, sizeof(fips_test_data),
mac);
if (memcmp(fips_test_hmac_sha1_value, mac, sizeof(mac)) != 0)
panic("sha1: FIPS self-test failed\n");
}
return 0;
}
subsys_initcall(sha1_mod_init);


@@ -17,6 +17,7 @@
#include <linux/string.h>
#include <linux/unaligned.h>
#include <linux/wordpart.h>
#include "fips.h"
static const struct sha256_block_state sha224_iv = {
.h = {
@@ -269,8 +270,8 @@ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE])
EXPORT_SYMBOL(sha256);
/*
* Pre-boot environment (as indicated by __DISABLE_EXPORTS being defined)
* doesn't need either HMAC support or interleaved hashing support
* Pre-boot environments (as indicated by __DISABLE_EXPORTS being defined) just
* need the generic SHA-256 code. Omit all other features from them.
*/
#ifndef __DISABLE_EXPORTS
@@ -477,12 +478,27 @@ void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len,
hmac_sha256_final(&ctx, out);
}
EXPORT_SYMBOL_GPL(hmac_sha256_usingrawkey);
#endif /* !__DISABLE_EXPORTS */
#ifdef sha256_mod_init_arch
#if defined(sha256_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS)
static int __init sha256_mod_init(void)
{
#ifdef sha256_mod_init_arch
sha256_mod_init_arch();
#endif
if (fips_enabled) {
/*
* FIPS cryptographic algorithm self-test. As per the FIPS
* Implementation Guidance, testing HMAC-SHA256 satisfies the
* test requirement for SHA-224, SHA-256, and HMAC-SHA224 too.
*/
u8 mac[SHA256_DIGEST_SIZE];
hmac_sha256_usingrawkey(fips_test_key, sizeof(fips_test_key),
fips_test_data, sizeof(fips_test_data),
mac);
if (memcmp(fips_test_hmac_sha256_value, mac, sizeof(mac)) != 0)
panic("sha256: FIPS self-test failed\n");
}
return 0;
}
subsys_initcall(sha256_mod_init);
@@ -493,5 +509,7 @@ static void __exit sha256_mod_exit(void)
module_exit(sha256_mod_exit);
#endif
#endif /* !__DISABLE_EXPORTS */
MODULE_DESCRIPTION("SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions");
MODULE_LICENSE("GPL");

lib/crypto/sha3.c (new file, 411 lines)

@@ -0,0 +1,411 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* SHA-3, as specified in
* https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
*
* SHA-3 code by Jeff Garzik <jeff@garzik.org>
* Ard Biesheuvel <ard.biesheuvel@linaro.org>
* David Howells <dhowells@redhat.com>
*
* See also Documentation/crypto/sha3.rst
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <crypto/sha3.h>
#include <crypto/utils.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/unaligned.h>
#include "fips.h"
/*
* On some 32-bit architectures, such as h8300, GCC ends up using over 1 KB of
* stack if the round calculation gets inlined into the loop in
* sha3_keccakf_generic(). On the other hand, on 64-bit architectures with
* plenty of [64-bit wide] general purpose registers, not inlining it severely
* hurts performance. So let's use 64-bitness as a heuristic to decide whether
* to inline or not.
*/
#ifdef CONFIG_64BIT
#define SHA3_INLINE inline
#else
#define SHA3_INLINE noinline
#endif
#define SHA3_KECCAK_ROUNDS 24
static const u64 sha3_keccakf_rndc[SHA3_KECCAK_ROUNDS] = {
0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
};
/*
* Perform a single round of Keccak mixing.
*/
static SHA3_INLINE void sha3_keccakf_one_round_generic(u64 st[25], int round)
{
u64 t[5], tt, bc[5];
/* Theta */
bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20];
bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21];
bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22];
bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23];
bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24];
t[0] = bc[4] ^ rol64(bc[1], 1);
t[1] = bc[0] ^ rol64(bc[2], 1);
t[2] = bc[1] ^ rol64(bc[3], 1);
t[3] = bc[2] ^ rol64(bc[4], 1);
t[4] = bc[3] ^ rol64(bc[0], 1);
st[0] ^= t[0];
/* Rho Pi */
tt = st[1];
st[ 1] = rol64(st[ 6] ^ t[1], 44);
st[ 6] = rol64(st[ 9] ^ t[4], 20);
st[ 9] = rol64(st[22] ^ t[2], 61);
st[22] = rol64(st[14] ^ t[4], 39);
st[14] = rol64(st[20] ^ t[0], 18);
st[20] = rol64(st[ 2] ^ t[2], 62);
st[ 2] = rol64(st[12] ^ t[2], 43);
st[12] = rol64(st[13] ^ t[3], 25);
st[13] = rol64(st[19] ^ t[4], 8);
st[19] = rol64(st[23] ^ t[3], 56);
st[23] = rol64(st[15] ^ t[0], 41);
st[15] = rol64(st[ 4] ^ t[4], 27);
st[ 4] = rol64(st[24] ^ t[4], 14);
st[24] = rol64(st[21] ^ t[1], 2);
st[21] = rol64(st[ 8] ^ t[3], 55);
st[ 8] = rol64(st[16] ^ t[1], 45);
st[16] = rol64(st[ 5] ^ t[0], 36);
st[ 5] = rol64(st[ 3] ^ t[3], 28);
st[ 3] = rol64(st[18] ^ t[3], 21);
st[18] = rol64(st[17] ^ t[2], 15);
st[17] = rol64(st[11] ^ t[1], 10);
st[11] = rol64(st[ 7] ^ t[2], 6);
st[ 7] = rol64(st[10] ^ t[0], 3);
st[10] = rol64( tt ^ t[1], 1);
/* Chi */
bc[ 0] = ~st[ 1] & st[ 2];
bc[ 1] = ~st[ 2] & st[ 3];
bc[ 2] = ~st[ 3] & st[ 4];
bc[ 3] = ~st[ 4] & st[ 0];
bc[ 4] = ~st[ 0] & st[ 1];
st[ 0] ^= bc[ 0];
st[ 1] ^= bc[ 1];
st[ 2] ^= bc[ 2];
st[ 3] ^= bc[ 3];
st[ 4] ^= bc[ 4];
bc[ 0] = ~st[ 6] & st[ 7];
bc[ 1] = ~st[ 7] & st[ 8];
bc[ 2] = ~st[ 8] & st[ 9];
bc[ 3] = ~st[ 9] & st[ 5];
bc[ 4] = ~st[ 5] & st[ 6];
st[ 5] ^= bc[ 0];
st[ 6] ^= bc[ 1];
st[ 7] ^= bc[ 2];
st[ 8] ^= bc[ 3];
st[ 9] ^= bc[ 4];
bc[ 0] = ~st[11] & st[12];
bc[ 1] = ~st[12] & st[13];
bc[ 2] = ~st[13] & st[14];
bc[ 3] = ~st[14] & st[10];
bc[ 4] = ~st[10] & st[11];
st[10] ^= bc[ 0];
st[11] ^= bc[ 1];
st[12] ^= bc[ 2];
st[13] ^= bc[ 3];
st[14] ^= bc[ 4];
bc[ 0] = ~st[16] & st[17];
bc[ 1] = ~st[17] & st[18];
bc[ 2] = ~st[18] & st[19];
bc[ 3] = ~st[19] & st[15];
bc[ 4] = ~st[15] & st[16];
st[15] ^= bc[ 0];
st[16] ^= bc[ 1];
st[17] ^= bc[ 2];
st[18] ^= bc[ 3];
st[19] ^= bc[ 4];
bc[ 0] = ~st[21] & st[22];
bc[ 1] = ~st[22] & st[23];
bc[ 2] = ~st[23] & st[24];
bc[ 3] = ~st[24] & st[20];
bc[ 4] = ~st[20] & st[21];
st[20] ^= bc[ 0];
st[21] ^= bc[ 1];
st[22] ^= bc[ 2];
st[23] ^= bc[ 3];
st[24] ^= bc[ 4];
/* Iota */
st[0] ^= sha3_keccakf_rndc[round];
}
/* Generic implementation of the Keccak-f[1600] permutation */
static void sha3_keccakf_generic(struct sha3_state *state)
{
/*
* Temporarily convert the state words from little-endian to native-
* endian so that they can be operated on. Note that on little-endian
* machines this conversion is a no-op and is optimized out.
*/
for (int i = 0; i < ARRAY_SIZE(state->words); i++)
state->native_words[i] = le64_to_cpu(state->words[i]);
for (int round = 0; round < SHA3_KECCAK_ROUNDS; round++)
sha3_keccakf_one_round_generic(state->native_words, round);
for (int i = 0; i < ARRAY_SIZE(state->words); i++)
state->words[i] = cpu_to_le64(state->native_words[i]);
}
/*
* Generic implementation of absorbing the given nonzero number of full blocks
* into the sponge function Keccak[r=8*block_size, c=1600-8*block_size].
*/
static void __maybe_unused
sha3_absorb_blocks_generic(struct sha3_state *state, const u8 *data,
size_t nblocks, size_t block_size)
{
do {
for (size_t i = 0; i < block_size; i += 8)
state->words[i / 8] ^= get_unaligned((__le64 *)&data[i]);
sha3_keccakf_generic(state);
data += block_size;
} while (--nblocks);
}
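As a concrete example of the parametrization: SHA3-256 and SHAKE256 both use a 136-byte block, i.e. the sponge Keccak[r=1088, c=512], which is why the s390 code earlier can serve both with the same CPACF function, while SHA3-512 uses a 72-byte block (r=576, c=1024).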
#ifdef CONFIG_CRYPTO_LIB_SHA3_ARCH
#include "sha3.h" /* $(SRCARCH)/sha3.h */
#else
#define sha3_keccakf sha3_keccakf_generic
#define sha3_absorb_blocks sha3_absorb_blocks_generic
#endif
void __sha3_update(struct __sha3_ctx *ctx, const u8 *in, size_t in_len)
{
const size_t block_size = ctx->block_size;
size_t absorb_offset = ctx->absorb_offset;
/* Warn if squeezing has already begun. */
WARN_ON_ONCE(absorb_offset >= block_size);
if (absorb_offset && absorb_offset + in_len >= block_size) {
crypto_xor(&ctx->state.bytes[absorb_offset], in,
block_size - absorb_offset);
in += block_size - absorb_offset;
in_len -= block_size - absorb_offset;
sha3_keccakf(&ctx->state);
absorb_offset = 0;
}
if (in_len >= block_size) {
size_t nblocks = in_len / block_size;
sha3_absorb_blocks(&ctx->state, in, nblocks, block_size);
in += nblocks * block_size;
in_len -= nblocks * block_size;
}
if (in_len) {
crypto_xor(&ctx->state.bytes[absorb_offset], in, in_len);
absorb_offset += in_len;
}
ctx->absorb_offset = absorb_offset;
}
EXPORT_SYMBOL_GPL(__sha3_update);
void sha3_final(struct sha3_ctx *sha3_ctx, u8 *out)
{
struct __sha3_ctx *ctx = &sha3_ctx->ctx;
ctx->state.bytes[ctx->absorb_offset] ^= 0x06;
ctx->state.bytes[ctx->block_size - 1] ^= 0x80;
sha3_keccakf(&ctx->state);
memcpy(out, ctx->state.bytes, ctx->digest_size);
sha3_zeroize_ctx(sha3_ctx);
}
EXPORT_SYMBOL_GPL(sha3_final);
void shake_squeeze(struct shake_ctx *shake_ctx, u8 *out, size_t out_len)
{
struct __sha3_ctx *ctx = &shake_ctx->ctx;
const size_t block_size = ctx->block_size;
size_t squeeze_offset = ctx->squeeze_offset;
if (ctx->absorb_offset < block_size) {
/* First squeeze: */
/* Add the domain separation suffix and padding. */
ctx->state.bytes[ctx->absorb_offset] ^= 0x1f;
ctx->state.bytes[block_size - 1] ^= 0x80;
/* Indicate that squeezing has begun. */
ctx->absorb_offset = block_size;
/*
* Indicate that no output is pending yet, i.e. sha3_keccakf()
* will need to be called before the first copy.
*/
squeeze_offset = block_size;
}
while (out_len) {
if (squeeze_offset == block_size) {
sha3_keccakf(&ctx->state);
squeeze_offset = 0;
}
size_t copy = min(out_len, block_size - squeeze_offset);
memcpy(out, &ctx->state.bytes[squeeze_offset], copy);
out += copy;
out_len -= copy;
squeeze_offset += copy;
}
ctx->squeeze_offset = squeeze_offset;
}
EXPORT_SYMBOL_GPL(shake_squeeze);
#ifndef sha3_224_arch
static inline bool sha3_224_arch(const u8 *in, size_t in_len,
u8 out[SHA3_224_DIGEST_SIZE])
{
return false;
}
#endif
#ifndef sha3_256_arch
static inline bool sha3_256_arch(const u8 *in, size_t in_len,
u8 out[SHA3_256_DIGEST_SIZE])
{
return false;
}
#endif
#ifndef sha3_384_arch
static inline bool sha3_384_arch(const u8 *in, size_t in_len,
u8 out[SHA3_384_DIGEST_SIZE])
{
return false;
}
#endif
#ifndef sha3_512_arch
static inline bool sha3_512_arch(const u8 *in, size_t in_len,
u8 out[SHA3_512_DIGEST_SIZE])
{
return false;
}
#endif
void sha3_224(const u8 *in, size_t in_len, u8 out[SHA3_224_DIGEST_SIZE])
{
struct sha3_ctx ctx;
if (sha3_224_arch(in, in_len, out))
return;
sha3_224_init(&ctx);
sha3_update(&ctx, in, in_len);
sha3_final(&ctx, out);
}
EXPORT_SYMBOL_GPL(sha3_224);
void sha3_256(const u8 *in, size_t in_len, u8 out[SHA3_256_DIGEST_SIZE])
{
struct sha3_ctx ctx;
if (sha3_256_arch(in, in_len, out))
return;
sha3_256_init(&ctx);
sha3_update(&ctx, in, in_len);
sha3_final(&ctx, out);
}
EXPORT_SYMBOL_GPL(sha3_256);
void sha3_384(const u8 *in, size_t in_len, u8 out[SHA3_384_DIGEST_SIZE])
{
struct sha3_ctx ctx;
if (sha3_384_arch(in, in_len, out))
return;
sha3_384_init(&ctx);
sha3_update(&ctx, in, in_len);
sha3_final(&ctx, out);
}
EXPORT_SYMBOL_GPL(sha3_384);
void sha3_512(const u8 *in, size_t in_len, u8 out[SHA3_512_DIGEST_SIZE])
{
struct sha3_ctx ctx;
if (sha3_512_arch(in, in_len, out))
return;
sha3_512_init(&ctx);
sha3_update(&ctx, in, in_len);
sha3_final(&ctx, out);
}
EXPORT_SYMBOL_GPL(sha3_512);
void shake128(const u8 *in, size_t in_len, u8 *out, size_t out_len)
{
struct shake_ctx ctx;
shake128_init(&ctx);
shake_update(&ctx, in, in_len);
shake_squeeze(&ctx, out, out_len);
shake_zeroize_ctx(&ctx);
}
EXPORT_SYMBOL_GPL(shake128);
void shake256(const u8 *in, size_t in_len, u8 *out, size_t out_len)
{
struct shake_ctx ctx;
shake256_init(&ctx);
shake_update(&ctx, in, in_len);
shake_squeeze(&ctx, out, out_len);
shake_zeroize_ctx(&ctx);
}
EXPORT_SYMBOL_GPL(shake256);
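Usage sketch for the XOF API (illustrative, not part of this patch), using only functions shown above. shake_squeeze() may be called repeatedly to continue the output stream; calling shake_update() after squeezing has begun trips the WARN_ON_ONCE() in __sha3_update().

static void shake128_usage_sketch(const u8 *msg, size_t msg_len,
				  u8 *out1, size_t out1_len,
				  u8 *out2, size_t out2_len)
{
	struct shake_ctx ctx;

	shake128_init(&ctx);
	shake_update(&ctx, msg, msg_len);
	shake_squeeze(&ctx, out1, out1_len);
	shake_squeeze(&ctx, out2, out2_len);	/* continues where out1 ended */
	shake_zeroize_ctx(&ctx);
}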
#if defined(sha3_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS)
static int __init sha3_mod_init(void)
{
#ifdef sha3_mod_init_arch
sha3_mod_init_arch();
#endif
if (fips_enabled) {
/*
* FIPS cryptographic algorithm self-test. As per the FIPS
* Implementation Guidance, testing any SHA-3 algorithm
* satisfies the test requirement for all of them.
*/
u8 hash[SHA3_256_DIGEST_SIZE];
sha3_256(fips_test_data, sizeof(fips_test_data), hash);
if (memcmp(fips_test_sha3_256_value, hash, sizeof(hash)) != 0)
panic("sha3: FIPS self-test failed\n");
}
return 0;
}
subsys_initcall(sha3_mod_init);
static void __exit sha3_mod_exit(void)
{
}
module_exit(sha3_mod_exit);
#endif
MODULE_DESCRIPTION("SHA-3 library functions");
MODULE_LICENSE("GPL");


@@ -17,6 +17,7 @@
#include <linux/string.h>
#include <linux/unaligned.h>
#include <linux/wordpart.h>
#include "fips.h"
static const struct sha512_block_state sha384_iv = {
.h = {
@@ -405,10 +406,26 @@ void hmac_sha512_usingrawkey(const u8 *raw_key, size_t raw_key_len,
}
EXPORT_SYMBOL_GPL(hmac_sha512_usingrawkey);
#ifdef sha512_mod_init_arch
#if defined(sha512_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS)
static int __init sha512_mod_init(void)
{
#ifdef sha512_mod_init_arch
sha512_mod_init_arch();
#endif
if (fips_enabled) {
/*
* FIPS cryptographic algorithm self-test. As per the FIPS
* Implementation Guidance, testing HMAC-SHA512 satisfies the
* test requirement for SHA-384, SHA-512, and HMAC-SHA384 too.
*/
u8 mac[SHA512_DIGEST_SIZE];
hmac_sha512_usingrawkey(fips_test_key, sizeof(fips_test_key),
fips_test_data, sizeof(fips_test_data),
mac);
if (memcmp(fips_test_hmac_sha512_value, mac, sizeof(mac)) != 0)
panic("sha512: FIPS self-test failed\n");
}
return 0;
}
subsys_initcall(sha512_mod_init);


@@ -14,12 +14,12 @@
static void blake2s_default(const u8 *data, size_t len,
u8 out[BLAKE2S_HASH_SIZE])
{
blake2s(out, data, NULL, BLAKE2S_HASH_SIZE, len, 0);
blake2s(NULL, 0, data, len, out, BLAKE2S_HASH_SIZE);
}
static void blake2s_init_default(struct blake2s_state *state)
static void blake2s_init_default(struct blake2s_ctx *ctx)
{
blake2s_init(state, BLAKE2S_HASH_SIZE);
blake2s_init(ctx, BLAKE2S_HASH_SIZE);
}
/*
@@ -27,7 +27,7 @@ static void blake2s_init_default(struct blake2s_state *state)
* with a key length of 0 and a hash length of BLAKE2S_HASH_SIZE.
*/
#define HASH blake2s_default
#define HASH_CTX blake2s_state
#define HASH_CTX blake2s_ctx
#define HASH_SIZE BLAKE2S_HASH_SIZE
#define HASH_INIT blake2s_init_default
#define HASH_UPDATE blake2s_update
@@ -44,19 +44,19 @@ static void test_blake2s_all_key_and_hash_lens(struct kunit *test)
u8 *data = &test_buf[0];
u8 *key = data + data_len;
u8 *hash = key + BLAKE2S_KEY_SIZE;
struct blake2s_state main_state;
struct blake2s_ctx main_ctx;
u8 main_hash[BLAKE2S_HASH_SIZE];
rand_bytes_seeded_from_len(data, data_len);
blake2s_init(&main_state, BLAKE2S_HASH_SIZE);
blake2s_init(&main_ctx, BLAKE2S_HASH_SIZE);
for (int key_len = 0; key_len <= BLAKE2S_KEY_SIZE; key_len++) {
rand_bytes_seeded_from_len(key, key_len);
for (int out_len = 1; out_len <= BLAKE2S_HASH_SIZE; out_len++) {
blake2s(hash, data, key, out_len, data_len, key_len);
blake2s_update(&main_state, hash, out_len);
blake2s(key, key_len, data, data_len, hash, out_len);
blake2s_update(&main_ctx, hash, out_len);
}
}
blake2s_final(&main_state, main_hash);
blake2s_final(&main_ctx, main_hash);
KUNIT_ASSERT_MEMEQ(test, main_hash, blake2s_keyed_testvec_consolidated,
BLAKE2S_HASH_SIZE);
}
@@ -75,21 +75,20 @@ static void test_blake2s_with_guarded_key_buf(struct kunit *test)
u8 *guarded_key = &test_buf[TEST_BUF_LEN - key_len];
u8 hash1[BLAKE2S_HASH_SIZE];
u8 hash2[BLAKE2S_HASH_SIZE];
struct blake2s_state state;
struct blake2s_ctx ctx;
rand_bytes(key, key_len);
memcpy(guarded_key, key, key_len);
blake2s(hash1, test_buf, key,
BLAKE2S_HASH_SIZE, data_len, key_len);
blake2s(hash2, test_buf, guarded_key,
BLAKE2S_HASH_SIZE, data_len, key_len);
blake2s(key, key_len, test_buf, data_len,
hash1, BLAKE2S_HASH_SIZE);
blake2s(guarded_key, key_len, test_buf, data_len,
hash2, BLAKE2S_HASH_SIZE);
KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE);
blake2s_init_key(&state, BLAKE2S_HASH_SIZE,
guarded_key, key_len);
blake2s_update(&state, test_buf, data_len);
blake2s_final(&state, hash2);
blake2s_init_key(&ctx, BLAKE2S_HASH_SIZE, guarded_key, key_len);
blake2s_update(&ctx, test_buf, data_len);
blake2s_final(&ctx, hash2);
KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE);
}
}
@@ -107,8 +106,8 @@ static void test_blake2s_with_guarded_out_buf(struct kunit *test)
u8 hash[BLAKE2S_HASH_SIZE];
u8 *guarded_hash = &test_buf[TEST_BUF_LEN - out_len];
blake2s(hash, test_buf, NULL, out_len, data_len, 0);
blake2s(guarded_hash, test_buf, NULL, out_len, data_len, 0);
blake2s(NULL, 0, test_buf, data_len, hash, out_len);
blake2s(NULL, 0, test_buf, data_len, guarded_hash, out_len);
KUNIT_ASSERT_MEMEQ(test, hash, guarded_hash, out_len);
}
}


@@ -6,19 +6,25 @@
#include <linux/linkage.h>
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.section .rodata.cst32.iv, "aM", @progbits, 32
.align 32
IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667
.Liv:
.octa 0xA54FF53A3C6EF372BB67AE856A09E667
.octa 0x5BE0CD191F83D9AB9B05688C510E527F
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.section .rodata.cst16.ror16, "aM", @progbits, 16
.align 16
ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.Lror16:
.octa 0x0D0C0F0E09080B0A0504070601000302
.section .rodata.cst16.ror8, "aM", @progbits, 16
.align 16
ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.Lror8:
.octa 0x0C0F0E0D080B0A090407060500030201
.section .rodata.cst64.sigma, "aM", @progbits, 160
.align 64
SIGMA:
.Lsigma:
.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
@@ -29,9 +35,10 @@ SIGMA:
.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 160
.section .rodata.cst64.sigma2, "aM", @progbits, 160
.align 64
SIGMA2:
.Lsigma2:
.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.byte 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
.byte 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
@@ -43,36 +50,52 @@ SIGMA2:
.byte 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
.byte 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
#define CTX %rdi
#define DATA %rsi
#define NBLOCKS %rdx
#define INC %ecx
.text
//
// void blake2s_compress_ssse3(struct blake2s_ctx *ctx,
// const u8 *data, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2s_ctx are used:
// u32 h[8]; (inout)
// u32 t[2]; (inout)
// u32 f[2]; (in)
//
SYM_FUNC_START(blake2s_compress_ssse3)
testq %rdx,%rdx
je .Lendofloop
movdqu (%rdi),%xmm0
movdqu 0x10(%rdi),%xmm1
movdqa ROT16(%rip),%xmm12
movdqa ROR328(%rip),%xmm13
movdqu 0x20(%rdi),%xmm14
movq %rcx,%xmm15
leaq SIGMA+0xa0(%rip),%r8
jmp .Lbeginofloop
movdqu (CTX),%xmm0 // Load h[0..3]
movdqu 16(CTX),%xmm1 // Load h[4..7]
movdqa .Lror16(%rip),%xmm12
movdqa .Lror8(%rip),%xmm13
movdqu 32(CTX),%xmm14 // Load t and f
movd INC,%xmm15 // Load inc
leaq .Lsigma+160(%rip),%r8
jmp .Lssse3_mainloop
.align 32
.Lbeginofloop:
movdqa %xmm0,%xmm10
movdqa %xmm1,%xmm11
paddq %xmm15,%xmm14
movdqa IV(%rip),%xmm2
.Lssse3_mainloop:
// Main loop: each iteration processes one 64-byte block.
movdqa %xmm0,%xmm10 // Save h[0..3] and let v[0..3] = h[0..3]
movdqa %xmm1,%xmm11 // Save h[4..7] and let v[4..7] = h[4..7]
paddq %xmm15,%xmm14 // t += inc (64-bit addition)
movdqa .Liv(%rip),%xmm2 // v[8..11] = iv[0..3]
movdqa %xmm14,%xmm3
pxor IV+0x10(%rip),%xmm3
leaq SIGMA(%rip),%rcx
.Lroundloop:
pxor .Liv+16(%rip),%xmm3 // v[12..15] = iv[4..7] ^ [t, f]
leaq .Lsigma(%rip),%rcx
.Lssse3_roundloop:
// Round loop: each iteration does 1 round (of 10 rounds total).
movzbl (%rcx),%eax
movd (%rsi,%rax,4),%xmm4
movzbl 0x1(%rcx),%eax
movd (%rsi,%rax,4),%xmm5
movzbl 0x2(%rcx),%eax
movd (%rsi,%rax,4),%xmm6
movzbl 0x3(%rcx),%eax
movd (%rsi,%rax,4),%xmm7
movd (DATA,%rax,4),%xmm4
movzbl 1(%rcx),%eax
movd (DATA,%rax,4),%xmm5
movzbl 2(%rcx),%eax
movd (DATA,%rax,4),%xmm6
movzbl 3(%rcx),%eax
movd (DATA,%rax,4),%xmm7
punpckldq %xmm5,%xmm4
punpckldq %xmm7,%xmm6
punpcklqdq %xmm6,%xmm4
@@ -83,17 +106,17 @@ SYM_FUNC_START(blake2s_compress_ssse3)
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
psrld $0xc,%xmm1
pslld $0x14,%xmm8
psrld $12,%xmm1
pslld $20,%xmm8
por %xmm8,%xmm1
movzbl 0x4(%rcx),%eax
movd (%rsi,%rax,4),%xmm5
movzbl 0x5(%rcx),%eax
movd (%rsi,%rax,4),%xmm6
movzbl 0x6(%rcx),%eax
movd (%rsi,%rax,4),%xmm7
movzbl 0x7(%rcx),%eax
movd (%rsi,%rax,4),%xmm4
movzbl 4(%rcx),%eax
movd (DATA,%rax,4),%xmm5
movzbl 5(%rcx),%eax
movd (DATA,%rax,4),%xmm6
movzbl 6(%rcx),%eax
movd (DATA,%rax,4),%xmm7
movzbl 7(%rcx),%eax
movd (DATA,%rax,4),%xmm4
punpckldq %xmm6,%xmm5
punpckldq %xmm4,%xmm7
punpcklqdq %xmm7,%xmm5
@@ -104,20 +127,20 @@ SYM_FUNC_START(blake2s_compress_ssse3)
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
psrld $0x7,%xmm1
pslld $0x19,%xmm8
psrld $7,%xmm1
pslld $25,%xmm8
por %xmm8,%xmm1
pshufd $0x93,%xmm0,%xmm0
pshufd $0x4e,%xmm3,%xmm3
pshufd $0x39,%xmm2,%xmm2
movzbl 0x8(%rcx),%eax
movd (%rsi,%rax,4),%xmm6
movzbl 0x9(%rcx),%eax
movd (%rsi,%rax,4),%xmm7
movzbl 0xa(%rcx),%eax
movd (%rsi,%rax,4),%xmm4
movzbl 0xb(%rcx),%eax
movd (%rsi,%rax,4),%xmm5
movzbl 8(%rcx),%eax
movd (DATA,%rax,4),%xmm6
movzbl 9(%rcx),%eax
movd (DATA,%rax,4),%xmm7
movzbl 10(%rcx),%eax
movd (DATA,%rax,4),%xmm4
movzbl 11(%rcx),%eax
movd (DATA,%rax,4),%xmm5
punpckldq %xmm7,%xmm6
punpckldq %xmm5,%xmm4
punpcklqdq %xmm4,%xmm6
@@ -128,17 +151,17 @@ SYM_FUNC_START(blake2s_compress_ssse3)
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
psrld $0xc,%xmm1
pslld $0x14,%xmm8
psrld $12,%xmm1
pslld $20,%xmm8
por %xmm8,%xmm1
movzbl 0xc(%rcx),%eax
movd (%rsi,%rax,4),%xmm7
movzbl 0xd(%rcx),%eax
movd (%rsi,%rax,4),%xmm4
movzbl 0xe(%rcx),%eax
movd (%rsi,%rax,4),%xmm5
movzbl 0xf(%rcx),%eax
movd (%rsi,%rax,4),%xmm6
movzbl 12(%rcx),%eax
movd (DATA,%rax,4),%xmm7
movzbl 13(%rcx),%eax
movd (DATA,%rax,4),%xmm4
movzbl 14(%rcx),%eax
movd (DATA,%rax,4),%xmm5
movzbl 15(%rcx),%eax
movd (DATA,%rax,4),%xmm6
punpckldq %xmm4,%xmm7
punpckldq %xmm6,%xmm5
punpcklqdq %xmm5,%xmm7
@@ -149,53 +172,68 @@ SYM_FUNC_START(blake2s_compress_ssse3)
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
psrld $0x7,%xmm1
pslld $0x19,%xmm8
psrld $7,%xmm1
pslld $25,%xmm8
por %xmm8,%xmm1
pshufd $0x39,%xmm0,%xmm0
pshufd $0x4e,%xmm3,%xmm3
pshufd $0x93,%xmm2,%xmm2
addq $0x10,%rcx
addq $16,%rcx
cmpq %r8,%rcx
jnz .Lroundloop
jnz .Lssse3_roundloop
// Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
pxor %xmm2,%xmm0
pxor %xmm3,%xmm1
pxor %xmm10,%xmm0
pxor %xmm11,%xmm1
addq $0x40,%rsi
decq %rdx
jnz .Lbeginofloop
movdqu %xmm0,(%rdi)
movdqu %xmm1,0x10(%rdi)
movdqu %xmm14,0x20(%rdi)
.Lendofloop:
addq $64,DATA
decq NBLOCKS
jnz .Lssse3_mainloop
movdqu %xmm0,(CTX) // Store new h[0..3]
movdqu %xmm1,16(CTX) // Store new h[4..7]
movq %xmm14,32(CTX) // Store new t (f is unchanged)
RET
SYM_FUNC_END(blake2s_compress_ssse3)
//
// void blake2s_compress_avx512(struct blake2s_ctx *ctx,
// const u8 *data, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2s_ctx are used:
// u32 h[8]; (inout)
// u32 t[2]; (inout)
// u32 f[2]; (in)
//
SYM_FUNC_START(blake2s_compress_avx512)
vmovdqu (%rdi),%xmm0
vmovdqu 0x10(%rdi),%xmm1
vmovdqu 0x20(%rdi),%xmm4
vmovq %rcx,%xmm5
vmovdqa IV(%rip),%xmm14
vmovdqa IV+16(%rip),%xmm15
jmp .Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
vmovdqa %xmm0,%xmm10
vmovdqa %xmm1,%xmm11
vpaddq %xmm5,%xmm4,%xmm4
vmovdqa %xmm14,%xmm2
vpxor %xmm15,%xmm4,%xmm3
vmovdqu (%rsi),%ymm6
vmovdqu 0x20(%rsi),%ymm7
addq $0x40,%rsi
leaq SIGMA2(%rip),%rax
movb $0xa,%cl
.Lblake2s_compress_avx512_roundloop:
vmovdqu (CTX),%xmm0 // Load h[0..3]
vmovdqu 16(CTX),%xmm1 // Load h[4..7]
vmovdqu 32(CTX),%xmm4 // Load t and f
vmovd INC,%xmm5 // Load inc
vmovdqa .Liv(%rip),%xmm14 // Load iv[0..3]
vmovdqa .Liv+16(%rip),%xmm15 // Load iv[4..7]
jmp .Lavx512_mainloop
.align 32
.Lavx512_mainloop:
// Main loop: each iteration processes one 64-byte block.
vmovdqa %xmm0,%xmm10 // Save h[0..3] and let v[0..3] = h[0..3]
vmovdqa %xmm1,%xmm11 // Save h[4..7] and let v[4..7] = h[4..7]
vpaddq %xmm5,%xmm4,%xmm4 // t += inc (64-bit addition)
vmovdqa %xmm14,%xmm2 // v[8..11] = iv[0..3]
vpxor %xmm15,%xmm4,%xmm3 // v[12..15] = iv[4..7] ^ [t, f]
vmovdqu (DATA),%ymm6 // Load first 8 data words
vmovdqu 32(DATA),%ymm7 // Load second 8 data words
addq $64,DATA
leaq .Lsigma2(%rip),%rax
movb $10,%cl // Set num rounds remaining
.Lavx512_roundloop:
// Round loop: each iteration does 1 round (of 10 rounds total).
vpmovzxbd (%rax),%ymm8
vpmovzxbd 0x8(%rax),%ymm9
addq $0x10,%rax
vpmovzxbd 8(%rax),%ymm9
addq $16,%rax
vpermi2d %ymm7,%ymm6,%ymm8
vpermi2d %ymm7,%ymm6,%ymm9
vmovdqa %ymm8,%ymm6
@@ -203,50 +241,51 @@ SYM_FUNC_START(blake2s_compress_avx512)
vpaddd %xmm8,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $0x10,%xmm3,%xmm3
vprord $16,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $0xc,%xmm1,%xmm1
vextracti128 $0x1,%ymm8,%xmm8
vprord $12,%xmm1,%xmm1
vextracti128 $1,%ymm8,%xmm8
vpaddd %xmm8,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $0x8,%xmm3,%xmm3
vprord $8,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $0x7,%xmm1,%xmm1
vprord $7,%xmm1,%xmm1
vpshufd $0x93,%xmm0,%xmm0
vpshufd $0x4e,%xmm3,%xmm3
vpshufd $0x39,%xmm2,%xmm2
vpaddd %xmm9,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $0x10,%xmm3,%xmm3
vprord $16,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $0xc,%xmm1,%xmm1
vextracti128 $0x1,%ymm9,%xmm9
vprord $12,%xmm1,%xmm1
vextracti128 $1,%ymm9,%xmm9
vpaddd %xmm9,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $0x8,%xmm3,%xmm3
vprord $8,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $0x7,%xmm1,%xmm1
vprord $7,%xmm1,%xmm1
vpshufd $0x39,%xmm0,%xmm0
vpshufd $0x4e,%xmm3,%xmm3
vpshufd $0x93,%xmm2,%xmm2
decb %cl
jne .Lblake2s_compress_avx512_roundloop
vpxor %xmm10,%xmm0,%xmm0
vpxor %xmm11,%xmm1,%xmm1
vpxor %xmm2,%xmm0,%xmm0
vpxor %xmm3,%xmm1,%xmm1
decq %rdx
jne .Lblake2s_compress_avx512_mainloop
vmovdqu %xmm0,(%rdi)
vmovdqu %xmm1,0x10(%rdi)
vmovdqu %xmm4,0x20(%rdi)
jne .Lavx512_roundloop
// Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
vpternlogd $0x96,%xmm10,%xmm2,%xmm0
vpternlogd $0x96,%xmm11,%xmm3,%xmm1
decq NBLOCKS
jne .Lavx512_mainloop
vmovdqu %xmm0,(CTX) // Store new h[0..3]
vmovdqu %xmm1,16(CTX) // Store new h[4..7]
vmovq %xmm4,32(CTX) // Store new t (f is unchanged)
vzeroupper
RET
SYM_FUNC_END(blake2s_compress_avx512)
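For reference (illustrative, not part of this patch), the rotation constants 16, 12, 8, and 7 used above correspond to the standard BLAKE2s G function on 32-bit words; the SSSE3 and AVX-512 code evaluate four G's at a time across the vector lanes, with the diagonal steps handled by the pshufd/vpshufd shuffles. The sketch assumes ror32() from <linux/bitops.h>.

static inline void blake2s_g_ref(u32 *a, u32 *b, u32 *c, u32 *d, u32 x, u32 y)
{
	*a += *b + x;  *d = ror32(*d ^ *a, 16);
	*c += *d;      *b = ror32(*b ^ *c, 12);
	*a += *b + y;  *d = ror32(*d ^ *a, 8);
	*c += *d;      *b = ror32(*b ^ *c, 7);
}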


@@ -11,24 +11,22 @@
#include <linux/kernel.h>
#include <linux/sizes.h>
asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
const u8 *block, const size_t nblocks,
const u32 inc);
asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
const u8 *block, const size_t nblocks,
const u32 inc);
asmlinkage void blake2s_compress_ssse3(struct blake2s_ctx *ctx,
const u8 *data, size_t nblocks, u32 inc);
asmlinkage void blake2s_compress_avx512(struct blake2s_ctx *ctx,
const u8 *data, size_t nblocks, u32 inc);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
static void blake2s_compress(struct blake2s_state *state, const u8 *block,
size_t nblocks, const u32 inc)
static void blake2s_compress(struct blake2s_ctx *ctx,
const u8 *data, size_t nblocks, u32 inc)
{
/* SIMD disables preemption, so relax after processing each page. */
BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
blake2s_compress_generic(state, block, nblocks, inc);
blake2s_compress_generic(ctx, data, nblocks, inc);
return;
}
@@ -38,13 +36,13 @@ static void blake2s_compress(struct blake2s_state *state, const u8 *block,
kernel_fpu_begin();
if (static_branch_likely(&blake2s_use_avx512))
blake2s_compress_avx512(state, block, blocks, inc);
blake2s_compress_avx512(ctx, data, blocks, inc);
else
blake2s_compress_ssse3(state, block, blocks, inc);
blake2s_compress_ssse3(ctx, data, blocks, inc);
kernel_fpu_end();
data += blocks * BLAKE2S_BLOCK_SIZE;
nblocks -= blocks;
block += blocks * BLAKE2S_BLOCK_SIZE;
} while (nblocks);
}


@@ -0,0 +1,319 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright 2021 Google LLC
*/
/*
* This is an efficient implementation of POLYVAL using Intel PCLMULQDQ-NI
* instructions. It works on 8 blocks at a time, by precomputing the first 8
* keys powers h^8, ..., h^1 in the POLYVAL finite field. This precomputation
* allows us to split finite field multiplication into two steps.
*
* In the first step, we consider h^i, m_i as normal polynomials of degree less
* than 128. We then compute p(x) = h^8m_0 + ... + h^1m_7 where multiplication
* is simply polynomial multiplication.
*
* In the second step, we compute the reduction of p(x) modulo the finite field
* modulus g(x) = x^128 + x^127 + x^126 + x^121 + 1.
*
* This two step process is equivalent to computing h^8m_0 + ... + h^1m_7 where
* multiplication is finite field multiplication. The advantage is that the
* two-step process only requires 1 finite field reduction for every 8
* polynomial multiplications. Further parallelism is gained by interleaving the
* multiplications and polynomial reductions.
*/
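Concretely, each 8-block stride below evaluates the unreduced 256-bit polynomial

  p(x) = (SUM + m_0)*h^8 + m_1*h^7 + ... + m_7*h^1

where SUM is the running accumulator; the Montgomery reduction of p(x) is interleaved with the next stride's multiplications (or performed at loop exit), so one reduction is amortized over eight carryless block multiplications.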
#include <linux/linkage.h>
#include <asm/frame.h>
#define STRIDE_BLOCKS 8
#define GSTAR %xmm7
#define PL %xmm8
#define PH %xmm9
#define TMP_XMM %xmm11
#define LO %xmm12
#define HI %xmm13
#define MI %xmm14
#define SUM %xmm15
#define ACCUMULATOR %rdi
#define KEY_POWERS %rsi
#define MSG %rdx
#define BLOCKS_LEFT %rcx
#define TMP %rax
.section .rodata.cst16.gstar, "aM", @progbits, 16
.align 16
.Lgstar:
.quad 0xc200000000000000, 0xc200000000000000
.text
/*
* Performs schoolbook1_iteration on two lists of 128-bit polynomials of length
* count pointed to by MSG and KEY_POWERS.
*/
.macro schoolbook1 count
.set i, 0
.rept (\count)
schoolbook1_iteration i 0
.set i, (i +1)
.endr
.endm
/*
* Computes the product of two 128-bit polynomials at the memory locations
* specified by (MSG + 16*i) and (KEY_POWERS + 16*i) and XORs the components of
* the 256-bit product into LO, MI, HI.
*
* Given:
* X = [X_1 : X_0]
* Y = [Y_1 : Y_0]
*
* We compute:
* LO += X_0 * Y_0
* MI += X_0 * Y_1 + X_1 * Y_0
* HI += X_1 * Y_1
*
* Later, the 256-bit result can be extracted as:
* [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
* This step is done when computing the polynomial reduction for efficiency
* reasons.
*
* If xor_sum == 1, then also XOR the value of SUM into m_0. This avoids an
* extra multiplication of SUM and h^8.
*/
.macro schoolbook1_iteration i xor_sum
movups (16*\i)(MSG), %xmm0
.if (\i == 0 && \xor_sum == 1)
pxor SUM, %xmm0
.endif
vpclmulqdq $0x01, (16*\i)(KEY_POWERS), %xmm0, %xmm2
vpclmulqdq $0x00, (16*\i)(KEY_POWERS), %xmm0, %xmm1
vpclmulqdq $0x10, (16*\i)(KEY_POWERS), %xmm0, %xmm3
vpclmulqdq $0x11, (16*\i)(KEY_POWERS), %xmm0, %xmm4
vpxor %xmm2, MI, MI
vpxor %xmm1, LO, LO
vpxor %xmm4, HI, HI
vpxor %xmm3, MI, MI
.endm
/*
* Performs the same computation as schoolbook1_iteration, except we expect the
* arguments to already be loaded into xmm0 and xmm1 and we set the result
* registers LO, MI, and HI directly rather than XOR'ing into them.
*/
.macro schoolbook1_noload
vpclmulqdq $0x01, %xmm0, %xmm1, MI
vpclmulqdq $0x10, %xmm0, %xmm1, %xmm2
vpclmulqdq $0x00, %xmm0, %xmm1, LO
vpclmulqdq $0x11, %xmm0, %xmm1, HI
vpxor %xmm2, MI, MI
.endm
/*
* Computes the 256-bit polynomial represented by LO, HI, MI. Stores
* the result in PL, PH.
* [PH : PL] = [HI_1 : HI_0 + MI_1 : LO_1 + MI_0 : LO_0]
*/
.macro schoolbook2
vpslldq $8, MI, PL
vpsrldq $8, MI, PH
pxor LO, PL
pxor HI, PH
.endm
/*
* Computes the 128-bit reduction of PH : PL. Stores the result in dest.
*
* This macro computes p(x) mod g(x) where p(x) is in Montgomery form and g(x) =
* x^128 + x^127 + x^126 + x^121 + 1.
*
* We have a 256-bit polynomial PH : PL = P_3 : P_2 : P_1 : P_0 that is the
* product of two 128-bit polynomials in Montgomery form. We need to reduce it
* mod g(x). Also, since polynomials in Montgomery form have an "extra" factor
* of x^128, this product has two extra factors of x^128. To get it back into
* Montgomery form, we need to remove one of these factors by dividing by x^128.
*
* To accomplish both of these goals, we add multiples of g(x) that cancel out
* the low 128 bits P_1 : P_0, leaving just the high 128 bits. Since the low
* bits are zero, the polynomial division by x^128 can be done by right shifting.
*
* Since the only nonzero term in the low 64 bits of g(x) is the constant term,
* the multiple of g(x) needed to cancel out P_0 is P_0 * g(x). The CPU can
* only do 64x64 bit multiplications, so split P_0 * g(x) into x^128 * P_0 +
* x^64 * g*(x) * P_0 + P_0, where g*(x) is bits 64-127 of g(x). Adding this to
* the original polynomial gives P_3 : P_2 + P_0 + T_1 : P_1 + T_0 : 0, where T
* = T_1 : T_0 = g*(x) * P_0. Thus, bits 0-63 got "folded" into bits 64-191.
*
* Repeating this same process on the next 64 bits "folds" bits 64-127 into bits
* 128-255, giving the answer in bits 128-255. This time, we need to cancel P_1
* + T_0 in bits 64-127. The multiple of g(x) required is (P_1 + T_0) * g(x) *
* x^64. Adding this to our previous computation gives P_3 + P_1 + T_0 + V_1 :
* P_2 + P_0 + T_1 + V_0 : 0 : 0, where V = V_1 : V_0 = g*(x) * (P_1 + T_0).
*
* So our final computation is:
* T = T_1 : T_0 = g*(x) * P_0
* V = V_1 : V_0 = g*(x) * (P_1 + T_0)
* p(x) / x^{128} mod g(x) = P_3 + P_1 + T_0 + V_1 : P_2 + P_0 + T_1 + V_0
*
* The implementation below saves a XOR instruction by computing P_1 + T_0 : P_0
* + T_1 and XORing into dest, rather than separately XORing P_1 : P_0 and T_0 :
* T_1 into dest. This allows us to reuse P_1 + T_0 when computing V.
*/
.macro montgomery_reduction dest
vpclmulqdq $0x00, PL, GSTAR, TMP_XMM # TMP_XMM = T_1 : T_0 = P_0 * g*(x)
pshufd $0b01001110, TMP_XMM, TMP_XMM # TMP_XMM = T_0 : T_1
pxor PL, TMP_XMM # TMP_XMM = P_1 + T_0 : P_0 + T_1
pxor TMP_XMM, PH # PH = P_3 + P_1 + T_0 : P_2 + P_0 + T_1
pclmulqdq $0x11, GSTAR, TMP_XMM # TMP_XMM = V_1 : V_0 = V = [(P_1 + T_0) * g*(x)]
vpxor TMP_XMM, PH, \dest
.endm
/*
* Compute schoolbook multiplication for 8 blocks
* m_0h^8 + ... + m_7h^1
*
* If reduce is set, also computes the montgomery reduction of the
* previous full_stride call and XORs with the first message block.
* (m_0 + REDUCE(PL, PH))h^8 + ... + m_7h^1.
* I.e., the first multiplication uses m_0 + REDUCE(PL, PH) instead of m_0.
*/
.macro full_stride reduce
pxor LO, LO
pxor HI, HI
pxor MI, MI
schoolbook1_iteration 7 0
.if \reduce
vpclmulqdq $0x00, PL, GSTAR, TMP_XMM
.endif
schoolbook1_iteration 6 0
.if \reduce
pshufd $0b01001110, TMP_XMM, TMP_XMM
.endif
schoolbook1_iteration 5 0
.if \reduce
pxor PL, TMP_XMM
.endif
schoolbook1_iteration 4 0
.if \reduce
pxor TMP_XMM, PH
.endif
schoolbook1_iteration 3 0
.if \reduce
pclmulqdq $0x11, GSTAR, TMP_XMM
.endif
schoolbook1_iteration 2 0
.if \reduce
vpxor TMP_XMM, PH, SUM
.endif
schoolbook1_iteration 1 0
schoolbook1_iteration 0 1
addq $(8*16), MSG
schoolbook2
.endm
/*
* Process BLOCKS_LEFT blocks, where 0 < BLOCKS_LEFT < STRIDE_BLOCKS
*/
.macro partial_stride
mov BLOCKS_LEFT, TMP
shlq $4, TMP
addq $(16*STRIDE_BLOCKS), KEY_POWERS
subq TMP, KEY_POWERS
movups (MSG), %xmm0
pxor SUM, %xmm0
movups (KEY_POWERS), %xmm1
schoolbook1_noload
dec BLOCKS_LEFT
addq $16, MSG
addq $16, KEY_POWERS
test $4, BLOCKS_LEFT
jz .Lpartial4BlocksDone
schoolbook1 4
addq $(4*16), MSG
addq $(4*16), KEY_POWERS
.Lpartial4BlocksDone:
test $2, BLOCKS_LEFT
jz .Lpartial2BlocksDone
schoolbook1 2
addq $(2*16), MSG
addq $(2*16), KEY_POWERS
.Lpartial2BlocksDone:
test $1, BLOCKS_LEFT
jz .LpartialDone
schoolbook1 1
.LpartialDone:
schoolbook2
montgomery_reduction SUM
.endm
/*
* Computes a = a * b * x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1.
*
* void polyval_mul_pclmul_avx(struct polyval_elem *a,
* const struct polyval_elem *b);
*/
SYM_FUNC_START(polyval_mul_pclmul_avx)
FRAME_BEGIN
vmovdqa .Lgstar(%rip), GSTAR
movups (%rdi), %xmm0
movups (%rsi), %xmm1
schoolbook1_noload
schoolbook2
montgomery_reduction SUM
movups SUM, (%rdi)
FRAME_END
RET
SYM_FUNC_END(polyval_mul_pclmul_avx)
/*
* Perform polynomial evaluation as specified by POLYVAL. This computes:
* h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1}
* where n=nblocks, h is the hash key, and m_i are the message blocks.
*
* rdi - pointer to the accumulator
* rsi - pointer to precomputed key powers h^8 ... h^1
* rdx - pointer to message blocks
* rcx - number of blocks to hash
*
* void polyval_blocks_pclmul_avx(struct polyval_elem *acc,
* const struct polyval_key *key,
* const u8 *data, size_t nblocks);
*/
SYM_FUNC_START(polyval_blocks_pclmul_avx)
FRAME_BEGIN
vmovdqa .Lgstar(%rip), GSTAR
movups (ACCUMULATOR), SUM
subq $STRIDE_BLOCKS, BLOCKS_LEFT
js .LstrideLoopExit
full_stride 0
subq $STRIDE_BLOCKS, BLOCKS_LEFT
js .LstrideLoopExitReduce
.LstrideLoop:
full_stride 1
subq $STRIDE_BLOCKS, BLOCKS_LEFT
jns .LstrideLoop
.LstrideLoopExitReduce:
montgomery_reduction SUM
.LstrideLoopExit:
add $STRIDE_BLOCKS, BLOCKS_LEFT
jz .LskipPartial
partial_stride
.LskipPartial:
movups SUM, (ACCUMULATOR)
FRAME_END
RET
SYM_FUNC_END(polyval_blocks_pclmul_avx)

lib/crypto/x86/polyval.h (new file, 83 lines)

@@ -0,0 +1,83 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* POLYVAL library functions, x86_64 optimized
*
* Copyright 2025 Google LLC
*/
#include <asm/fpu/api.h>
#include <linux/cpufeature.h>
#define NUM_H_POWERS 8
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmul_avx);
asmlinkage void polyval_mul_pclmul_avx(struct polyval_elem *a,
const struct polyval_elem *b);
asmlinkage void polyval_blocks_pclmul_avx(struct polyval_elem *acc,
const struct polyval_key *key,
const u8 *data, size_t nblocks);
static void polyval_preparekey_arch(struct polyval_key *key,
const u8 raw_key[POLYVAL_BLOCK_SIZE])
{
static_assert(ARRAY_SIZE(key->h_powers) == NUM_H_POWERS);
memcpy(&key->h_powers[NUM_H_POWERS - 1], raw_key, POLYVAL_BLOCK_SIZE);
if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
kernel_fpu_begin();
for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
key->h_powers[i] = key->h_powers[i + 1];
polyval_mul_pclmul_avx(
&key->h_powers[i],
&key->h_powers[NUM_H_POWERS - 1]);
}
kernel_fpu_end();
} else {
for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
key->h_powers[i] = key->h_powers[i + 1];
polyval_mul_generic(&key->h_powers[i],
&key->h_powers[NUM_H_POWERS - 1]);
}
}
}
static void polyval_mul_arch(struct polyval_elem *acc,
const struct polyval_key *key)
{
if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
kernel_fpu_begin();
polyval_mul_pclmul_avx(acc, &key->h_powers[NUM_H_POWERS - 1]);
kernel_fpu_end();
} else {
polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]);
}
}
static void polyval_blocks_arch(struct polyval_elem *acc,
const struct polyval_key *key,
const u8 *data, size_t nblocks)
{
if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) {
do {
/* Allow rescheduling every 4 KiB. */
size_t n = min_t(size_t, nblocks,
4096 / POLYVAL_BLOCK_SIZE);
kernel_fpu_begin();
polyval_blocks_pclmul_avx(acc, key, data, n);
kernel_fpu_end();
data += n * POLYVAL_BLOCK_SIZE;
nblocks -= n;
} while (nblocks);
} else {
polyval_blocks_generic(acc, &key->h_powers[NUM_H_POWERS - 1],
data, nblocks);
}
}
#define polyval_mod_init_arch polyval_mod_init_arch
static void polyval_mod_init_arch(void)
{
if (boot_cpu_has(X86_FEATURE_PCLMULQDQ) &&
boot_cpu_has(X86_FEATURE_AVX))
static_branch_enable(&have_pclmul_avx);
}