From 04cadb4fe0341304741ef60a297366b553f0ce36 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@kernel.org>
Date: Fri, 10 Oct 2025 17:10:47 -0700
Subject: [PATCH 01/37] lib/crypto: Add FIPS self-tests for SHA-1 and SHA-2

Add FIPS cryptographic algorithm self-tests for all SHA-1 and SHA-2
algorithms. Following the "Implementation Guidance for FIPS 140-3"
document, to achieve this it's sufficient to just test a single test
vector for each of HMAC-SHA1, HMAC-SHA256, and HMAC-SHA512.

Just run these tests in the initcalls, following the example of e.g.
crypto/kdf_sp800108.c. Note that this should meet the FIPS self-test
requirement even in the built-in case, given that the initcalls run
before userspace, storage, network, etc. are accessible.

This does not fix a regression, seeing as lib/ has had SHA-1 support
since 2005 and SHA-256 support since 2018. Neither ever had FIPS
self-tests. Moreover, fips=1 support has always been an unfinished
feature upstream. However, with lib/ now being used more widely, it's
now seeing more scrutiny and people seem to want these now [1][2].

[1] https://lore.kernel.org/r/3226361.1758126043@warthog.procyon.org.uk/
[2] https://lore.kernel.org/r/f31dbb22-0add-481c-aee0-e337a7731f8e@oracle.com/

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20251011001047.51886-1-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 lib/crypto/fips.h                   | 38 +++++++++++++++++++++++++++++
 lib/crypto/sha1.c                   | 19 ++++++++++++++-
 lib/crypto/sha256.c                 | 26 +++++++++++++++++---
 lib/crypto/sha512.c                 | 19 ++++++++++++++-
 scripts/crypto/gen-fips-testvecs.py | 32 ++++++++++++++++++++++++
 5 files changed, 128 insertions(+), 6 deletions(-)
 create mode 100644 lib/crypto/fips.h
 create mode 100755 scripts/crypto/gen-fips-testvecs.py

diff --git a/lib/crypto/fips.h b/lib/crypto/fips.h
new file mode 100644
index 000000000000..78a1bdd33a15
--- /dev/null
+++ b/lib/crypto/fips.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* This file was generated by: gen-fips-testvecs.py */
+
+#include <linux/types.h>
+
+static const u8 fips_test_data[] __initconst __maybe_unused = {
+	0x66, 0x69, 0x70, 0x73, 0x20, 0x74, 0x65, 0x73,
+	0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00,
+};
+
+static const u8 fips_test_key[] __initconst __maybe_unused = {
+	0x66, 0x69, 0x70, 0x73, 0x20, 0x74, 0x65, 0x73,
+	0x74, 0x20, 0x6b, 0x65, 0x79, 0x00, 0x00, 0x00,
+};
+
+static const u8 fips_test_hmac_sha1_value[] __initconst __maybe_unused = {
+	0x29, 0xa9, 0x88, 0xb8, 0x5c, 0xb4, 0xaf, 0x4b,
+	0x97, 0x2a, 0xee, 0x87, 0x5b, 0x0a, 0x02, 0x55,
+	0x99, 0xbf, 0x86, 0x78,
+};
+
+static const u8 fips_test_hmac_sha256_value[] __initconst __maybe_unused = {
+	0x59, 0x25, 0x85, 0xcc, 0x40, 0xe9, 0x64, 0x2f,
+	0xe9, 0xbf, 0x82, 0xb7, 0xd3, 0x15, 0x3d, 0x43,
+	0x22, 0x0b, 0x4c, 0x00, 0x90, 0x14, 0x25, 0xcf,
+	0x9e, 0x13, 0x2b, 0xc2, 0x30, 0xe6, 0xe8, 0x93,
+};
+
+static const u8 fips_test_hmac_sha512_value[] __initconst __maybe_unused = {
+	0x6b, 0xea, 0x5d, 0x27, 0x49, 0x5b, 0x3f, 0xea,
+	0xde, 0x2d, 0xfa, 0x32, 0x75, 0xdb, 0x77, 0xc8,
+	0x26, 0xe9, 0x4e, 0x95, 0x4d, 0xad, 0x88, 0x02,
+	0x87, 0xf9, 0x52, 0x0a, 0xd1, 0x92, 0x80, 0x1d,
+	0x92, 0x7e, 0x3c, 0xbd, 0xb1, 0x3c, 0x49, 0x98,
+	0x44, 0x9c, 0x8f, 0xee, 0x3f, 0x02, 0x71, 0x51,
+	0x57, 0x0b, 0x15, 0x38, 0x95, 0xd8, 0xa3, 0x81,
+	0xba, 0xb3, 0x15, 0x37, 0x5c, 0x6d, 0x57, 0x2b,
+};
diff --git a/lib/crypto/sha1.c b/lib/crypto/sha1.c
index 5904e4ae85d2..52788278cd17 100644
--- a/lib/crypto/sha1.c
+++ b/lib/crypto/sha1.c
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include "fips.h"
 
 static const
struct sha1_block_state sha1_iv = { .h = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, @@ -330,10 +331,26 @@ void hmac_sha1_usingrawkey(const u8 *raw_key, size_t raw_key_len, } EXPORT_SYMBOL_GPL(hmac_sha1_usingrawkey); -#ifdef sha1_mod_init_arch +#if defined(sha1_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS) static int __init sha1_mod_init(void) { +#ifdef sha1_mod_init_arch sha1_mod_init_arch(); +#endif + if (fips_enabled) { + /* + * FIPS cryptographic algorithm self-test. As per the FIPS + * Implementation Guidance, testing HMAC-SHA1 satisfies the test + * requirement for SHA-1 too. + */ + u8 mac[SHA1_DIGEST_SIZE]; + + hmac_sha1_usingrawkey(fips_test_key, sizeof(fips_test_key), + fips_test_data, sizeof(fips_test_data), + mac); + if (memcmp(fips_test_hmac_sha1_value, mac, sizeof(mac)) != 0) + panic("sha1: FIPS self-test failed\n"); + } return 0; } subsys_initcall(sha1_mod_init); diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c index 881b935418ce..5d6b77e7e141 100644 --- a/lib/crypto/sha256.c +++ b/lib/crypto/sha256.c @@ -17,6 +17,7 @@ #include #include #include +#include "fips.h" static const struct sha256_block_state sha224_iv = { .h = { @@ -269,8 +270,8 @@ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]) EXPORT_SYMBOL(sha256); /* - * Pre-boot environment (as indicated by __DISABLE_EXPORTS being defined) - * doesn't need either HMAC support or interleaved hashing support + * Pre-boot environments (as indicated by __DISABLE_EXPORTS being defined) just + * need the generic SHA-256 code. Omit all other features from them. */ #ifndef __DISABLE_EXPORTS @@ -477,12 +478,27 @@ void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len, hmac_sha256_final(&ctx, out); } EXPORT_SYMBOL_GPL(hmac_sha256_usingrawkey); -#endif /* !__DISABLE_EXPORTS */ -#ifdef sha256_mod_init_arch +#if defined(sha256_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS) static int __init sha256_mod_init(void) { +#ifdef sha256_mod_init_arch sha256_mod_init_arch(); +#endif + if (fips_enabled) { + /* + * FIPS cryptographic algorithm self-test. As per the FIPS + * Implementation Guidance, testing HMAC-SHA256 satisfies the + * test requirement for SHA-224, SHA-256, and HMAC-SHA224 too. + */ + u8 mac[SHA256_DIGEST_SIZE]; + + hmac_sha256_usingrawkey(fips_test_key, sizeof(fips_test_key), + fips_test_data, sizeof(fips_test_data), + mac); + if (memcmp(fips_test_hmac_sha256_value, mac, sizeof(mac)) != 0) + panic("sha256: FIPS self-test failed\n"); + } return 0; } subsys_initcall(sha256_mod_init); @@ -493,5 +509,7 @@ static void __exit sha256_mod_exit(void) module_exit(sha256_mod_exit); #endif +#endif /* !__DISABLE_EXPORTS */ + MODULE_DESCRIPTION("SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions"); MODULE_LICENSE("GPL"); diff --git a/lib/crypto/sha512.c b/lib/crypto/sha512.c index d8062188be98..605eab51aabd 100644 --- a/lib/crypto/sha512.c +++ b/lib/crypto/sha512.c @@ -17,6 +17,7 @@ #include #include #include +#include "fips.h" static const struct sha512_block_state sha384_iv = { .h = { @@ -405,10 +406,26 @@ void hmac_sha512_usingrawkey(const u8 *raw_key, size_t raw_key_len, } EXPORT_SYMBOL_GPL(hmac_sha512_usingrawkey); -#ifdef sha512_mod_init_arch +#if defined(sha512_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS) static int __init sha512_mod_init(void) { +#ifdef sha512_mod_init_arch sha512_mod_init_arch(); +#endif + if (fips_enabled) { + /* + * FIPS cryptographic algorithm self-test. 
As per the FIPS + * Implementation Guidance, testing HMAC-SHA512 satisfies the + * test requirement for SHA-384, SHA-512, and HMAC-SHA384 too. + */ + u8 mac[SHA512_DIGEST_SIZE]; + + hmac_sha512_usingrawkey(fips_test_key, sizeof(fips_test_key), + fips_test_data, sizeof(fips_test_data), + mac); + if (memcmp(fips_test_hmac_sha512_value, mac, sizeof(mac)) != 0) + panic("sha512: FIPS self-test failed\n"); + } return 0; } subsys_initcall(sha512_mod_init); diff --git a/scripts/crypto/gen-fips-testvecs.py b/scripts/crypto/gen-fips-testvecs.py new file mode 100755 index 000000000000..2956f88b764a --- /dev/null +++ b/scripts/crypto/gen-fips-testvecs.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0-or-later +# +# Script that generates lib/crypto/fips.h +# +# Copyright 2025 Google LLC + +import hmac + +fips_test_data = b"fips test data\0\0" +fips_test_key = b"fips test key\0\0\0" + +def print_static_u8_array_definition(name, value): + print('') + print(f'static const u8 {name}[] __initconst __maybe_unused = {{') + for i in range(0, len(value), 8): + line = '\t' + ''.join(f'0x{b:02x}, ' for b in value[i:i+8]) + print(f'{line.rstrip()}') + print('};') + +print('/* SPDX-License-Identifier: GPL-2.0-or-later */') +print(f'/* This file was generated by: gen-fips-testvecs.py */') +print() +print('#include ') + +print_static_u8_array_definition("fips_test_data", fips_test_data) +print_static_u8_array_definition("fips_test_key", fips_test_key) + +for alg in 'sha1', 'sha256', 'sha512': + ctx = hmac.new(fips_test_key, digestmod=alg) + ctx.update(fips_test_data) + print_static_u8_array_definition(f'fips_test_hmac_{alg}_value', ctx.digest()) From 50b8e36994a042103ea92b6d9f6d7de725f9ac5f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 17 Oct 2025 21:30:57 -0700 Subject: [PATCH 02/37] lib/crypto: blake2s: Adjust parameter order of blake2s() Reorder the parameters of blake2s() from (out, in, key, outlen, inlen, keylen) to (key, keylen, in, inlen, out, outlen). This aligns BLAKE2s with the common conventions of pairing buffers and their lengths, and having outputs follow inputs. This is widely used elsewhere in lib/crypto/ and crypto/, and even elsewhere in the BLAKE2s code itself such as blake2s_init_key() and blake2s_final(). So blake2s() was a bit of an exception. Notably, this results in the same order as hmac_*_usingrawkey(). Note that since the type signature changed, it's not possible for a blake2s() call site to be silently missed. 
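For illustration, a hypothetical call site using the new argument order
(not part of this patch; the function context and the key/msg buffer
names are made up):

	u8 mac[BLAKE2S_HASH_SIZE];

	/* New order: key+keylen, then in+inlen, then out+outlen. */
	blake2s(key, key_len, msg, msg_len, mac, sizeof(mac));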
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251018043106.375964-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- drivers/char/random.c | 4 ++-- drivers/net/wireguard/cookie.c | 4 ++-- drivers/net/wireguard/noise.c | 4 ++-- include/crypto/blake2s.h | 6 +++--- lib/crypto/tests/blake2s_kunit.c | 16 ++++++++-------- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index b8b24b6ed3fe..422c5c76571b 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -701,7 +701,7 @@ static void extract_entropy(void *buf, size_t len) /* next_key = HASHPRF(seed, RDSEED || 0) */ block.counter = 0; - blake2s(next_key, (u8 *)&block, seed, sizeof(next_key), sizeof(block), sizeof(seed)); + blake2s(seed, sizeof(seed), (const u8 *)&block, sizeof(block), next_key, sizeof(next_key)); blake2s_init_key(&input_pool.hash, BLAKE2S_HASH_SIZE, next_key, sizeof(next_key)); spin_unlock_irqrestore(&input_pool.lock, flags); @@ -711,7 +711,7 @@ static void extract_entropy(void *buf, size_t len) i = min_t(size_t, len, BLAKE2S_HASH_SIZE); /* output = HASHPRF(seed, RDSEED || ++counter) */ ++block.counter; - blake2s(buf, (u8 *)&block, seed, i, sizeof(block), sizeof(seed)); + blake2s(seed, sizeof(seed), (const u8 *)&block, sizeof(block), buf, i); len -= i; buf += i; } diff --git a/drivers/net/wireguard/cookie.c b/drivers/net/wireguard/cookie.c index 94d0a7206084..be1b83aae03b 100644 --- a/drivers/net/wireguard/cookie.c +++ b/drivers/net/wireguard/cookie.c @@ -77,7 +77,7 @@ static void compute_mac1(u8 mac1[COOKIE_LEN], const void *message, size_t len, { len = len - sizeof(struct message_macs) + offsetof(struct message_macs, mac1); - blake2s(mac1, message, key, COOKIE_LEN, len, NOISE_SYMMETRIC_KEY_LEN); + blake2s(key, NOISE_SYMMETRIC_KEY_LEN, message, len, mac1, COOKIE_LEN); } static void compute_mac2(u8 mac2[COOKIE_LEN], const void *message, size_t len, @@ -85,7 +85,7 @@ static void compute_mac2(u8 mac2[COOKIE_LEN], const void *message, size_t len, { len = len - sizeof(struct message_macs) + offsetof(struct message_macs, mac2); - blake2s(mac2, message, cookie, COOKIE_LEN, len, COOKIE_LEN); + blake2s(cookie, COOKIE_LEN, message, len, mac2, COOKIE_LEN); } static void make_cookie(u8 cookie[COOKIE_LEN], struct sk_buff *skb, diff --git a/drivers/net/wireguard/noise.c b/drivers/net/wireguard/noise.c index 7eb9a23a3d4d..306abb876c80 100644 --- a/drivers/net/wireguard/noise.c +++ b/drivers/net/wireguard/noise.c @@ -35,8 +35,8 @@ void __init wg_noise_init(void) { struct blake2s_state blake; - blake2s(handshake_init_chaining_key, handshake_name, NULL, - NOISE_HASH_LEN, sizeof(handshake_name), 0); + blake2s(NULL, 0, handshake_name, sizeof(handshake_name), + handshake_init_chaining_key, NOISE_HASH_LEN); blake2s_init(&blake, NOISE_HASH_LEN); blake2s_update(&blake, handshake_init_chaining_key, NOISE_HASH_LEN); blake2s_update(&blake, identifier_name, sizeof(identifier_name)); diff --git a/include/crypto/blake2s.h b/include/crypto/blake2s.h index f9ffd39194eb..a7dd678725b2 100644 --- a/include/crypto/blake2s.h +++ b/include/crypto/blake2s.h @@ -86,9 +86,9 @@ static inline void blake2s_init_key(struct blake2s_state *state, void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen); void blake2s_final(struct blake2s_state *state, u8 *out); -static inline void blake2s(u8 *out, const u8 *in, const u8 *key, - const size_t outlen, const size_t inlen, - const size_t keylen) +static inline void blake2s(const u8 *key, const size_t keylen, + const u8 
*in, const size_t inlen, + u8 *out, const size_t outlen) { struct blake2s_state state; diff --git a/lib/crypto/tests/blake2s_kunit.c b/lib/crypto/tests/blake2s_kunit.c index 057c40132246..247bbdf7dc86 100644 --- a/lib/crypto/tests/blake2s_kunit.c +++ b/lib/crypto/tests/blake2s_kunit.c @@ -14,7 +14,7 @@ static void blake2s_default(const u8 *data, size_t len, u8 out[BLAKE2S_HASH_SIZE]) { - blake2s(out, data, NULL, BLAKE2S_HASH_SIZE, len, 0); + blake2s(NULL, 0, data, len, out, BLAKE2S_HASH_SIZE); } static void blake2s_init_default(struct blake2s_state *state) @@ -52,7 +52,7 @@ static void test_blake2s_all_key_and_hash_lens(struct kunit *test) for (int key_len = 0; key_len <= BLAKE2S_KEY_SIZE; key_len++) { rand_bytes_seeded_from_len(key, key_len); for (int out_len = 1; out_len <= BLAKE2S_HASH_SIZE; out_len++) { - blake2s(hash, data, key, out_len, data_len, key_len); + blake2s(key, key_len, data, data_len, hash, out_len); blake2s_update(&main_state, hash, out_len); } } @@ -80,10 +80,10 @@ static void test_blake2s_with_guarded_key_buf(struct kunit *test) rand_bytes(key, key_len); memcpy(guarded_key, key, key_len); - blake2s(hash1, test_buf, key, - BLAKE2S_HASH_SIZE, data_len, key_len); - blake2s(hash2, test_buf, guarded_key, - BLAKE2S_HASH_SIZE, data_len, key_len); + blake2s(key, key_len, test_buf, data_len, + hash1, BLAKE2S_HASH_SIZE); + blake2s(guarded_key, key_len, test_buf, data_len, + hash2, BLAKE2S_HASH_SIZE); KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE); blake2s_init_key(&state, BLAKE2S_HASH_SIZE, @@ -107,8 +107,8 @@ static void test_blake2s_with_guarded_out_buf(struct kunit *test) u8 hash[BLAKE2S_HASH_SIZE]; u8 *guarded_hash = &test_buf[TEST_BUF_LEN - out_len]; - blake2s(hash, test_buf, NULL, out_len, data_len, 0); - blake2s(guarded_hash, test_buf, NULL, out_len, data_len, 0); + blake2s(NULL, 0, test_buf, data_len, hash, out_len); + blake2s(NULL, 0, test_buf, data_len, guarded_hash, out_len); KUNIT_ASSERT_MEMEQ(test, hash, guarded_hash, out_len); } } From 5e0ec8e46d4d6488242bb39a4ce5c0276afa5f32 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 17 Oct 2025 21:30:58 -0700 Subject: [PATCH 03/37] lib/crypto: blake2s: Rename blake2s_state to blake2s_ctx For consistency with the SHA-1, SHA-2, SHA-3 (in development), and MD5 library APIs, rename blake2s_state to blake2s_ctx. As a refresher, the ctx name: - Is a bit shorter. - Avoids confusion with the compression function state, which is also often called the state (but is just part of the full context). - Is consistent with OpenSSL. Not a big deal, of course. But consistency is nice. With a BLAKE2b library API about to be added, this is a convenient time to update this. 
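For illustration, a minimal incremental-hashing sketch using the renamed
type (hypothetical caller; the part1/part2 buffer names are made up):

	struct blake2s_ctx ctx;
	u8 hash[BLAKE2S_HASH_SIZE];

	blake2s_init(&ctx, BLAKE2S_HASH_SIZE);
	blake2s_update(&ctx, part1, part1_len);
	blake2s_update(&ctx, part2, part2_len);
	blake2s_final(&ctx, hash);	/* also zeroizes the ctx */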
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251018043106.375964-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- drivers/char/random.c | 2 +- drivers/net/wireguard/cookie.c | 14 ++++---- drivers/net/wireguard/noise.c | 28 +++++++-------- include/crypto/blake2s.h | 59 ++++++++++++++++---------------- lib/crypto/arm/blake2s-core.S | 10 +++--- lib/crypto/arm/blake2s.h | 4 +-- lib/crypto/blake2s.c | 58 +++++++++++++++---------------- lib/crypto/tests/blake2s_kunit.c | 23 ++++++------- lib/crypto/x86/blake2s.h | 12 +++---- 9 files changed, 104 insertions(+), 106 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 422c5c76571b..7e0486d8c51d 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -636,7 +636,7 @@ enum { }; static struct { - struct blake2s_state hash; + struct blake2s_ctx hash; spinlock_t lock; unsigned int init_bits; } input_pool = { diff --git a/drivers/net/wireguard/cookie.c b/drivers/net/wireguard/cookie.c index be1b83aae03b..08731b3fa32b 100644 --- a/drivers/net/wireguard/cookie.c +++ b/drivers/net/wireguard/cookie.c @@ -33,7 +33,7 @@ static void precompute_key(u8 key[NOISE_SYMMETRIC_KEY_LEN], const u8 pubkey[NOISE_PUBLIC_KEY_LEN], const u8 label[COOKIE_KEY_LABEL_LEN]) { - struct blake2s_state blake; + struct blake2s_ctx blake; blake2s_init(&blake, NOISE_SYMMETRIC_KEY_LEN); blake2s_update(&blake, label, COOKIE_KEY_LABEL_LEN); @@ -91,7 +91,7 @@ static void compute_mac2(u8 mac2[COOKIE_LEN], const void *message, size_t len, static void make_cookie(u8 cookie[COOKIE_LEN], struct sk_buff *skb, struct cookie_checker *checker) { - struct blake2s_state state; + struct blake2s_ctx blake; if (wg_birthdate_has_expired(checker->secret_birthdate, COOKIE_SECRET_MAX_AGE)) { @@ -103,15 +103,15 @@ static void make_cookie(u8 cookie[COOKIE_LEN], struct sk_buff *skb, down_read(&checker->secret_lock); - blake2s_init_key(&state, COOKIE_LEN, checker->secret, NOISE_HASH_LEN); + blake2s_init_key(&blake, COOKIE_LEN, checker->secret, NOISE_HASH_LEN); if (skb->protocol == htons(ETH_P_IP)) - blake2s_update(&state, (u8 *)&ip_hdr(skb)->saddr, + blake2s_update(&blake, (u8 *)&ip_hdr(skb)->saddr, sizeof(struct in_addr)); else if (skb->protocol == htons(ETH_P_IPV6)) - blake2s_update(&state, (u8 *)&ipv6_hdr(skb)->saddr, + blake2s_update(&blake, (u8 *)&ipv6_hdr(skb)->saddr, sizeof(struct in6_addr)); - blake2s_update(&state, (u8 *)&udp_hdr(skb)->source, sizeof(__be16)); - blake2s_final(&state, cookie); + blake2s_update(&blake, (u8 *)&udp_hdr(skb)->source, sizeof(__be16)); + blake2s_final(&blake, cookie); up_read(&checker->secret_lock); } diff --git a/drivers/net/wireguard/noise.c b/drivers/net/wireguard/noise.c index 306abb876c80..1fe8468f0bef 100644 --- a/drivers/net/wireguard/noise.c +++ b/drivers/net/wireguard/noise.c @@ -33,7 +33,7 @@ static atomic64_t keypair_counter = ATOMIC64_INIT(0); void __init wg_noise_init(void) { - struct blake2s_state blake; + struct blake2s_ctx blake; blake2s(NULL, 0, handshake_name, sizeof(handshake_name), handshake_init_chaining_key, NOISE_HASH_LEN); @@ -304,33 +304,33 @@ void wg_noise_set_static_identity_private_key( static void hmac(u8 *out, const u8 *in, const u8 *key, const size_t inlen, const size_t keylen) { - struct blake2s_state state; + struct blake2s_ctx blake; u8 x_key[BLAKE2S_BLOCK_SIZE] __aligned(__alignof__(u32)) = { 0 }; u8 i_hash[BLAKE2S_HASH_SIZE] __aligned(__alignof__(u32)); int i; if (keylen > BLAKE2S_BLOCK_SIZE) { - blake2s_init(&state, BLAKE2S_HASH_SIZE); - blake2s_update(&state, key, keylen); - 
blake2s_final(&state, x_key); + blake2s_init(&blake, BLAKE2S_HASH_SIZE); + blake2s_update(&blake, key, keylen); + blake2s_final(&blake, x_key); } else memcpy(x_key, key, keylen); for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i) x_key[i] ^= 0x36; - blake2s_init(&state, BLAKE2S_HASH_SIZE); - blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE); - blake2s_update(&state, in, inlen); - blake2s_final(&state, i_hash); + blake2s_init(&blake, BLAKE2S_HASH_SIZE); + blake2s_update(&blake, x_key, BLAKE2S_BLOCK_SIZE); + blake2s_update(&blake, in, inlen); + blake2s_final(&blake, i_hash); for (i = 0; i < BLAKE2S_BLOCK_SIZE; ++i) x_key[i] ^= 0x5c ^ 0x36; - blake2s_init(&state, BLAKE2S_HASH_SIZE); - blake2s_update(&state, x_key, BLAKE2S_BLOCK_SIZE); - blake2s_update(&state, i_hash, BLAKE2S_HASH_SIZE); - blake2s_final(&state, i_hash); + blake2s_init(&blake, BLAKE2S_HASH_SIZE); + blake2s_update(&blake, x_key, BLAKE2S_BLOCK_SIZE); + blake2s_update(&blake, i_hash, BLAKE2S_HASH_SIZE); + blake2s_final(&blake, i_hash); memcpy(out, i_hash, BLAKE2S_HASH_SIZE); memzero_explicit(x_key, BLAKE2S_BLOCK_SIZE); @@ -431,7 +431,7 @@ static bool __must_check mix_precomputed_dh(u8 chaining_key[NOISE_HASH_LEN], static void mix_hash(u8 hash[NOISE_HASH_LEN], const u8 *src, size_t src_len) { - struct blake2s_state blake; + struct blake2s_ctx blake; blake2s_init(&blake, NOISE_HASH_LEN); blake2s_update(&blake, hash, NOISE_HASH_LEN); diff --git a/include/crypto/blake2s.h b/include/crypto/blake2s.h index a7dd678725b2..4c8d532ee97b 100644 --- a/include/crypto/blake2s.h +++ b/include/crypto/blake2s.h @@ -22,7 +22,7 @@ enum blake2s_lengths { BLAKE2S_256_HASH_SIZE = 32, }; -struct blake2s_state { +struct blake2s_ctx { /* 'h', 't', and 'f' are used in assembly code, so keep them as-is. */ u32 h[8]; u32 t[2]; @@ -43,62 +43,61 @@ enum blake2s_iv { BLAKE2S_IV7 = 0x5BE0CD19UL, }; -static inline void __blake2s_init(struct blake2s_state *state, size_t outlen, +static inline void __blake2s_init(struct blake2s_ctx *ctx, size_t outlen, const void *key, size_t keylen) { - state->h[0] = BLAKE2S_IV0 ^ (0x01010000 | keylen << 8 | outlen); - state->h[1] = BLAKE2S_IV1; - state->h[2] = BLAKE2S_IV2; - state->h[3] = BLAKE2S_IV3; - state->h[4] = BLAKE2S_IV4; - state->h[5] = BLAKE2S_IV5; - state->h[6] = BLAKE2S_IV6; - state->h[7] = BLAKE2S_IV7; - state->t[0] = 0; - state->t[1] = 0; - state->f[0] = 0; - state->f[1] = 0; - state->buflen = 0; - state->outlen = outlen; + ctx->h[0] = BLAKE2S_IV0 ^ (0x01010000 | keylen << 8 | outlen); + ctx->h[1] = BLAKE2S_IV1; + ctx->h[2] = BLAKE2S_IV2; + ctx->h[3] = BLAKE2S_IV3; + ctx->h[4] = BLAKE2S_IV4; + ctx->h[5] = BLAKE2S_IV5; + ctx->h[6] = BLAKE2S_IV6; + ctx->h[7] = BLAKE2S_IV7; + ctx->t[0] = 0; + ctx->t[1] = 0; + ctx->f[0] = 0; + ctx->f[1] = 0; + ctx->buflen = 0; + ctx->outlen = outlen; if (keylen) { - memcpy(state->buf, key, keylen); - memset(&state->buf[keylen], 0, BLAKE2S_BLOCK_SIZE - keylen); - state->buflen = BLAKE2S_BLOCK_SIZE; + memcpy(ctx->buf, key, keylen); + memset(&ctx->buf[keylen], 0, BLAKE2S_BLOCK_SIZE - keylen); + ctx->buflen = BLAKE2S_BLOCK_SIZE; } } -static inline void blake2s_init(struct blake2s_state *state, - const size_t outlen) +static inline void blake2s_init(struct blake2s_ctx *ctx, const size_t outlen) { - __blake2s_init(state, outlen, NULL, 0); + __blake2s_init(ctx, outlen, NULL, 0); } -static inline void blake2s_init_key(struct blake2s_state *state, +static inline void blake2s_init_key(struct blake2s_ctx *ctx, const size_t outlen, const void *key, const size_t keylen) { WARN_ON(IS_ENABLED(DEBUG) && (!outlen 
|| outlen > BLAKE2S_HASH_SIZE || !key || !keylen || keylen > BLAKE2S_KEY_SIZE)); - __blake2s_init(state, outlen, key, keylen); + __blake2s_init(ctx, outlen, key, keylen); } -void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen); -void blake2s_final(struct blake2s_state *state, u8 *out); +void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen); +void blake2s_final(struct blake2s_ctx *ctx, u8 *out); static inline void blake2s(const u8 *key, const size_t keylen, const u8 *in, const size_t inlen, u8 *out, const size_t outlen) { - struct blake2s_state state; + struct blake2s_ctx ctx; WARN_ON(IS_ENABLED(DEBUG) && ((!in && inlen > 0) || !out || !outlen || outlen > BLAKE2S_HASH_SIZE || keylen > BLAKE2S_KEY_SIZE || (!key && keylen))); - __blake2s_init(&state, outlen, key, keylen); - blake2s_update(&state, in, inlen); - blake2s_final(&state, out); + __blake2s_init(&ctx, outlen, key, keylen); + blake2s_update(&ctx, in, inlen); + blake2s_final(&ctx, out); } #endif /* _CRYPTO_BLAKE2S_H */ diff --git a/lib/crypto/arm/blake2s-core.S b/lib/crypto/arm/blake2s-core.S index 293f44fa8f31..78e758a7cb3e 100644 --- a/lib/crypto/arm/blake2s-core.S +++ b/lib/crypto/arm/blake2s-core.S @@ -170,10 +170,10 @@ .endm // -// void blake2s_compress(struct blake2s_state *state, +// void blake2s_compress(struct blake2s_ctx *ctx, // const u8 *block, size_t nblocks, u32 inc); // -// Only the first three fields of struct blake2s_state are used: +// Only the first three fields of struct blake2s_ctx are used: // u32 h[8]; (inout) // u32 t[2]; (inout) // u32 f[2]; (in) @@ -183,7 +183,7 @@ ENTRY(blake2s_compress) push {r0-r2,r4-r11,lr} // keep this an even number .Lnext_block: - // r0 is 'state' + // r0 is 'ctx' // r1 is 'block' // r3 is 'inc' @@ -211,7 +211,7 @@ ENTRY(blake2s_compress) // Calculate v[8..15]. Push v[9..15] onto the stack, and leave space // for spilling v[8..9]. Leave v[8..9] in r8-r9. - mov r14, r0 // r14 = state + mov r14, r0 // r14 = ctx adr r12, .Lblake2s_IV ldmia r12!, {r8-r9} // load IV[0..1] __ldrd r0, r1, r14, 40 // load f[0..1] @@ -275,7 +275,7 @@ ENTRY(blake2s_compress) // Advance to the next block, if there is one. Note that if there are // multiple blocks, then 'inc' (the counter increment amount) must be // 64. So we can simply set it to 64 without re-loading it. 
- ldm sp, {r0, r1, r2} // load (state, block, nblocks) + ldm sp, {r0, r1, r2} // load (ctx, block, nblocks) mov r3, #64 // set 'inc' subs r2, r2, #1 // nblocks-- str r2, [sp, #8] diff --git a/lib/crypto/arm/blake2s.h b/lib/crypto/arm/blake2s.h index aa7a97139ea7..ce009cd98de9 100644 --- a/lib/crypto/arm/blake2s.h +++ b/lib/crypto/arm/blake2s.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* defined in blake2s-core.S */ -void blake2s_compress(struct blake2s_state *state, const u8 *block, - size_t nblocks, u32 inc); +void blake2s_compress(struct blake2s_ctx *ctx, + const u8 *block, size_t nblocks, u32 inc); diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c index 5638ed9d882d..1ad36cb29835 100644 --- a/lib/crypto/blake2s.c +++ b/lib/crypto/blake2s.c @@ -29,15 +29,15 @@ static const u8 blake2s_sigma[10][16] = { { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, }; -static inline void blake2s_increment_counter(struct blake2s_state *state, +static inline void blake2s_increment_counter(struct blake2s_ctx *ctx, const u32 inc) { - state->t[0] += inc; - state->t[1] += (state->t[0] < inc); + ctx->t[0] += inc; + ctx->t[1] += (ctx->t[0] < inc); } static void __maybe_unused -blake2s_compress_generic(struct blake2s_state *state, const u8 *block, +blake2s_compress_generic(struct blake2s_ctx *ctx, const u8 *block, size_t nblocks, const u32 inc) { u32 m[16]; @@ -48,18 +48,18 @@ blake2s_compress_generic(struct blake2s_state *state, const u8 *block, (nblocks > 1 && inc != BLAKE2S_BLOCK_SIZE)); while (nblocks > 0) { - blake2s_increment_counter(state, inc); + blake2s_increment_counter(ctx, inc); memcpy(m, block, BLAKE2S_BLOCK_SIZE); le32_to_cpu_array(m, ARRAY_SIZE(m)); - memcpy(v, state->h, 32); + memcpy(v, ctx->h, 32); v[ 8] = BLAKE2S_IV0; v[ 9] = BLAKE2S_IV1; v[10] = BLAKE2S_IV2; v[11] = BLAKE2S_IV3; - v[12] = BLAKE2S_IV4 ^ state->t[0]; - v[13] = BLAKE2S_IV5 ^ state->t[1]; - v[14] = BLAKE2S_IV6 ^ state->f[0]; - v[15] = BLAKE2S_IV7 ^ state->f[1]; + v[12] = BLAKE2S_IV4 ^ ctx->t[0]; + v[13] = BLAKE2S_IV5 ^ ctx->t[1]; + v[14] = BLAKE2S_IV6 ^ ctx->f[0]; + v[15] = BLAKE2S_IV7 ^ ctx->f[1]; #define G(r, i, a, b, c, d) do { \ a += b + m[blake2s_sigma[r][2 * i + 0]]; \ @@ -97,7 +97,7 @@ blake2s_compress_generic(struct blake2s_state *state, const u8 *block, #undef ROUND for (i = 0; i < 8; ++i) - state->h[i] ^= v[i] ^ v[i + 8]; + ctx->h[i] ^= v[i] ^ v[i + 8]; block += BLAKE2S_BLOCK_SIZE; --nblocks; @@ -110,45 +110,45 @@ blake2s_compress_generic(struct blake2s_state *state, const u8 *block, #define blake2s_compress blake2s_compress_generic #endif -static inline void blake2s_set_lastblock(struct blake2s_state *state) +static inline void blake2s_set_lastblock(struct blake2s_ctx *ctx) { - state->f[0] = -1; + ctx->f[0] = -1; } -void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen) +void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen) { - const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; + const size_t fill = BLAKE2S_BLOCK_SIZE - ctx->buflen; if (unlikely(!inlen)) return; if (inlen > fill) { - memcpy(state->buf + state->buflen, in, fill); - blake2s_compress(state, state->buf, 1, BLAKE2S_BLOCK_SIZE); - state->buflen = 0; + memcpy(ctx->buf + ctx->buflen, in, fill); + blake2s_compress(ctx, ctx->buf, 1, BLAKE2S_BLOCK_SIZE); + ctx->buflen = 0; in += fill; inlen -= fill; } if (inlen > BLAKE2S_BLOCK_SIZE) { const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE); - blake2s_compress(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE); + 
blake2s_compress(ctx, in, nblocks - 1, BLAKE2S_BLOCK_SIZE); in += BLAKE2S_BLOCK_SIZE * (nblocks - 1); inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); } - memcpy(state->buf + state->buflen, in, inlen); - state->buflen += inlen; + memcpy(ctx->buf + ctx->buflen, in, inlen); + ctx->buflen += inlen; } EXPORT_SYMBOL(blake2s_update); -void blake2s_final(struct blake2s_state *state, u8 *out) +void blake2s_final(struct blake2s_ctx *ctx, u8 *out) { WARN_ON(IS_ENABLED(DEBUG) && !out); - blake2s_set_lastblock(state); - memset(state->buf + state->buflen, 0, - BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */ - blake2s_compress(state, state->buf, 1, state->buflen); - cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); - memcpy(out, state->h, state->outlen); - memzero_explicit(state, sizeof(*state)); + blake2s_set_lastblock(ctx); + memset(ctx->buf + ctx->buflen, 0, + BLAKE2S_BLOCK_SIZE - ctx->buflen); /* Padding */ + blake2s_compress(ctx, ctx->buf, 1, ctx->buflen); + cpu_to_le32_array(ctx->h, ARRAY_SIZE(ctx->h)); + memcpy(out, ctx->h, ctx->outlen); + memzero_explicit(ctx, sizeof(*ctx)); } EXPORT_SYMBOL(blake2s_final); diff --git a/lib/crypto/tests/blake2s_kunit.c b/lib/crypto/tests/blake2s_kunit.c index 247bbdf7dc86..6832d9aa7b82 100644 --- a/lib/crypto/tests/blake2s_kunit.c +++ b/lib/crypto/tests/blake2s_kunit.c @@ -17,9 +17,9 @@ static void blake2s_default(const u8 *data, size_t len, blake2s(NULL, 0, data, len, out, BLAKE2S_HASH_SIZE); } -static void blake2s_init_default(struct blake2s_state *state) +static void blake2s_init_default(struct blake2s_ctx *ctx) { - blake2s_init(state, BLAKE2S_HASH_SIZE); + blake2s_init(ctx, BLAKE2S_HASH_SIZE); } /* @@ -27,7 +27,7 @@ static void blake2s_init_default(struct blake2s_state *state) * with a key length of 0 and a hash length of BLAKE2S_HASH_SIZE. 
*/ #define HASH blake2s_default -#define HASH_CTX blake2s_state +#define HASH_CTX blake2s_ctx #define HASH_SIZE BLAKE2S_HASH_SIZE #define HASH_INIT blake2s_init_default #define HASH_UPDATE blake2s_update @@ -44,19 +44,19 @@ static void test_blake2s_all_key_and_hash_lens(struct kunit *test) u8 *data = &test_buf[0]; u8 *key = data + data_len; u8 *hash = key + BLAKE2S_KEY_SIZE; - struct blake2s_state main_state; + struct blake2s_ctx main_ctx; u8 main_hash[BLAKE2S_HASH_SIZE]; rand_bytes_seeded_from_len(data, data_len); - blake2s_init(&main_state, BLAKE2S_HASH_SIZE); + blake2s_init(&main_ctx, BLAKE2S_HASH_SIZE); for (int key_len = 0; key_len <= BLAKE2S_KEY_SIZE; key_len++) { rand_bytes_seeded_from_len(key, key_len); for (int out_len = 1; out_len <= BLAKE2S_HASH_SIZE; out_len++) { blake2s(key, key_len, data, data_len, hash, out_len); - blake2s_update(&main_state, hash, out_len); + blake2s_update(&main_ctx, hash, out_len); } } - blake2s_final(&main_state, main_hash); + blake2s_final(&main_ctx, main_hash); KUNIT_ASSERT_MEMEQ(test, main_hash, blake2s_keyed_testvec_consolidated, BLAKE2S_HASH_SIZE); } @@ -75,7 +75,7 @@ static void test_blake2s_with_guarded_key_buf(struct kunit *test) u8 *guarded_key = &test_buf[TEST_BUF_LEN - key_len]; u8 hash1[BLAKE2S_HASH_SIZE]; u8 hash2[BLAKE2S_HASH_SIZE]; - struct blake2s_state state; + struct blake2s_ctx ctx; rand_bytes(key, key_len); memcpy(guarded_key, key, key_len); @@ -86,10 +86,9 @@ static void test_blake2s_with_guarded_key_buf(struct kunit *test) hash2, BLAKE2S_HASH_SIZE); KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE); - blake2s_init_key(&state, BLAKE2S_HASH_SIZE, - guarded_key, key_len); - blake2s_update(&state, test_buf, data_len); - blake2s_final(&state, hash2); + blake2s_init_key(&ctx, BLAKE2S_HASH_SIZE, guarded_key, key_len); + blake2s_update(&ctx, test_buf, data_len); + blake2s_final(&ctx, hash2); KUNIT_ASSERT_MEMEQ(test, hash1, hash2, BLAKE2S_HASH_SIZE); } } diff --git a/lib/crypto/x86/blake2s.h b/lib/crypto/x86/blake2s.h index b6d30d2fa045..de360935b820 100644 --- a/lib/crypto/x86/blake2s.h +++ b/lib/crypto/x86/blake2s.h @@ -11,24 +11,24 @@ #include #include -asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, +asmlinkage void blake2s_compress_ssse3(struct blake2s_ctx *ctx, const u8 *block, const size_t nblocks, const u32 inc); -asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, +asmlinkage void blake2s_compress_avx512(struct blake2s_ctx *ctx, const u8 *block, const size_t nblocks, const u32 inc); static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); -static void blake2s_compress(struct blake2s_state *state, const u8 *block, +static void blake2s_compress(struct blake2s_ctx *ctx, const u8 *block, size_t nblocks, const u32 inc) { /* SIMD disables preemption, so relax after processing each page. 
*/ BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) { - blake2s_compress_generic(state, block, nblocks, inc); + blake2s_compress_generic(ctx, block, nblocks, inc); return; } @@ -38,9 +38,9 @@ static void blake2s_compress(struct blake2s_state *state, const u8 *block, kernel_fpu_begin(); if (static_branch_likely(&blake2s_use_avx512)) - blake2s_compress_avx512(state, block, blocks, inc); + blake2s_compress_avx512(ctx, block, blocks, inc); else - blake2s_compress_ssse3(state, block, blocks, inc); + blake2s_compress_ssse3(ctx, block, blocks, inc); kernel_fpu_end(); nblocks -= blocks; From 5385bcbffe5a76a74d6bb135af1c88fb235f8134 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 17 Oct 2025 21:30:59 -0700 Subject: [PATCH 04/37] lib/crypto: blake2s: Drop excessive const & rename block => data A couple more small cleanups to the BLAKE2s code before these things get propagated into the BLAKE2b code: - Drop 'const' from some non-pointer function parameters. It was a bit excessive and not conventional. - Rename 'block' argument of blake2s_compress*() to 'data'. This is for consistency with the SHA-* code, and also to avoid the implication that it points to a singular "block". No functional changes. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251018043106.375964-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/blake2s.h | 13 ++++++------- lib/crypto/arm/blake2s-core.S | 6 +++--- lib/crypto/arm/blake2s.h | 2 +- lib/crypto/blake2s.c | 12 ++++++------ lib/crypto/x86/blake2s.h | 18 ++++++++---------- 5 files changed, 24 insertions(+), 27 deletions(-) diff --git a/include/crypto/blake2s.h b/include/crypto/blake2s.h index 4c8d532ee97b..33893057eb41 100644 --- a/include/crypto/blake2s.h +++ b/include/crypto/blake2s.h @@ -67,14 +67,13 @@ static inline void __blake2s_init(struct blake2s_ctx *ctx, size_t outlen, } } -static inline void blake2s_init(struct blake2s_ctx *ctx, const size_t outlen) +static inline void blake2s_init(struct blake2s_ctx *ctx, size_t outlen) { __blake2s_init(ctx, outlen, NULL, 0); } -static inline void blake2s_init_key(struct blake2s_ctx *ctx, - const size_t outlen, const void *key, - const size_t keylen) +static inline void blake2s_init_key(struct blake2s_ctx *ctx, size_t outlen, + const void *key, size_t keylen) { WARN_ON(IS_ENABLED(DEBUG) && (!outlen || outlen > BLAKE2S_HASH_SIZE || !key || !keylen || keylen > BLAKE2S_KEY_SIZE)); @@ -85,9 +84,9 @@ static inline void blake2s_init_key(struct blake2s_ctx *ctx, void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen); void blake2s_final(struct blake2s_ctx *ctx, u8 *out); -static inline void blake2s(const u8 *key, const size_t keylen, - const u8 *in, const size_t inlen, - u8 *out, const size_t outlen) +static inline void blake2s(const u8 *key, size_t keylen, + const u8 *in, size_t inlen, + u8 *out, size_t outlen) { struct blake2s_ctx ctx; diff --git a/lib/crypto/arm/blake2s-core.S b/lib/crypto/arm/blake2s-core.S index 78e758a7cb3e..14eb7c18a836 100644 --- a/lib/crypto/arm/blake2s-core.S +++ b/lib/crypto/arm/blake2s-core.S @@ -171,7 +171,7 @@ // // void blake2s_compress(struct blake2s_ctx *ctx, -// const u8 *block, size_t nblocks, u32 inc); +// const u8 *data, size_t nblocks, u32 inc); // // Only the first three fields of struct blake2s_ctx are used: // u32 h[8]; (inout) @@ -184,7 +184,7 @@ ENTRY(blake2s_compress) .Lnext_block: // r0 is 'ctx' - // r1 is 'block' + // r1 is 'data' // r3 is 'inc' // Load and increment the 
counter t[0..1]. @@ -275,7 +275,7 @@ ENTRY(blake2s_compress) // Advance to the next block, if there is one. Note that if there are // multiple blocks, then 'inc' (the counter increment amount) must be // 64. So we can simply set it to 64 without re-loading it. - ldm sp, {r0, r1, r2} // load (ctx, block, nblocks) + ldm sp, {r0, r1, r2} // load (ctx, data, nblocks) mov r3, #64 // set 'inc' subs r2, r2, #1 // nblocks-- str r2, [sp, #8] diff --git a/lib/crypto/arm/blake2s.h b/lib/crypto/arm/blake2s.h index ce009cd98de9..42c04440c191 100644 --- a/lib/crypto/arm/blake2s.h +++ b/lib/crypto/arm/blake2s.h @@ -2,4 +2,4 @@ /* defined in blake2s-core.S */ void blake2s_compress(struct blake2s_ctx *ctx, - const u8 *block, size_t nblocks, u32 inc); + const u8 *data, size_t nblocks, u32 inc); diff --git a/lib/crypto/blake2s.c b/lib/crypto/blake2s.c index 1ad36cb29835..6182c21ed943 100644 --- a/lib/crypto/blake2s.c +++ b/lib/crypto/blake2s.c @@ -29,16 +29,15 @@ static const u8 blake2s_sigma[10][16] = { { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, }; -static inline void blake2s_increment_counter(struct blake2s_ctx *ctx, - const u32 inc) +static inline void blake2s_increment_counter(struct blake2s_ctx *ctx, u32 inc) { ctx->t[0] += inc; ctx->t[1] += (ctx->t[0] < inc); } static void __maybe_unused -blake2s_compress_generic(struct blake2s_ctx *ctx, const u8 *block, - size_t nblocks, const u32 inc) +blake2s_compress_generic(struct blake2s_ctx *ctx, + const u8 *data, size_t nblocks, u32 inc) { u32 m[16]; u32 v[16]; @@ -49,7 +48,7 @@ blake2s_compress_generic(struct blake2s_ctx *ctx, const u8 *block, while (nblocks > 0) { blake2s_increment_counter(ctx, inc); - memcpy(m, block, BLAKE2S_BLOCK_SIZE); + memcpy(m, data, BLAKE2S_BLOCK_SIZE); le32_to_cpu_array(m, ARRAY_SIZE(m)); memcpy(v, ctx->h, 32); v[ 8] = BLAKE2S_IV0; @@ -99,7 +98,7 @@ blake2s_compress_generic(struct blake2s_ctx *ctx, const u8 *block, for (i = 0; i < 8; ++i) ctx->h[i] ^= v[i] ^ v[i + 8]; - block += BLAKE2S_BLOCK_SIZE; + data += BLAKE2S_BLOCK_SIZE; --nblocks; } } @@ -130,6 +129,7 @@ void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen) } if (inlen > BLAKE2S_BLOCK_SIZE) { const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE); + blake2s_compress(ctx, in, nblocks - 1, BLAKE2S_BLOCK_SIZE); in += BLAKE2S_BLOCK_SIZE * (nblocks - 1); inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); diff --git a/lib/crypto/x86/blake2s.h b/lib/crypto/x86/blake2s.h index de360935b820..f8eed6cb042e 100644 --- a/lib/crypto/x86/blake2s.h +++ b/lib/crypto/x86/blake2s.h @@ -12,23 +12,21 @@ #include asmlinkage void blake2s_compress_ssse3(struct blake2s_ctx *ctx, - const u8 *block, const size_t nblocks, - const u32 inc); + const u8 *data, size_t nblocks, u32 inc); asmlinkage void blake2s_compress_avx512(struct blake2s_ctx *ctx, - const u8 *block, const size_t nblocks, - const u32 inc); + const u8 *data, size_t nblocks, u32 inc); static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); -static void blake2s_compress(struct blake2s_ctx *ctx, const u8 *block, - size_t nblocks, const u32 inc) +static void blake2s_compress(struct blake2s_ctx *ctx, + const u8 *data, size_t nblocks, u32 inc) { /* SIMD disables preemption, so relax after processing each page. 
*/ BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8); if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) { - blake2s_compress_generic(ctx, block, nblocks, inc); + blake2s_compress_generic(ctx, data, nblocks, inc); return; } @@ -38,13 +36,13 @@ static void blake2s_compress(struct blake2s_ctx *ctx, const u8 *block, kernel_fpu_begin(); if (static_branch_likely(&blake2s_use_avx512)) - blake2s_compress_avx512(ctx, block, blocks, inc); + blake2s_compress_avx512(ctx, data, blocks, inc); else - blake2s_compress_ssse3(ctx, block, blocks, inc); + blake2s_compress_ssse3(ctx, data, blocks, inc); kernel_fpu_end(); + data += blocks * BLAKE2S_BLOCK_SIZE; nblocks -= blocks; - block += blocks * BLAKE2S_BLOCK_SIZE; } while (nblocks); } From b95d4471cb5830b59667ead8d1d59dc3d661a1df Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 17 Oct 2025 21:31:00 -0700 Subject: [PATCH 05/37] lib/crypto: blake2s: Document the BLAKE2s library API Add kerneldoc for the BLAKE2s library API. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251018043106.375964-5-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/blake2s.h | 58 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/include/crypto/blake2s.h b/include/crypto/blake2s.h index 33893057eb41..648cb7824358 100644 --- a/include/crypto/blake2s.h +++ b/include/crypto/blake2s.h @@ -22,6 +22,15 @@ enum blake2s_lengths { BLAKE2S_256_HASH_SIZE = 32, }; +/** + * struct blake2s_ctx - Context for hashing a message with BLAKE2s + * @h: compression function state + * @t: block counter + * @f: finalization indicator + * @buf: partial block buffer; 'buflen' bytes are valid + * @buflen: number of bytes buffered in @buf + * @outlen: length of output hash value in bytes, at most BLAKE2S_HASH_SIZE + */ struct blake2s_ctx { /* 'h', 't', and 'f' are used in assembly code, so keep them as-is. */ u32 h[8]; @@ -67,11 +76,27 @@ static inline void __blake2s_init(struct blake2s_ctx *ctx, size_t outlen, } } +/** + * blake2s_init() - Initialize a BLAKE2s context for a new message (unkeyed) + * @ctx: the context to initialize + * @outlen: length of output hash value in bytes, at most BLAKE2S_HASH_SIZE + * + * Context: Any context. + */ static inline void blake2s_init(struct blake2s_ctx *ctx, size_t outlen) { __blake2s_init(ctx, outlen, NULL, 0); } +/** + * blake2s_init_key() - Initialize a BLAKE2s context for a new message (keyed) + * @ctx: the context to initialize + * @outlen: length of output hash value in bytes, at most BLAKE2S_HASH_SIZE + * @key: the key + * @keylen: the key length in bytes, at most BLAKE2S_KEY_SIZE + * + * Context: Any context. + */ static inline void blake2s_init_key(struct blake2s_ctx *ctx, size_t outlen, const void *key, size_t keylen) { @@ -81,9 +106,42 @@ static inline void blake2s_init_key(struct blake2s_ctx *ctx, size_t outlen, __blake2s_init(ctx, outlen, key, keylen); } +/** + * blake2s_update() - Update a BLAKE2s context with message data + * @ctx: the context to update; must have been initialized + * @in: the message data + * @inlen: the data length in bytes + * + * This can be called any number of times. + * + * Context: Any context. + */ void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen); + +/** + * blake2s_final() - Finish computing a BLAKE2s hash + * @ctx: the context to finalize; must have been initialized + * @out: (output) the resulting BLAKE2s hash. Its length will be equal to the + * @outlen that was passed to blake2s_init() or blake2s_init_key(). 
+ * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ void blake2s_final(struct blake2s_ctx *ctx, u8 *out); +/** + * blake2s() - Compute BLAKE2s hash in one shot + * @key: the key, or NULL for an unkeyed hash + * @keylen: the key length in bytes (at most BLAKE2S_KEY_SIZE), or 0 for an + * unkeyed hash + * @in: the message data + * @inlen: the data length in bytes + * @out: (output) the resulting BLAKE2s hash, with length @outlen + * @outlen: length of output hash value in bytes, at most BLAKE2S_HASH_SIZE + * + * Context: Any context. + */ static inline void blake2s(const u8 *key, size_t keylen, const u8 *in, size_t inlen, u8 *out, size_t outlen) From c99d30706043481a1d631bbd9c7a4b70fe002a2b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 17 Oct 2025 21:31:01 -0700 Subject: [PATCH 06/37] byteorder: Add le64_to_cpu_array() and cpu_to_le64_array() Add le64_to_cpu_array() and cpu_to_le64_array(). These mirror the corresponding 32-bit functions. These will be used by the BLAKE2b code. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251018043106.375964-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/byteorder/generic.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h index b3705e8bbe2b..55a44199de87 100644 --- a/include/linux/byteorder/generic.h +++ b/include/linux/byteorder/generic.h @@ -173,6 +173,22 @@ static inline void cpu_to_le32_array(u32 *buf, unsigned int words) } } +static inline void le64_to_cpu_array(u64 *buf, unsigned int words) +{ + while (words--) { + __le64_to_cpus(buf); + buf++; + } +} + +static inline void cpu_to_le64_array(u64 *buf, unsigned int words) +{ + while (words--) { + __cpu_to_le64s(buf); + buf++; + } +} + static inline void memcpy_from_le32(u32 *dst, const __le32 *src, size_t words) { size_t i; From 23a16c9533ed92cc639c8f5bd9eb104809fe2919 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 17 Oct 2025 21:31:02 -0700 Subject: [PATCH 07/37] lib/crypto: blake2b: Add BLAKE2b library functions Add a library API for BLAKE2b, closely modeled after the BLAKE2s API. This will allow in-kernel users such as btrfs to use BLAKE2b without going through the generic crypto layer. In addition, as usual the BLAKE2b crypto_shash algorithms will be reimplemented on top of this. Note: to create lib/crypto/blake2b.c I made a copy of lib/crypto/blake2s.c and made the updates from BLAKE2s => BLAKE2b. This way, the BLAKE2s and BLAKE2b code is kept consistent. Therefore, it borrows the SPDX-License-Identifier and Copyright from lib/crypto/blake2s.c rather than crypto/blake2b_generic.c. The library API uses 'struct blake2b_ctx', consistent with other lib/crypto/ APIs. The existing 'struct blake2b_state' will be removed once the blake2b crypto_shash algorithms are updated to stop using it. 
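For illustration, hypothetical usage of the new library API (a user
would also 'select CRYPTO_LIB_BLAKE2B' in Kconfig; the data and key
variables here are made up):

	u8 digest[BLAKE2B_HASH_SIZE];

	/* Unkeyed, full-length (64-byte) BLAKE2b digest. */
	blake2b(NULL, 0, data, data_len, digest, sizeof(digest));

	/* Keyed, truncated to 32 bytes: BLAKE2b's native MAC/PRF mode. */
	blake2b(key, key_len, data, data_len, digest, 32);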
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251018043106.375964-7-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/blake2b.h | 133 ++++++++++++++++++++--- include/crypto/internal/blake2b.h | 17 ++- lib/crypto/Kconfig | 10 ++ lib/crypto/Makefile | 9 ++ lib/crypto/blake2b.c | 174 ++++++++++++++++++++++++++++++ 5 files changed, 330 insertions(+), 13 deletions(-) create mode 100644 lib/crypto/blake2b.c diff --git a/include/crypto/blake2b.h b/include/crypto/blake2b.h index dd7694477e50..4879e2ec2686 100644 --- a/include/crypto/blake2b.h +++ b/include/crypto/blake2b.h @@ -28,6 +28,25 @@ enum blake2b_lengths { BLAKE2B_512_HASH_SIZE = 64, }; +/** + * struct blake2b_ctx - Context for hashing a message with BLAKE2b + * @h: compression function state + * @t: block counter + * @f: finalization indicator + * @buf: partial block buffer; 'buflen' bytes are valid + * @buflen: number of bytes buffered in @buf + * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE + */ +struct blake2b_ctx { + /* 'h', 't', and 'f' are used in assembly code, so keep them as-is. */ + u64 h[8]; + u64 t[2]; + u64 f[2]; + u8 buf[BLAKE2B_BLOCK_SIZE]; + unsigned int buflen; + unsigned int outlen; +}; + enum blake2b_iv { BLAKE2B_IV0 = 0x6A09E667F3BCC908ULL, BLAKE2B_IV1 = 0xBB67AE8584CAA73BULL, @@ -39,19 +58,109 @@ enum blake2b_iv { BLAKE2B_IV7 = 0x5BE0CD19137E2179ULL, }; -static inline void __blake2b_init(struct blake2b_state *state, size_t outlen, - size_t keylen) +static inline void __blake2b_init(struct blake2b_ctx *ctx, size_t outlen, + const void *key, size_t keylen) { - state->h[0] = BLAKE2B_IV0 ^ (0x01010000 | keylen << 8 | outlen); - state->h[1] = BLAKE2B_IV1; - state->h[2] = BLAKE2B_IV2; - state->h[3] = BLAKE2B_IV3; - state->h[4] = BLAKE2B_IV4; - state->h[5] = BLAKE2B_IV5; - state->h[6] = BLAKE2B_IV6; - state->h[7] = BLAKE2B_IV7; - state->t[0] = 0; - state->t[1] = 0; + ctx->h[0] = BLAKE2B_IV0 ^ (0x01010000 | keylen << 8 | outlen); + ctx->h[1] = BLAKE2B_IV1; + ctx->h[2] = BLAKE2B_IV2; + ctx->h[3] = BLAKE2B_IV3; + ctx->h[4] = BLAKE2B_IV4; + ctx->h[5] = BLAKE2B_IV5; + ctx->h[6] = BLAKE2B_IV6; + ctx->h[7] = BLAKE2B_IV7; + ctx->t[0] = 0; + ctx->t[1] = 0; + ctx->f[0] = 0; + ctx->f[1] = 0; + ctx->buflen = 0; + ctx->outlen = outlen; + if (keylen) { + memcpy(ctx->buf, key, keylen); + memset(&ctx->buf[keylen], 0, BLAKE2B_BLOCK_SIZE - keylen); + ctx->buflen = BLAKE2B_BLOCK_SIZE; + } +} + +/** + * blake2b_init() - Initialize a BLAKE2b context for a new message (unkeyed) + * @ctx: the context to initialize + * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE + * + * Context: Any context. + */ +static inline void blake2b_init(struct blake2b_ctx *ctx, size_t outlen) +{ + __blake2b_init(ctx, outlen, NULL, 0); +} + +/** + * blake2b_init_key() - Initialize a BLAKE2b context for a new message (keyed) + * @ctx: the context to initialize + * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE + * @key: the key + * @keylen: the key length in bytes, at most BLAKE2B_KEY_SIZE + * + * Context: Any context. 
+ */ +static inline void blake2b_init_key(struct blake2b_ctx *ctx, size_t outlen, + const void *key, size_t keylen) +{ + WARN_ON(IS_ENABLED(DEBUG) && (!outlen || outlen > BLAKE2B_HASH_SIZE || + !key || !keylen || keylen > BLAKE2B_KEY_SIZE)); + + __blake2b_init(ctx, outlen, key, keylen); +} + +/** + * blake2b_update() - Update a BLAKE2b context with message data + * @ctx: the context to update; must have been initialized + * @in: the message data + * @inlen: the data length in bytes + * + * This can be called any number of times. + * + * Context: Any context. + */ +void blake2b_update(struct blake2b_ctx *ctx, const u8 *in, size_t inlen); + +/** + * blake2b_final() - Finish computing a BLAKE2b hash + * @ctx: the context to finalize; must have been initialized + * @out: (output) the resulting BLAKE2b hash. Its length will be equal to the + * @outlen that was passed to blake2b_init() or blake2b_init_key(). + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ +void blake2b_final(struct blake2b_ctx *ctx, u8 *out); + +/** + * blake2b() - Compute BLAKE2b hash in one shot + * @key: the key, or NULL for an unkeyed hash + * @keylen: the key length in bytes (at most BLAKE2B_KEY_SIZE), or 0 for an + * unkeyed hash + * @in: the message data + * @inlen: the data length in bytes + * @out: (output) the resulting BLAKE2b hash, with length @outlen + * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE + * + * Context: Any context. + */ +static inline void blake2b(const u8 *key, size_t keylen, + const u8 *in, size_t inlen, + u8 *out, size_t outlen) +{ + struct blake2b_ctx ctx; + + WARN_ON(IS_ENABLED(DEBUG) && ((!in && inlen > 0) || !out || !outlen || + outlen > BLAKE2B_HASH_SIZE || keylen > BLAKE2B_KEY_SIZE || + (!key && keylen))); + + __blake2b_init(&ctx, outlen, key, keylen); + blake2b_update(&ctx, in, inlen); + blake2b_final(&ctx, out); } #endif /* _CRYPTO_BLAKE2B_H */ diff --git a/include/crypto/internal/blake2b.h b/include/crypto/internal/blake2b.h index 3e09e2485306..3712df69def1 100644 --- a/include/crypto/internal/blake2b.h +++ b/include/crypto/internal/blake2b.h @@ -57,13 +57,28 @@ static inline int crypto_blake2b_setkey(struct crypto_shash *tfm, return 0; } +static inline void __crypto_blake2b_init(struct blake2b_state *state, + size_t outlen, size_t keylen) +{ + state->h[0] = BLAKE2B_IV0 ^ (0x01010000 | keylen << 8 | outlen); + state->h[1] = BLAKE2B_IV1; + state->h[2] = BLAKE2B_IV2; + state->h[3] = BLAKE2B_IV3; + state->h[4] = BLAKE2B_IV4; + state->h[5] = BLAKE2B_IV5; + state->h[6] = BLAKE2B_IV6; + state->h[7] = BLAKE2B_IV7; + state->t[0] = 0; + state->t[1] = 0; +} + static inline int crypto_blake2b_init(struct shash_desc *desc) { const struct blake2b_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct blake2b_state *state = shash_desc_ctx(desc); unsigned int outlen = crypto_shash_digestsize(desc->tfm); - __blake2b_init(state, outlen, tctx->keylen); + __crypto_blake2b_init(state, outlen, tctx->keylen); return tctx->keylen ? crypto_shash_update(desc, tctx->key, BLAKE2B_BLOCK_SIZE) : 0; } diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 8886055e938f..918378b7e833 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -28,6 +28,16 @@ config CRYPTO_LIB_ARC4 config CRYPTO_LIB_GF128MUL tristate +config CRYPTO_LIB_BLAKE2B + tristate + help + The BLAKE2b library functions. Select this if your module uses any of + the functions from . 
+ +config CRYPTO_LIB_BLAKE2B_ARCH + bool + depends on CRYPTO_LIB_BLAKE2B && !UML + # BLAKE2s support is always built-in, so there's no CRYPTO_LIB_BLAKE2S option. config CRYPTO_LIB_BLAKE2S_ARCH diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index bded351aeace..f863417b1681 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -31,6 +31,15 @@ obj-$(CONFIG_CRYPTO_LIB_GF128MUL) += gf128mul.o ################################################################################ +obj-$(CONFIG_CRYPTO_LIB_BLAKE2B) += libblake2b.o +libblake2b-y := blake2b.o +CFLAGS_blake2b.o := -Wframe-larger-than=4096 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105930 +ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2B_ARCH),y) +CFLAGS_blake2b.o += -I$(src)/$(SRCARCH) +endif # CONFIG_CRYPTO_LIB_BLAKE2B_ARCH + +################################################################################ + # blake2s is used by the /dev/random driver which is always builtin obj-y += blake2s.o ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2S_ARCH),y) diff --git a/lib/crypto/blake2b.c b/lib/crypto/blake2b.c new file mode 100644 index 000000000000..09c6d65d8a6e --- /dev/null +++ b/lib/crypto/blake2b.c @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. + * Copyright 2025 Google LLC + * + * This is an implementation of the BLAKE2b hash and PRF functions. + * + * Information: https://blake2.net/ + */ + +#include +#include +#include +#include +#include +#include +#include + +static const u8 blake2b_sigma[12][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } +}; + +static inline void blake2b_increment_counter(struct blake2b_ctx *ctx, u32 inc) +{ + ctx->t[0] += inc; + ctx->t[1] += (ctx->t[0] < inc); +} + +static void __maybe_unused +blake2b_compress_generic(struct blake2b_ctx *ctx, + const u8 *data, size_t nblocks, u32 inc) +{ + u64 m[16]; + u64 v[16]; + int i; + + WARN_ON(IS_ENABLED(DEBUG) && + (nblocks > 1 && inc != BLAKE2B_BLOCK_SIZE)); + + while (nblocks > 0) { + blake2b_increment_counter(ctx, inc); + memcpy(m, data, BLAKE2B_BLOCK_SIZE); + le64_to_cpu_array(m, ARRAY_SIZE(m)); + memcpy(v, ctx->h, 64); + v[ 8] = BLAKE2B_IV0; + v[ 9] = BLAKE2B_IV1; + v[10] = BLAKE2B_IV2; + v[11] = BLAKE2B_IV3; + v[12] = BLAKE2B_IV4 ^ ctx->t[0]; + v[13] = BLAKE2B_IV5 ^ ctx->t[1]; + v[14] = BLAKE2B_IV6 ^ ctx->f[0]; + v[15] = BLAKE2B_IV7 ^ ctx->f[1]; + +#define G(r, i, a, b, c, d) do { \ + a += b + m[blake2b_sigma[r][2 * i + 0]]; \ + d = ror64(d ^ a, 32); \ + c += d; \ + b = ror64(b ^ c, 24); \ + a += b + m[blake2b_sigma[r][2 * i + 1]]; \ + d = ror64(d ^ a, 16); \ + c += d; \ + b = ror64(b ^ c, 63); \ +} while (0) + +#define ROUND(r) do { \ + G(r, 0, v[0], v[ 4], v[ 8], v[12]); \ + G(r, 1, v[1], v[ 5], v[ 9], v[13]); \ + G(r, 2, v[2], v[ 6], v[10], v[14]); \ + G(r, 3, v[3], v[ 7], v[11], v[15]); \ + G(r, 4, v[0], v[ 5], v[10], 
v[15]); \ + G(r, 5, v[1], v[ 6], v[11], v[12]); \ + G(r, 6, v[2], v[ 7], v[ 8], v[13]); \ + G(r, 7, v[3], v[ 4], v[ 9], v[14]); \ +} while (0) + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + ROUND(10); + ROUND(11); + +#undef G +#undef ROUND + + for (i = 0; i < 8; ++i) + ctx->h[i] ^= v[i] ^ v[i + 8]; + + data += BLAKE2B_BLOCK_SIZE; + --nblocks; + } +} + +#ifdef CONFIG_CRYPTO_LIB_BLAKE2B_ARCH +#include "blake2b.h" /* $(SRCARCH)/blake2b.h */ +#else +#define blake2b_compress blake2b_compress_generic +#endif + +static inline void blake2b_set_lastblock(struct blake2b_ctx *ctx) +{ + ctx->f[0] = -1; +} + +void blake2b_update(struct blake2b_ctx *ctx, const u8 *in, size_t inlen) +{ + const size_t fill = BLAKE2B_BLOCK_SIZE - ctx->buflen; + + if (unlikely(!inlen)) + return; + if (inlen > fill) { + memcpy(ctx->buf + ctx->buflen, in, fill); + blake2b_compress(ctx, ctx->buf, 1, BLAKE2B_BLOCK_SIZE); + ctx->buflen = 0; + in += fill; + inlen -= fill; + } + if (inlen > BLAKE2B_BLOCK_SIZE) { + const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2B_BLOCK_SIZE); + + blake2b_compress(ctx, in, nblocks - 1, BLAKE2B_BLOCK_SIZE); + in += BLAKE2B_BLOCK_SIZE * (nblocks - 1); + inlen -= BLAKE2B_BLOCK_SIZE * (nblocks - 1); + } + memcpy(ctx->buf + ctx->buflen, in, inlen); + ctx->buflen += inlen; +} +EXPORT_SYMBOL(blake2b_update); + +void blake2b_final(struct blake2b_ctx *ctx, u8 *out) +{ + WARN_ON(IS_ENABLED(DEBUG) && !out); + blake2b_set_lastblock(ctx); + memset(ctx->buf + ctx->buflen, 0, + BLAKE2B_BLOCK_SIZE - ctx->buflen); /* Padding */ + blake2b_compress(ctx, ctx->buf, 1, ctx->buflen); + cpu_to_le64_array(ctx->h, ARRAY_SIZE(ctx->h)); + memcpy(out, ctx->h, ctx->outlen); + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL(blake2b_final); + +#ifdef blake2b_mod_init_arch +static int __init blake2b_mod_init(void) +{ + blake2b_mod_init_arch(); + return 0; +} +subsys_initcall(blake2b_mod_init); + +static void __exit blake2b_mod_exit(void) +{ +} +module_exit(blake2b_mod_exit); +#endif + +MODULE_DESCRIPTION("BLAKE2b hash function"); +MODULE_LICENSE("GPL"); From ba6617bd47c2263bd2ead34e1b31d90c66af5dea Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 17 Oct 2025 21:31:03 -0700 Subject: [PATCH 08/37] lib/crypto: arm/blake2b: Migrate optimized code into library Migrate the arm-optimized BLAKE2b code from arch/arm/crypto/ to lib/crypto/arm/. This makes the BLAKE2b library able to use it, and it also simplifies the code because it's easier to integrate with the library than crypto_shash. This temporarily makes the arm-optimized BLAKE2b code unavailable via crypto_shash. A later commit reimplements the blake2b-* crypto_shash algorithms on top of the BLAKE2b library API, making it available again. Note that as per the lib/crypto/ convention, the optimized code is now enabled by default. So, this also fixes the longstanding issue where the optimized BLAKE2b code was not enabled by default. To see the diff from arch/arm/crypto/blake2b-neon-glue.c to lib/crypto/arm/blake2b.h, view this commit with 'git show -M10'. 
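
For reference, the library's compile-time dispatch (added to
lib/crypto/blake2b.c by the previous commit) is what picks this code up:

	#ifdef CONFIG_CRYPTO_LIB_BLAKE2B_ARCH
	#include "blake2b.h"	/* $(SRCARCH)/blake2b.h */
	#else
	#define blake2b_compress blake2b_compress_generic
	#endif

The new lib/crypto/arm/blake2b.h then defines blake2b_compress() to use
the NEON code when the CPU supports it and SIMD is currently usable,
falling back to blake2b_compress_generic() otherwise.
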
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251018043106.375964-8-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm/crypto/Kconfig | 16 --- arch/arm/crypto/Makefile | 2 - arch/arm/crypto/blake2b-neon-glue.c | 104 ------------------ lib/crypto/Kconfig | 1 + lib/crypto/Makefile | 1 + .../crypto/arm}/blake2b-neon-core.S | 29 ++--- lib/crypto/arm/blake2b.h | 41 +++++++ 7 files changed, 59 insertions(+), 135 deletions(-) delete mode 100644 arch/arm/crypto/blake2b-neon-glue.c rename {arch/arm/crypto => lib/crypto/arm}/blake2b-neon-core.S (94%) create mode 100644 lib/crypto/arm/blake2b.h diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig index c436eec22d86..f30d743df264 100644 --- a/arch/arm/crypto/Kconfig +++ b/arch/arm/crypto/Kconfig @@ -33,22 +33,6 @@ config CRYPTO_NHPOLY1305_NEON Architecture: arm using: - NEON (Advanced SIMD) extensions -config CRYPTO_BLAKE2B_NEON - tristate "Hash functions: BLAKE2b (NEON)" - depends on KERNEL_MODE_NEON - select CRYPTO_BLAKE2B - help - BLAKE2b cryptographic hash function (RFC 7693) - - Architecture: arm using - - NEON (Advanced SIMD) extensions - - BLAKE2b digest algorithm optimized with ARM NEON instructions. - On ARM processors that have NEON support but not the ARMv8 - Crypto Extensions, typically this BLAKE2b implementation is - much faster than the SHA-2 family and slightly faster than - SHA-1. - config CRYPTO_AES_ARM tristate "Ciphers: AES" select CRYPTO_ALGAPI diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index 6346a73effc0..86dd43313dbf 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -5,7 +5,6 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o -obj-$(CONFIG_CRYPTO_BLAKE2B_NEON) += blake2b-neon.o obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o @@ -13,7 +12,6 @@ obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o aes-arm-y := aes-cipher-core.o aes-cipher-glue.o aes-arm-bs-y := aes-neonbs-core.o aes-neonbs-glue.o -blake2b-neon-y := blake2b-neon-core.o blake2b-neon-glue.o aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o diff --git a/arch/arm/crypto/blake2b-neon-glue.c b/arch/arm/crypto/blake2b-neon-glue.c deleted file mode 100644 index 2ff443a91724..000000000000 --- a/arch/arm/crypto/blake2b-neon-glue.c +++ /dev/null @@ -1,104 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * BLAKE2b digest algorithm, NEON accelerated - * - * Copyright 2020 Google LLC - */ - -#include -#include - -#include -#include - -#include -#include - -asmlinkage void blake2b_compress_neon(struct blake2b_state *state, - const u8 *block, size_t nblocks, u32 inc); - -static void blake2b_compress_arch(struct blake2b_state *state, - const u8 *block, size_t nblocks, u32 inc) -{ - do { - const size_t blocks = min_t(size_t, nblocks, - SZ_4K / BLAKE2B_BLOCK_SIZE); - - kernel_neon_begin(); - blake2b_compress_neon(state, block, blocks, inc); - kernel_neon_end(); - - nblocks -= blocks; - block += blocks * BLAKE2B_BLOCK_SIZE; - } while (nblocks); -} - -static int crypto_blake2b_update_neon(struct shash_desc *desc, - const u8 *in, unsigned int inlen) -{ - return crypto_blake2b_update_bo(desc, in, inlen, blake2b_compress_arch); -} - -static int crypto_blake2b_finup_neon(struct shash_desc *desc, const u8 *in, - unsigned int inlen, u8 *out) -{ - return crypto_blake2b_finup(desc, in, inlen, out, - 
blake2b_compress_arch); -} - -#define BLAKE2B_ALG(name, driver_name, digest_size) \ - { \ - .base.cra_name = name, \ - .base.cra_driver_name = driver_name, \ - .base.cra_priority = 200, \ - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY | \ - CRYPTO_AHASH_ALG_BLOCK_ONLY | \ - CRYPTO_AHASH_ALG_FINAL_NONZERO, \ - .base.cra_blocksize = BLAKE2B_BLOCK_SIZE, \ - .base.cra_ctxsize = sizeof(struct blake2b_tfm_ctx), \ - .base.cra_module = THIS_MODULE, \ - .digestsize = digest_size, \ - .setkey = crypto_blake2b_setkey, \ - .init = crypto_blake2b_init, \ - .update = crypto_blake2b_update_neon, \ - .finup = crypto_blake2b_finup_neon, \ - .descsize = sizeof(struct blake2b_state), \ - .statesize = BLAKE2B_STATE_SIZE, \ - } - -static struct shash_alg blake2b_neon_algs[] = { - BLAKE2B_ALG("blake2b-160", "blake2b-160-neon", BLAKE2B_160_HASH_SIZE), - BLAKE2B_ALG("blake2b-256", "blake2b-256-neon", BLAKE2B_256_HASH_SIZE), - BLAKE2B_ALG("blake2b-384", "blake2b-384-neon", BLAKE2B_384_HASH_SIZE), - BLAKE2B_ALG("blake2b-512", "blake2b-512-neon", BLAKE2B_512_HASH_SIZE), -}; - -static int __init blake2b_neon_mod_init(void) -{ - if (!(elf_hwcap & HWCAP_NEON)) - return -ENODEV; - - return crypto_register_shashes(blake2b_neon_algs, - ARRAY_SIZE(blake2b_neon_algs)); -} - -static void __exit blake2b_neon_mod_exit(void) -{ - crypto_unregister_shashes(blake2b_neon_algs, - ARRAY_SIZE(blake2b_neon_algs)); -} - -module_init(blake2b_neon_mod_init); -module_exit(blake2b_neon_mod_exit); - -MODULE_DESCRIPTION("BLAKE2b digest algorithm, NEON accelerated"); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Eric Biggers "); -MODULE_ALIAS_CRYPTO("blake2b-160"); -MODULE_ALIAS_CRYPTO("blake2b-160-neon"); -MODULE_ALIAS_CRYPTO("blake2b-256"); -MODULE_ALIAS_CRYPTO("blake2b-256-neon"); -MODULE_ALIAS_CRYPTO("blake2b-384"); -MODULE_ALIAS_CRYPTO("blake2b-384-neon"); -MODULE_ALIAS_CRYPTO("blake2b-512"); -MODULE_ALIAS_CRYPTO("blake2b-512-neon"); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 918378b7e833..280b888153bf 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -37,6 +37,7 @@ config CRYPTO_LIB_BLAKE2B config CRYPTO_LIB_BLAKE2B_ARCH bool depends on CRYPTO_LIB_BLAKE2B && !UML + default y if ARM && KERNEL_MODE_NEON # BLAKE2s support is always built-in, so there's no CRYPTO_LIB_BLAKE2S option. diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index f863417b1681..bc26777d08e9 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -36,6 +36,7 @@ libblake2b-y := blake2b.o CFLAGS_blake2b.o := -Wframe-larger-than=4096 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105930 ifeq ($(CONFIG_CRYPTO_LIB_BLAKE2B_ARCH),y) CFLAGS_blake2b.o += -I$(src)/$(SRCARCH) +libblake2b-$(CONFIG_ARM) += arm/blake2b-neon-core.o endif # CONFIG_CRYPTO_LIB_BLAKE2B_ARCH ################################################################################ diff --git a/arch/arm/crypto/blake2b-neon-core.S b/lib/crypto/arm/blake2b-neon-core.S similarity index 94% rename from arch/arm/crypto/blake2b-neon-core.S rename to lib/crypto/arm/blake2b-neon-core.S index 0406a186377f..b55c37f0b88f 100644 --- a/arch/arm/crypto/blake2b-neon-core.S +++ b/lib/crypto/arm/blake2b-neon-core.S @@ -1,6 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * BLAKE2b digest algorithm, NEON accelerated + * BLAKE2b digest algorithm optimized with ARM NEON instructions. On ARM + * processors that have NEON support but not the ARMv8 Crypto Extensions, + * typically this BLAKE2b implementation is much faster than the SHA-2 family + * and slightly faster than SHA-1. 
* * Copyright 2020 Google LLC * @@ -13,8 +16,8 @@ .fpu neon // The arguments to blake2b_compress_neon() - STATE .req r0 - BLOCK .req r1 + CTX .req r0 + DATA .req r1 NBLOCKS .req r2 INC .req r3 @@ -234,10 +237,10 @@ .endm // -// void blake2b_compress_neon(struct blake2b_state *state, -// const u8 *block, size_t nblocks, u32 inc); +// void blake2b_compress_neon(struct blake2b_ctx *ctx, +// const u8 *data, size_t nblocks, u32 inc); // -// Only the first three fields of struct blake2b_state are used: +// Only the first three fields of struct blake2b_ctx are used: // u64 h[8]; (inout) // u64 t[2]; (inout) // u64 f[2]; (in) @@ -255,7 +258,7 @@ ENTRY(blake2b_compress_neon) adr ROR24_TABLE, .Lror24_table adr ROR16_TABLE, .Lror16_table - mov ip, STATE + mov ip, CTX vld1.64 {q0-q1}, [ip]! // Load h[0..3] vld1.64 {q2-q3}, [ip]! // Load h[4..7] .Lnext_block: @@ -281,14 +284,14 @@ ENTRY(blake2b_compress_neon) // (q8-q9) in an aligned buffer on the stack so that they can be // reloaded when needed. (We could just reload directly from the // message buffer, but it's faster to use aligned loads.) - vld1.8 {q8-q9}, [BLOCK]! + vld1.8 {q8-q9}, [DATA]! veor q6, q6, q14 // v[12..13] = IV[4..5] ^ t[0..1] - vld1.8 {q10-q11}, [BLOCK]! + vld1.8 {q10-q11}, [DATA]! veor q7, q7, q15 // v[14..15] = IV[6..7] ^ f[0..1] - vld1.8 {q12-q13}, [BLOCK]! + vld1.8 {q12-q13}, [DATA]! vst1.8 {q8-q9}, [sp, :256] - mov ip, STATE - vld1.8 {q14-q15}, [BLOCK]! + mov ip, CTX + vld1.8 {q14-q15}, [DATA]! // Execute the rounds. Each round is provided the order in which it // needs to use the message words. @@ -319,7 +322,7 @@ ENTRY(blake2b_compress_neon) veor q3, q3, q7 // v[6..7] ^= v[14..15] veor q0, q0, q8 // v[0..1] ^= h[0..1] veor q1, q1, q9 // v[2..3] ^= h[2..3] - mov ip, STATE + mov ip, CTX subs NBLOCKS, NBLOCKS, #1 // nblocks-- vst1.64 {q0-q1}, [ip]! // Store new h[0..3] veor q2, q2, q10 // v[4..5] ^= h[4..5] diff --git a/lib/crypto/arm/blake2b.h b/lib/crypto/arm/blake2b.h new file mode 100644 index 000000000000..1b9154d119db --- /dev/null +++ b/lib/crypto/arm/blake2b.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * BLAKE2b digest algorithm, NEON accelerated + * + * Copyright 2020 Google LLC + */ + +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon); + +asmlinkage void blake2b_compress_neon(struct blake2b_ctx *ctx, + const u8 *data, size_t nblocks, u32 inc); + +static void blake2b_compress(struct blake2b_ctx *ctx, + const u8 *data, size_t nblocks, u32 inc) +{ + if (!static_branch_likely(&have_neon) || !may_use_simd()) { + blake2b_compress_generic(ctx, data, nblocks, inc); + return; + } + do { + const size_t blocks = min_t(size_t, nblocks, + SZ_4K / BLAKE2B_BLOCK_SIZE); + + kernel_neon_begin(); + blake2b_compress_neon(ctx, data, blocks, inc); + kernel_neon_end(); + + data += blocks * BLAKE2B_BLOCK_SIZE; + nblocks -= blocks; + } while (nblocks); +} + +#define blake2b_mod_init_arch blake2b_mod_init_arch +static void blake2b_mod_init_arch(void) +{ + if (elf_hwcap & HWCAP_NEON) + static_branch_enable(&have_neon); +} From fa3ca9bfe3f001ed306cb3ce9761dacffbe143f8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 17 Oct 2025 21:31:05 -0700 Subject: [PATCH 09/37] crypto: blake2b - Reimplement using library API Replace blake2b_generic.c with a new file blake2b.c which implements the BLAKE2b crypto_shash algorithms on top of the BLAKE2b library API. 
Change the driver name suffix from "-generic" to "-lib" to reflect that these algorithms now just use the (possibly arch-optimized) library. This closely mirrors crypto/{md5,sha1,sha256,sha512}.c. Remove include/crypto/internal/blake2b.h since it is no longer used. Likewise, remove struct blake2b_state from include/crypto/blake2b.h. Omit support for import_core and export_core, since there are no legacy drivers that need these for these algorithms. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251018043106.375964-10-ebiggers@kernel.org Signed-off-by: Eric Biggers --- crypto/Kconfig | 1 + crypto/Makefile | 3 +- crypto/blake2b.c | 111 +++++++++++++++++ crypto/blake2b_generic.c | 192 ------------------------------ crypto/testmgr.c | 4 + include/crypto/blake2b.h | 10 -- include/crypto/internal/blake2b.h | 116 ------------------ 7 files changed, 117 insertions(+), 320 deletions(-) create mode 100644 crypto/blake2b.c delete mode 100644 crypto/blake2b_generic.c delete mode 100644 include/crypto/internal/blake2b.h diff --git a/crypto/Kconfig b/crypto/Kconfig index a04595f9d0ca..0a7e74ac870b 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -881,6 +881,7 @@ menu "Hashes, digests, and MACs" config CRYPTO_BLAKE2B tristate "BLAKE2b" select CRYPTO_HASH + select CRYPTO_LIB_BLAKE2B help BLAKE2b cryptographic hash function (RFC 7693) diff --git a/crypto/Makefile b/crypto/Makefile index e430e6e99b6a..5b02ca2cb04e 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -83,8 +83,7 @@ obj-$(CONFIG_CRYPTO_SM3_GENERIC) += sm3_generic.o obj-$(CONFIG_CRYPTO_STREEBOG) += streebog_generic.o obj-$(CONFIG_CRYPTO_WP512) += wp512.o CFLAGS_wp512.o := $(call cc-option,-fno-schedule-insns) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149 -obj-$(CONFIG_CRYPTO_BLAKE2B) += blake2b_generic.o -CFLAGS_blake2b_generic.o := -Wframe-larger-than=4096 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105930 +obj-$(CONFIG_CRYPTO_BLAKE2B) += blake2b.o obj-$(CONFIG_CRYPTO_ECB) += ecb.o obj-$(CONFIG_CRYPTO_CBC) += cbc.o obj-$(CONFIG_CRYPTO_PCBC) += pcbc.o diff --git a/crypto/blake2b.c b/crypto/blake2b.c new file mode 100644 index 000000000000..67a6dae43a54 --- /dev/null +++ b/crypto/blake2b.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Crypto API support for BLAKE2b + * + * Copyright 2025 Google LLC + */ +#include +#include +#include +#include + +struct blake2b_tfm_ctx { + unsigned int keylen; + u8 key[BLAKE2B_KEY_SIZE]; +}; + +static int crypto_blake2b_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen) +{ + struct blake2b_tfm_ctx *tctx = crypto_shash_ctx(tfm); + + if (keylen > BLAKE2B_KEY_SIZE) + return -EINVAL; + memcpy(tctx->key, key, keylen); + tctx->keylen = keylen; + return 0; +} + +#define BLAKE2B_CTX(desc) ((struct blake2b_ctx *)shash_desc_ctx(desc)) + +static int crypto_blake2b_init(struct shash_desc *desc) +{ + const struct blake2b_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + unsigned int digestsize = crypto_shash_digestsize(desc->tfm); + + blake2b_init_key(BLAKE2B_CTX(desc), digestsize, + tctx->key, tctx->keylen); + return 0; +} + +static int crypto_blake2b_update(struct shash_desc *desc, + const u8 *data, unsigned int len) +{ + blake2b_update(BLAKE2B_CTX(desc), data, len); + return 0; +} + +static int crypto_blake2b_final(struct shash_desc *desc, u8 *out) +{ + blake2b_final(BLAKE2B_CTX(desc), out); + return 0; +} + +static int crypto_blake2b_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, u8 *out) +{ + const struct blake2b_tfm_ctx *tctx = 
crypto_shash_ctx(desc->tfm); + unsigned int digestsize = crypto_shash_digestsize(desc->tfm); + + blake2b(tctx->key, tctx->keylen, data, len, out, digestsize); + return 0; +} + +#define BLAKE2B_ALG(name, digest_size) \ + { \ + .base.cra_name = name, \ + .base.cra_driver_name = name "-lib", \ + .base.cra_priority = 300, \ + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \ + .base.cra_blocksize = BLAKE2B_BLOCK_SIZE, \ + .base.cra_ctxsize = sizeof(struct blake2b_tfm_ctx), \ + .base.cra_module = THIS_MODULE, \ + .digestsize = digest_size, \ + .setkey = crypto_blake2b_setkey, \ + .init = crypto_blake2b_init, \ + .update = crypto_blake2b_update, \ + .final = crypto_blake2b_final, \ + .digest = crypto_blake2b_digest, \ + .descsize = sizeof(struct blake2b_ctx), \ + } + +static struct shash_alg algs[] = { + BLAKE2B_ALG("blake2b-160", BLAKE2B_160_HASH_SIZE), + BLAKE2B_ALG("blake2b-256", BLAKE2B_256_HASH_SIZE), + BLAKE2B_ALG("blake2b-384", BLAKE2B_384_HASH_SIZE), + BLAKE2B_ALG("blake2b-512", BLAKE2B_512_HASH_SIZE), +}; + +static int __init crypto_blake2b_mod_init(void) +{ + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); +} +module_init(crypto_blake2b_mod_init); + +static void __exit crypto_blake2b_mod_exit(void) +{ + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} +module_exit(crypto_blake2b_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Crypto API support for BLAKE2b"); + +MODULE_ALIAS_CRYPTO("blake2b-160"); +MODULE_ALIAS_CRYPTO("blake2b-160-lib"); +MODULE_ALIAS_CRYPTO("blake2b-256"); +MODULE_ALIAS_CRYPTO("blake2b-256-lib"); +MODULE_ALIAS_CRYPTO("blake2b-384"); +MODULE_ALIAS_CRYPTO("blake2b-384-lib"); +MODULE_ALIAS_CRYPTO("blake2b-512"); +MODULE_ALIAS_CRYPTO("blake2b-512-lib"); diff --git a/crypto/blake2b_generic.c b/crypto/blake2b_generic.c deleted file mode 100644 index 60f056217510..000000000000 --- a/crypto/blake2b_generic.c +++ /dev/null @@ -1,192 +0,0 @@ -// SPDX-License-Identifier: (GPL-2.0-only OR Apache-2.0) -/* - * Generic implementation of the BLAKE2b digest algorithm. Based on the BLAKE2b - * reference implementation, but it has been heavily modified for use in the - * kernel. The reference implementation was: - * - * Copyright 2012, Samuel Neves . You may use this under - * the terms of the CC0, the OpenSSL Licence, or the Apache Public License - * 2.0, at your option. The terms of these licenses can be found at: - * - * - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 - * - OpenSSL license : https://www.openssl.org/source/license.html - * - Apache 2.0 : https://www.apache.org/licenses/LICENSE-2.0 - * - * More information about BLAKE2 can be found at https://blake2.net. 
- */ - -#include -#include -#include -#include -#include -#include - -static const u8 blake2b_sigma[12][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } -}; - -static void blake2b_increment_counter(struct blake2b_state *S, const u64 inc) -{ - S->t[0] += inc; - S->t[1] += (S->t[0] < inc); -} - -#define G(r,i,a,b,c,d) \ - do { \ - a = a + b + m[blake2b_sigma[r][2*i+0]]; \ - d = ror64(d ^ a, 32); \ - c = c + d; \ - b = ror64(b ^ c, 24); \ - a = a + b + m[blake2b_sigma[r][2*i+1]]; \ - d = ror64(d ^ a, 16); \ - c = c + d; \ - b = ror64(b ^ c, 63); \ - } while (0) - -#define ROUND(r) \ - do { \ - G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ - G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ - G(r,2,v[ 2],v[ 6],v[10],v[14]); \ - G(r,3,v[ 3],v[ 7],v[11],v[15]); \ - G(r,4,v[ 0],v[ 5],v[10],v[15]); \ - G(r,5,v[ 1],v[ 6],v[11],v[12]); \ - G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ - G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ - } while (0) - -static void blake2b_compress_one_generic(struct blake2b_state *S, - const u8 block[BLAKE2B_BLOCK_SIZE]) -{ - u64 m[16]; - u64 v[16]; - size_t i; - - for (i = 0; i < 16; ++i) - m[i] = get_unaligned_le64(block + i * sizeof(m[i])); - - for (i = 0; i < 8; ++i) - v[i] = S->h[i]; - - v[ 8] = BLAKE2B_IV0; - v[ 9] = BLAKE2B_IV1; - v[10] = BLAKE2B_IV2; - v[11] = BLAKE2B_IV3; - v[12] = BLAKE2B_IV4 ^ S->t[0]; - v[13] = BLAKE2B_IV5 ^ S->t[1]; - v[14] = BLAKE2B_IV6 ^ S->f[0]; - v[15] = BLAKE2B_IV7 ^ S->f[1]; - - ROUND(0); - ROUND(1); - ROUND(2); - ROUND(3); - ROUND(4); - ROUND(5); - ROUND(6); - ROUND(7); - ROUND(8); - ROUND(9); - ROUND(10); - ROUND(11); -#ifdef CONFIG_CC_IS_CLANG -#pragma nounroll /* https://llvm.org/pr45803 */ -#endif - for (i = 0; i < 8; ++i) - S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; -} - -#undef G -#undef ROUND - -static void blake2b_compress_generic(struct blake2b_state *state, - const u8 *block, size_t nblocks, u32 inc) -{ - do { - blake2b_increment_counter(state, inc); - blake2b_compress_one_generic(state, block); - block += BLAKE2B_BLOCK_SIZE; - } while (--nblocks); -} - -static int crypto_blake2b_update_generic(struct shash_desc *desc, - const u8 *in, unsigned int inlen) -{ - return crypto_blake2b_update_bo(desc, in, inlen, - blake2b_compress_generic); -} - -static int crypto_blake2b_finup_generic(struct shash_desc *desc, const u8 *in, - unsigned int inlen, u8 *out) -{ - return crypto_blake2b_finup(desc, in, inlen, out, - blake2b_compress_generic); -} - -#define BLAKE2B_ALG(name, driver_name, digest_size) \ - { \ - .base.cra_name = name, \ - .base.cra_driver_name = driver_name, \ - .base.cra_priority = 100, \ - .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY | \ - CRYPTO_AHASH_ALG_BLOCK_ONLY | \ - CRYPTO_AHASH_ALG_FINAL_NONZERO, \ - .base.cra_blocksize = BLAKE2B_BLOCK_SIZE, \ - .base.cra_ctxsize = sizeof(struct blake2b_tfm_ctx), \ - .base.cra_module = THIS_MODULE, \ - .digestsize = digest_size, \ - .setkey = crypto_blake2b_setkey, \ - 
.init = crypto_blake2b_init, \ - .update = crypto_blake2b_update_generic, \ - .finup = crypto_blake2b_finup_generic, \ - .descsize = BLAKE2B_DESC_SIZE, \ - .statesize = BLAKE2B_STATE_SIZE, \ - } - -static struct shash_alg blake2b_algs[] = { - BLAKE2B_ALG("blake2b-160", "blake2b-160-generic", - BLAKE2B_160_HASH_SIZE), - BLAKE2B_ALG("blake2b-256", "blake2b-256-generic", - BLAKE2B_256_HASH_SIZE), - BLAKE2B_ALG("blake2b-384", "blake2b-384-generic", - BLAKE2B_384_HASH_SIZE), - BLAKE2B_ALG("blake2b-512", "blake2b-512-generic", - BLAKE2B_512_HASH_SIZE), -}; - -static int __init blake2b_mod_init(void) -{ - return crypto_register_shashes(blake2b_algs, ARRAY_SIZE(blake2b_algs)); -} - -static void __exit blake2b_mod_fini(void) -{ - crypto_unregister_shashes(blake2b_algs, ARRAY_SIZE(blake2b_algs)); -} - -module_init(blake2b_mod_init); -module_exit(blake2b_mod_fini); - -MODULE_AUTHOR("David Sterba "); -MODULE_DESCRIPTION("BLAKE2b generic implementation"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_CRYPTO("blake2b-160"); -MODULE_ALIAS_CRYPTO("blake2b-160-generic"); -MODULE_ALIAS_CRYPTO("blake2b-256"); -MODULE_ALIAS_CRYPTO("blake2b-256-generic"); -MODULE_ALIAS_CRYPTO("blake2b-384"); -MODULE_ALIAS_CRYPTO("blake2b-384-generic"); -MODULE_ALIAS_CRYPTO("blake2b-512"); -MODULE_ALIAS_CRYPTO("blake2b-512-generic"); diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 6a490aaa71b9..3ab7adc1cdce 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -4332,6 +4332,7 @@ static const struct alg_test_desc alg_test_descs[] = { .fips_allowed = 1, }, { .alg = "blake2b-160", + .generic_driver = "blake2b-160-lib", .test = alg_test_hash, .fips_allowed = 0, .suite = { @@ -4339,6 +4340,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "blake2b-256", + .generic_driver = "blake2b-256-lib", .test = alg_test_hash, .fips_allowed = 0, .suite = { @@ -4346,6 +4348,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "blake2b-384", + .generic_driver = "blake2b-384-lib", .test = alg_test_hash, .fips_allowed = 0, .suite = { @@ -4353,6 +4356,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "blake2b-512", + .generic_driver = "blake2b-512-lib", .test = alg_test_hash, .fips_allowed = 0, .suite = { diff --git a/include/crypto/blake2b.h b/include/crypto/blake2b.h index 4879e2ec2686..3bc37fd103a7 100644 --- a/include/crypto/blake2b.h +++ b/include/crypto/blake2b.h @@ -7,20 +7,10 @@ #include #include -struct blake2b_state { - /* 'h', 't', and 'f' are used in assembly code, so keep them as-is. */ - u64 h[8]; - u64 t[2]; - /* The true state ends here. The rest is temporary storage. */ - u64 f[2]; -}; - enum blake2b_lengths { BLAKE2B_BLOCK_SIZE = 128, BLAKE2B_HASH_SIZE = 64, BLAKE2B_KEY_SIZE = 64, - BLAKE2B_STATE_SIZE = offsetof(struct blake2b_state, f), - BLAKE2B_DESC_SIZE = sizeof(struct blake2b_state), BLAKE2B_160_HASH_SIZE = 20, BLAKE2B_256_HASH_SIZE = 32, diff --git a/include/crypto/internal/blake2b.h b/include/crypto/internal/blake2b.h deleted file mode 100644 index 3712df69def1..000000000000 --- a/include/crypto/internal/blake2b.h +++ /dev/null @@ -1,116 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Helper functions for BLAKE2b implementations. - * Keep this in sync with the corresponding BLAKE2s header. 
- */ - -#ifndef _CRYPTO_INTERNAL_BLAKE2B_H -#define _CRYPTO_INTERNAL_BLAKE2B_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static inline void blake2b_set_lastblock(struct blake2b_state *state) -{ - state->f[0] = -1; - state->f[1] = 0; -} - -static inline void blake2b_set_nonlast(struct blake2b_state *state) -{ - state->f[0] = 0; - state->f[1] = 0; -} - -typedef void (*blake2b_compress_t)(struct blake2b_state *state, - const u8 *block, size_t nblocks, u32 inc); - -/* Helper functions for shash implementations of BLAKE2b */ - -struct blake2b_tfm_ctx { - u8 key[BLAKE2B_BLOCK_SIZE]; - unsigned int keylen; -}; - -static inline int crypto_blake2b_setkey(struct crypto_shash *tfm, - const u8 *key, unsigned int keylen) -{ - struct blake2b_tfm_ctx *tctx = crypto_shash_ctx(tfm); - - if (keylen > BLAKE2B_KEY_SIZE) - return -EINVAL; - - BUILD_BUG_ON(BLAKE2B_KEY_SIZE > BLAKE2B_BLOCK_SIZE); - - memcpy(tctx->key, key, keylen); - memset(tctx->key + keylen, 0, BLAKE2B_BLOCK_SIZE - keylen); - tctx->keylen = keylen; - - return 0; -} - -static inline void __crypto_blake2b_init(struct blake2b_state *state, - size_t outlen, size_t keylen) -{ - state->h[0] = BLAKE2B_IV0 ^ (0x01010000 | keylen << 8 | outlen); - state->h[1] = BLAKE2B_IV1; - state->h[2] = BLAKE2B_IV2; - state->h[3] = BLAKE2B_IV3; - state->h[4] = BLAKE2B_IV4; - state->h[5] = BLAKE2B_IV5; - state->h[6] = BLAKE2B_IV6; - state->h[7] = BLAKE2B_IV7; - state->t[0] = 0; - state->t[1] = 0; -} - -static inline int crypto_blake2b_init(struct shash_desc *desc) -{ - const struct blake2b_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); - struct blake2b_state *state = shash_desc_ctx(desc); - unsigned int outlen = crypto_shash_digestsize(desc->tfm); - - __crypto_blake2b_init(state, outlen, tctx->keylen); - return tctx->keylen ? - crypto_shash_update(desc, tctx->key, BLAKE2B_BLOCK_SIZE) : 0; -} - -static inline int crypto_blake2b_update_bo(struct shash_desc *desc, - const u8 *in, unsigned int inlen, - blake2b_compress_t compress) -{ - struct blake2b_state *state = shash_desc_ctx(desc); - - blake2b_set_nonlast(state); - compress(state, in, inlen / BLAKE2B_BLOCK_SIZE, BLAKE2B_BLOCK_SIZE); - return inlen - round_down(inlen, BLAKE2B_BLOCK_SIZE); -} - -static inline int crypto_blake2b_finup(struct shash_desc *desc, const u8 *in, - unsigned int inlen, u8 *out, - blake2b_compress_t compress) -{ - struct blake2b_state *state = shash_desc_ctx(desc); - u8 buf[BLAKE2B_BLOCK_SIZE]; - int i; - - memcpy(buf, in, inlen); - memset(buf + inlen, 0, BLAKE2B_BLOCK_SIZE - inlen); - blake2b_set_lastblock(state); - compress(state, buf, 1, inlen); - for (i = 0; i < ARRAY_SIZE(state->h); i++) - __cpu_to_le64s(&state->h[i]); - memcpy(out, state->h, crypto_shash_digestsize(desc->tfm)); - memzero_explicit(buf, sizeof(buf)); - return 0; -} - -#endif /* _CRYPTO_INTERNAL_BLAKE2B_H */ From 863ee5a3aa9111da80f87cda1f9d716055f4c11a Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 25 Oct 2025 22:50:18 -0700 Subject: [PATCH 10/37] crypto: s390/sha3 - Rename conflicting functions Rename the s390 sha3_*_init() functions to have an "s390_" prefix to avoid a name conflict with the upcoming SHA-3 library functions. Note: this code will be superseded later. This commit simply keeps the kernel building for the initial introduction of the library. 
[EB: dropped unnecessary rename of import and export functions, and improved commit message] Signed-off-by: David Howells Tested-by: Harald Freudenberger Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251026055032.1413733-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/s390/crypto/sha3_256_s390.c | 10 +++++----- arch/s390/crypto/sha3_512_s390.c | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/s390/crypto/sha3_256_s390.c b/arch/s390/crypto/sha3_256_s390.c index 03bb4f4bab70..7415d56649a5 100644 --- a/arch/s390/crypto/sha3_256_s390.c +++ b/arch/s390/crypto/sha3_256_s390.c @@ -19,7 +19,7 @@ #include "sha.h" -static int sha3_256_init(struct shash_desc *desc) +static int s390_sha3_256_init(struct shash_desc *desc) { struct s390_sha_ctx *sctx = shash_desc_ctx(desc); @@ -79,7 +79,7 @@ static int sha3_224_import(struct shash_desc *desc, const void *in) static struct shash_alg sha3_256_alg = { .digestsize = SHA3_256_DIGEST_SIZE, /* = 32 */ - .init = sha3_256_init, + .init = s390_sha3_256_init, .update = s390_sha_update_blocks, .finup = s390_sha_finup, .export = sha3_256_export, @@ -96,18 +96,18 @@ static struct shash_alg sha3_256_alg = { } }; -static int sha3_224_init(struct shash_desc *desc) +static int s390_sha3_224_init(struct shash_desc *desc) { struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - sha3_256_init(desc); + s390_sha3_256_init(desc); sctx->func = CPACF_KIMD_SHA3_224; return 0; } static struct shash_alg sha3_224_alg = { .digestsize = SHA3_224_DIGEST_SIZE, - .init = sha3_224_init, + .init = s390_sha3_224_init, .update = s390_sha_update_blocks, .finup = s390_sha_finup, .export = sha3_256_export, /* same as for 256 */ diff --git a/arch/s390/crypto/sha3_512_s390.c b/arch/s390/crypto/sha3_512_s390.c index a5c9690eecb1..ff6ee5584400 100644 --- a/arch/s390/crypto/sha3_512_s390.c +++ b/arch/s390/crypto/sha3_512_s390.c @@ -18,7 +18,7 @@ #include "sha.h" -static int sha3_512_init(struct shash_desc *desc) +static int s390_sha3_512_init(struct shash_desc *desc) { struct s390_sha_ctx *sctx = shash_desc_ctx(desc); @@ -78,7 +78,7 @@ static int sha3_384_import(struct shash_desc *desc, const void *in) static struct shash_alg sha3_512_alg = { .digestsize = SHA3_512_DIGEST_SIZE, - .init = sha3_512_init, + .init = s390_sha3_512_init, .update = s390_sha_update_blocks, .finup = s390_sha_finup, .export = sha3_512_export, @@ -97,18 +97,18 @@ static struct shash_alg sha3_512_alg = { MODULE_ALIAS_CRYPTO("sha3-512"); -static int sha3_384_init(struct shash_desc *desc) +static int s390_sha3_384_init(struct shash_desc *desc) { struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - sha3_512_init(desc); + s390_sha3_512_init(desc); sctx->func = CPACF_KIMD_SHA3_384; return 0; } static struct shash_alg sha3_384_alg = { .digestsize = SHA3_384_DIGEST_SIZE, - .init = sha3_384_init, + .init = s390_sha3_384_init, .update = s390_sha_update_blocks, .finup = s390_sha_finup, .export = sha3_512_export, /* same as for 512 */ From 414121190348edc3da26505fc1e6c7d8f2d75fae Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 25 Oct 2025 22:50:19 -0700 Subject: [PATCH 11/37] crypto: arm64/sha3 - Rename conflicting function Rename the arm64 sha3_update() function to have an "arm64_" prefix to avoid a name conflict with the upcoming SHA-3 library. Note: this code will be superseded later. This commit simply keeps the kernel building for the initial introduction of the library. 
[EB: dropped unnecessary rename of sha3_finup(), and improved commit message] Signed-off-by: David Howells Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251026055032.1413733-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm64/crypto/sha3-ce-glue.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c index b4f1001046c9..f5c830234933 100644 --- a/arch/arm64/crypto/sha3-ce-glue.c +++ b/arch/arm64/crypto/sha3-ce-glue.c @@ -31,8 +31,8 @@ MODULE_ALIAS_CRYPTO("sha3-512"); asmlinkage int sha3_ce_transform(u64 *st, const u8 *data, int blocks, int md_len); -static int sha3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) +static int arm64_sha3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) { struct sha3_state *sctx = shash_desc_ctx(desc); struct crypto_shash *tfm = desc->tfm; @@ -90,7 +90,7 @@ static int sha3_finup(struct shash_desc *desc, const u8 *src, unsigned int len, static struct shash_alg algs[] = { { .digestsize = SHA3_224_DIGEST_SIZE, .init = crypto_sha3_init, - .update = sha3_update, + .update = arm64_sha3_update, .finup = sha3_finup, .descsize = SHA3_STATE_SIZE, .base.cra_name = "sha3-224", @@ -102,7 +102,7 @@ static struct shash_alg algs[] = { { }, { .digestsize = SHA3_256_DIGEST_SIZE, .init = crypto_sha3_init, - .update = sha3_update, + .update = arm64_sha3_update, .finup = sha3_finup, .descsize = SHA3_STATE_SIZE, .base.cra_name = "sha3-256", @@ -114,7 +114,7 @@ static struct shash_alg algs[] = { { }, { .digestsize = SHA3_384_DIGEST_SIZE, .init = crypto_sha3_init, - .update = sha3_update, + .update = arm64_sha3_update, .finup = sha3_finup, .descsize = SHA3_STATE_SIZE, .base.cra_name = "sha3-384", @@ -126,7 +126,7 @@ static struct shash_alg algs[] = { { }, { .digestsize = SHA3_512_DIGEST_SIZE, .init = crypto_sha3_init, - .update = sha3_update, + .update = arm64_sha3_update, .finup = sha3_finup, .descsize = SHA3_STATE_SIZE, .base.cra_name = "sha3-512", From 0593447248044ab609b43b947d0e198c887ac281 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 25 Oct 2025 22:50:20 -0700 Subject: [PATCH 12/37] lib/crypto: sha3: Add SHA-3 support Add SHA-3 support to lib/crypto/. All six algorithms in the SHA-3 family are supported: four digests (SHA3-224, SHA3-256, SHA3-384, and SHA3-512) and two extendable-output functions (SHAKE128 and SHAKE256). The SHAKE algorithms will be required for ML-DSA. 
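
To illustrate the new library API (a minimal sketch; 'in' and 'in_len'
stand for arbitrary caller-provided data), a 64-byte SHAKE256 output can
be produced either in one shot or incrementally, with identical results:

	u8 out[64];
	struct shake_ctx ctx;

	/* One shot: */
	shake256(in, in_len, out, sizeof(out));

	/* Incremental; two 32-byte squeezes yield the same 64 bytes: */
	shake256_init(&ctx);
	shake_update(&ctx, in, in_len);
	shake_squeeze(&ctx, out, 32);
	shake_squeeze(&ctx, out + 32, 32);
	shake_zeroize_ctx(&ctx);
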
[EB: simplified the API to use fewer types and functions, fixed bug that sometimes caused incorrect SHAKE output, cleaned up the documentation, dropped an ad-hoc test that was inconsistent with the rest of lib/crypto/, and many other cleanups] Signed-off-by: David Howells Co-developed-by: Eric Biggers Tested-by: Harald Freudenberger Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251026055032.1413733-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- Documentation/crypto/index.rst | 1 + Documentation/crypto/sha3.rst | 119 +++++++++++ include/crypto/sha3.h | 326 +++++++++++++++++++++++++++++- lib/crypto/Kconfig | 7 + lib/crypto/Makefile | 5 + lib/crypto/sha3.c | 359 +++++++++++++++++++++++++++++++++ 6 files changed, 812 insertions(+), 5 deletions(-) create mode 100644 Documentation/crypto/sha3.rst create mode 100644 lib/crypto/sha3.c diff --git a/Documentation/crypto/index.rst b/Documentation/crypto/index.rst index 100b47d049c0..4ee667c446f9 100644 --- a/Documentation/crypto/index.rst +++ b/Documentation/crypto/index.rst @@ -27,3 +27,4 @@ for cryptographic use cases, as well as programming examples. descore-readme device_drivers/index krb5 + sha3 diff --git a/Documentation/crypto/sha3.rst b/Documentation/crypto/sha3.rst new file mode 100644 index 000000000000..b705e70691d7 --- /dev/null +++ b/Documentation/crypto/sha3.rst @@ -0,0 +1,119 @@ +.. SPDX-License-Identifier: GPL-2.0-or-later + +========================== +SHA-3 Algorithm Collection +========================== + +.. contents:: + +Overview +======== + +The SHA-3 family of algorithms, as specified in NIST FIPS-202 [1]_, contains six +algorithms based on the Keccak sponge function. The differences between them +are: the "rate" (how much of the state buffer gets updated with new data between +invocations of the Keccak function and analogous to the "block size"), what +domain separation suffix gets appended to the input data, and how much output +data is extracted at the end. The Keccak sponge function is designed such that +arbitrary amounts of output can be obtained for certain algorithms. + +Four digest algorithms are provided: + + - SHA3-224 + - SHA3-256 + - SHA3-384 + - SHA3-512 + +Additionally, two Extendable-Output Functions (XOFs) are provided: + + - SHAKE128 + - SHAKE256 + +The SHA-3 library API supports all six of these algorithms. The four digest +algorithms are also supported by the crypto_shash and crypto_ahash APIs. + +This document describes the SHA-3 library API. + + +Digests +======= + +The following functions compute SHA-3 digests:: + + void sha3_224(const u8 *in, size_t in_len, u8 out[SHA3_224_DIGEST_SIZE]); + void sha3_256(const u8 *in, size_t in_len, u8 out[SHA3_256_DIGEST_SIZE]); + void sha3_384(const u8 *in, size_t in_len, u8 out[SHA3_384_DIGEST_SIZE]); + void sha3_512(const u8 *in, size_t in_len, u8 out[SHA3_512_DIGEST_SIZE]); + +For users that need to pass in data incrementally, an incremental API is also +provided. The incremental API uses the following struct:: + + struct sha3_ctx { ... }; + +Initialization is done with one of:: + + void sha3_224_init(struct sha3_ctx *ctx); + void sha3_256_init(struct sha3_ctx *ctx); + void sha3_384_init(struct sha3_ctx *ctx); + void sha3_512_init(struct sha3_ctx *ctx); + +Input data is then added with any number of calls to:: + + void sha3_update(struct sha3_ctx *ctx, const u8 *in, size_t in_len); + +Finally, the digest is generated using:: + + void sha3_final(struct sha3_ctx *ctx, u8 *out); + +which also zeroizes the context. 
The length of the digest is determined by the
+initialization function that was called.
+
+
+Extendable-Output Functions
+===========================
+
+The following functions compute the SHA-3 extendable-output functions (XOFs)::
+
+	void shake128(const u8 *in, size_t in_len, u8 *out, size_t out_len);
+	void shake256(const u8 *in, size_t in_len, u8 *out, size_t out_len);
+
+For users that need to provide the input data incrementally and/or receive the
+output data incrementally, an incremental API is also provided. The incremental
+API uses the following struct::
+
+	struct shake_ctx { ... };
+
+Initialization is done with one of::
+
+	void shake128_init(struct shake_ctx *ctx);
+	void shake256_init(struct shake_ctx *ctx);
+
+Input data is then added with any number of calls to::
+
+	void shake_update(struct shake_ctx *ctx, const u8 *in, size_t in_len);
+
+Finally, the output data is extracted with any number of calls to::
+
+	void shake_squeeze(struct shake_ctx *ctx, u8 *out, size_t out_len);
+
+where out_len specifies how much data to extract in each call. Note that
+several consecutive squeezes, laid end-to-end in a buffer, produce exactly the
+same output as a single squeeze for the combined amount over that buffer.
+
+More input data cannot be added after squeezing has started.
+
+Once all the desired output has been extracted, zeroize the context::
+
+	void shake_zeroize_ctx(struct shake_ctx *ctx);
+
+
+References
+==========
+
+.. [1] https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
+
+
+API Function Reference
+======================
+
+.. kernel-doc:: include/crypto/sha3.h
diff --git a/include/crypto/sha3.h b/include/crypto/sha3.h
index 41e1b83a6d91..c0c468ee099e 100644
--- a/include/crypto/sha3.h
+++ b/include/crypto/sha3.h
@@ -1,11 +1,14 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Common values for SHA-3 algorithms
+ *
+ * See also Documentation/crypto/sha3.rst
  */
 #ifndef __CRYPTO_SHA3_H__
 #define __CRYPTO_SHA3_H__
 
 #include
+#include
 
 #define SHA3_224_DIGEST_SIZE	(224 / 8)
 #define SHA3_224_BLOCK_SIZE	(200 - 2 * SHA3_224_DIGEST_SIZE)
@@ -23,14 +26,327 @@
 #define SHA3_512_BLOCK_SIZE	(200 - 2 * SHA3_512_DIGEST_SIZE)
 #define SHA3_512_EXPORT_SIZE	SHA3_STATE_SIZE + SHA3_512_BLOCK_SIZE + 1
 
+/*
+ * SHAKE128 and SHAKE256 actually have variable output size, but this is used to
+ * calculate the block size (rate) analogously to the above.
+ */
+#define SHAKE128_DEFAULT_SIZE	(128 / 8)
+#define SHAKE128_BLOCK_SIZE	(200 - 2 * SHAKE128_DEFAULT_SIZE)
+#define SHAKE256_DEFAULT_SIZE	(256 / 8)
+#define SHAKE256_BLOCK_SIZE	(200 - 2 * SHAKE256_DEFAULT_SIZE)
+
 #define SHA3_STATE_SIZE	200
 
 struct shash_desc;
 
-struct sha3_state {
-	u64	st[SHA3_STATE_SIZE / 8];
-};
-
 int crypto_sha3_init(struct shash_desc *desc);
 
-#endif
+/*
+ * State for the Keccak-f[1600] permutation: 25 64-bit words.
+ *
+ * We usually keep the state words as little-endian, to make absorbing and
+ * squeezing easier. (It means that absorbing and squeezing can just treat the
+ * state as a byte array.) The state words are converted to native-endian only
+ * temporarily by implementations of the permutation that need native-endian
+ * words. Of course, that conversion is a no-op on little-endian machines.
+ */ +struct sha3_state { + union { + u64 st[SHA3_STATE_SIZE / 8]; /* temporarily retained for compatibility purposes */ + + __le64 words[SHA3_STATE_SIZE / 8]; + u8 bytes[SHA3_STATE_SIZE]; + + u64 native_words[SHA3_STATE_SIZE / 8]; /* see comment above */ + }; +}; + +/* Internal context, shared by the digests (SHA3-*) and the XOFs (SHAKE*) */ +struct __sha3_ctx { + struct sha3_state state; + u8 digest_size; /* Digests only: the digest size in bytes */ + u8 block_size; /* Block size in bytes */ + u8 absorb_offset; /* Index of next state byte to absorb into */ + u8 squeeze_offset; /* XOFs only: index of next state byte to extract */ +}; + +void __sha3_update(struct __sha3_ctx *ctx, const u8 *in, size_t in_len); + +/** + * struct sha3_ctx - Context for SHA3-224, SHA3-256, SHA3-384, or SHA3-512 + * @ctx: private + */ +struct sha3_ctx { + struct __sha3_ctx ctx; +}; + +/** + * sha3_zeroize_ctx() - Zeroize a SHA-3 context + * @ctx: The context to zeroize + * + * This is already called by sha3_final(). Call this explicitly when abandoning + * a context without calling sha3_final(). + */ +static inline void sha3_zeroize_ctx(struct sha3_ctx *ctx) +{ + memzero_explicit(ctx, sizeof(*ctx)); +} + +/** + * struct shake_ctx - Context for SHAKE128 or SHAKE256 + * @ctx: private + */ +struct shake_ctx { + struct __sha3_ctx ctx; +}; + +/** + * shake_zeroize_ctx() - Zeroize a SHAKE context + * @ctx: The context to zeroize + * + * Call this after the last squeeze. + */ +static inline void shake_zeroize_ctx(struct shake_ctx *ctx) +{ + memzero_explicit(ctx, sizeof(*ctx)); +} + +/** + * sha3_224_init() - Initialize a context for SHA3-224 + * @ctx: The context to initialize + * + * This begins a new SHA3-224 message digest computation. + * + * Context: Any context. + */ +static inline void sha3_224_init(struct sha3_ctx *ctx) +{ + *ctx = (struct sha3_ctx){ + .ctx.digest_size = SHA3_224_DIGEST_SIZE, + .ctx.block_size = SHA3_224_BLOCK_SIZE, + }; +} + +/** + * sha3_256_init() - Initialize a context for SHA3-256 + * @ctx: The context to initialize + * + * This begins a new SHA3-256 message digest computation. + * + * Context: Any context. + */ +static inline void sha3_256_init(struct sha3_ctx *ctx) +{ + *ctx = (struct sha3_ctx){ + .ctx.digest_size = SHA3_256_DIGEST_SIZE, + .ctx.block_size = SHA3_256_BLOCK_SIZE, + }; +} + +/** + * sha3_384_init() - Initialize a context for SHA3-384 + * @ctx: The context to initialize + * + * This begins a new SHA3-384 message digest computation. + * + * Context: Any context. + */ +static inline void sha3_384_init(struct sha3_ctx *ctx) +{ + *ctx = (struct sha3_ctx){ + .ctx.digest_size = SHA3_384_DIGEST_SIZE, + .ctx.block_size = SHA3_384_BLOCK_SIZE, + }; +} + +/** + * sha3_512_init() - Initialize a context for SHA3-512 + * @ctx: The context to initialize + * + * This begins a new SHA3-512 message digest computation. + * + * Context: Any context. + */ +static inline void sha3_512_init(struct sha3_ctx *ctx) +{ + *ctx = (struct sha3_ctx){ + .ctx.digest_size = SHA3_512_DIGEST_SIZE, + .ctx.block_size = SHA3_512_BLOCK_SIZE, + }; +} + +/** + * sha3_update() - Update a SHA-3 digest context with input data + * @ctx: The context to update; must have been initialized + * @in: The input data + * @in_len: Length of the input data in bytes + * + * This can be called any number of times to add data to a SHA3-224, SHA3-256, + * SHA3-384, or SHA3-512 digest (depending on which init function was called). + * + * Context: Any context. 
+ */ +static inline void sha3_update(struct sha3_ctx *ctx, + const u8 *in, size_t in_len) +{ + __sha3_update(&ctx->ctx, in, in_len); +} + +/** + * sha3_final() - Finish computing a SHA-3 message digest + * @ctx: The context to finalize; must have been initialized + * @out: (output) The resulting SHA3-224, SHA3-256, SHA3-384, or SHA3-512 + * message digest, matching the init function that was called. Note that + * the size differs for each one; see SHA3_*_DIGEST_SIZE. + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ +void sha3_final(struct sha3_ctx *ctx, u8 *out); + +/** + * shake128_init() - Initialize a context for SHAKE128 + * @ctx: The context to initialize + * + * This begins a new SHAKE128 extendable-output function (XOF) computation. + * + * Context: Any context. + */ +static inline void shake128_init(struct shake_ctx *ctx) +{ + *ctx = (struct shake_ctx){ + .ctx.block_size = SHAKE128_BLOCK_SIZE, + }; +} + +/** + * shake256_init() - Initialize a context for SHAKE256 + * @ctx: The context to initialize + * + * This begins a new SHAKE256 extendable-output function (XOF) computation. + * + * Context: Any context. + */ +static inline void shake256_init(struct shake_ctx *ctx) +{ + *ctx = (struct shake_ctx){ + .ctx.block_size = SHAKE256_BLOCK_SIZE, + }; +} + +/** + * shake_update() - Update a SHAKE context with input data + * @ctx: The context to update; must have been initialized + * @in: The input data + * @in_len: Length of the input data in bytes + * + * This can be called any number of times to add more input data to SHAKE128 or + * SHAKE256. This cannot be called after squeezing has begun. + * + * Context: Any context. + */ +static inline void shake_update(struct shake_ctx *ctx, + const u8 *in, size_t in_len) +{ + __sha3_update(&ctx->ctx, in, in_len); +} + +/** + * shake_squeeze() - Generate output from SHAKE128 or SHAKE256 + * @ctx: The context to squeeze; must have been initialized + * @out: Where to write the resulting output data + * @out_len: The amount of data to extract to @out in bytes + * + * This may be called multiple times. A number of consecutive squeezes laid + * end-to-end will yield the same output as one big squeeze generating the same + * total amount of output. More input cannot be provided after squeezing has + * begun. After the last squeeze, call shake_zeroize_ctx(). + * + * Context: Any context. + */ +void shake_squeeze(struct shake_ctx *ctx, u8 *out, size_t out_len); + +/** + * sha3_224() - Compute SHA3-224 digest in one shot + * @in: The input data to be digested + * @in_len: Length of the input data in bytes + * @out: The buffer into which the digest will be stored + * + * Convenience function that computes a SHA3-224 digest. Use this instead of + * the incremental API if you're able to provide all the input at once. + * + * Context: Any context. + */ +void sha3_224(const u8 *in, size_t in_len, u8 out[SHA3_224_DIGEST_SIZE]); + +/** + * sha3_256() - Compute SHA3-256 digest in one shot + * @in: The input data to be digested + * @in_len: Length of the input data in bytes + * @out: The buffer into which the digest will be stored + * + * Convenience function that computes a SHA3-256 digest. Use this instead of + * the incremental API if you're able to provide all the input at once. + * + * Context: Any context. 
+ */ +void sha3_256(const u8 *in, size_t in_len, u8 out[SHA3_256_DIGEST_SIZE]); + +/** + * sha3_384() - Compute SHA3-384 digest in one shot + * @in: The input data to be digested + * @in_len: Length of the input data in bytes + * @out: The buffer into which the digest will be stored + * + * Convenience function that computes a SHA3-384 digest. Use this instead of + * the incremental API if you're able to provide all the input at once. + * + * Context: Any context. + */ +void sha3_384(const u8 *in, size_t in_len, u8 out[SHA3_384_DIGEST_SIZE]); + +/** + * sha3_512() - Compute SHA3-512 digest in one shot + * @in: The input data to be digested + * @in_len: Length of the input data in bytes + * @out: The buffer into which the digest will be stored + * + * Convenience function that computes a SHA3-512 digest. Use this instead of + * the incremental API if you're able to provide all the input at once. + * + * Context: Any context. + */ +void sha3_512(const u8 *in, size_t in_len, u8 out[SHA3_512_DIGEST_SIZE]); + +/** + * shake128() - Compute SHAKE128 in one shot + * @in: The input data to be used + * @in_len: Length of the input data in bytes + * @out: The buffer into which the output will be stored + * @out_len: Length of the output to produce in bytes + * + * Convenience function that computes SHAKE128 in one shot. Use this instead of + * the incremental API if you're able to provide all the input at once as well + * as receive all the output at once. All output lengths are supported. + * + * Context: Any context. + */ +void shake128(const u8 *in, size_t in_len, u8 *out, size_t out_len); + +/** + * shake256() - Compute SHAKE256 in one shot + * @in: The input data to be used + * @in_len: Length of the input data in bytes + * @out: The buffer into which the output will be stored + * @out_len: Length of the output to produce in bytes + * + * Convenience function that computes SHAKE256 in one shot. Use this instead of + * the incremental API if you're able to provide all the input at once as well + * as receive all the output at once. All output lengths are supported. + * + * Context: Any context. + */ +void shake256(const u8 *in, size_t in_len, u8 *out, size_t out_len); + +#endif /* __CRYPTO_SHA3_H__ */ diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 280b888153bf..a05f5a349cd8 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -195,6 +195,13 @@ config CRYPTO_LIB_SHA512_ARCH default y if SPARC64 default y if X86_64 +config CRYPTO_LIB_SHA3 + tristate + select CRYPTO_LIB_UTILS + help + The SHA3 library functions. Select this if your module uses any of + the functions from . 
+ config CRYPTO_LIB_SM3 tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index bc26777d08e9..0cfdb511f32b 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -278,6 +278,11 @@ endif # CONFIG_CRYPTO_LIB_SHA512_ARCH ################################################################################ +obj-$(CONFIG_CRYPTO_LIB_SHA3) += libsha3.o +libsha3-y := sha3.o + +################################################################################ + obj-$(CONFIG_MPILIB) += mpi/ obj-$(CONFIG_CRYPTO_SELFTESTS_FULL) += simd.o diff --git a/lib/crypto/sha3.c b/lib/crypto/sha3.c new file mode 100644 index 000000000000..56d8353f9c5b --- /dev/null +++ b/lib/crypto/sha3.c @@ -0,0 +1,359 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * SHA-3, as specified in + * https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf + * + * SHA-3 code by Jeff Garzik + * Ard Biesheuvel + * David Howells + * + * See also Documentation/crypto/sha3.rst + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include + +/* + * On some 32-bit architectures, such as h8300, GCC ends up using over 1 KB of + * stack if the round calculation gets inlined into the loop in + * sha3_keccakf_generic(). On the other hand, on 64-bit architectures with + * plenty of [64-bit wide] general purpose registers, not inlining it severely + * hurts performance. So let's use 64-bitness as a heuristic to decide whether + * to inline or not. + */ +#ifdef CONFIG_64BIT +#define SHA3_INLINE inline +#else +#define SHA3_INLINE noinline +#endif + +#define SHA3_KECCAK_ROUNDS 24 + +static const u64 sha3_keccakf_rndc[SHA3_KECCAK_ROUNDS] = { + 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL, + 0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL, + 0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL, + 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL, + 0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL, + 0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL, + 0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL, + 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL +}; + +/* + * Perform a single round of Keccak mixing. 
+ */ +static SHA3_INLINE void sha3_keccakf_one_round_generic(u64 st[25]) +{ + u64 t[5], tt, bc[5]; + + /* Theta */ + bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20]; + bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21]; + bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22]; + bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23]; + bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24]; + + t[0] = bc[4] ^ rol64(bc[1], 1); + t[1] = bc[0] ^ rol64(bc[2], 1); + t[2] = bc[1] ^ rol64(bc[3], 1); + t[3] = bc[2] ^ rol64(bc[4], 1); + t[4] = bc[3] ^ rol64(bc[0], 1); + + st[0] ^= t[0]; + + /* Rho Pi */ + tt = st[1]; + st[ 1] = rol64(st[ 6] ^ t[1], 44); + st[ 6] = rol64(st[ 9] ^ t[4], 20); + st[ 9] = rol64(st[22] ^ t[2], 61); + st[22] = rol64(st[14] ^ t[4], 39); + st[14] = rol64(st[20] ^ t[0], 18); + st[20] = rol64(st[ 2] ^ t[2], 62); + st[ 2] = rol64(st[12] ^ t[2], 43); + st[12] = rol64(st[13] ^ t[3], 25); + st[13] = rol64(st[19] ^ t[4], 8); + st[19] = rol64(st[23] ^ t[3], 56); + st[23] = rol64(st[15] ^ t[0], 41); + st[15] = rol64(st[ 4] ^ t[4], 27); + st[ 4] = rol64(st[24] ^ t[4], 14); + st[24] = rol64(st[21] ^ t[1], 2); + st[21] = rol64(st[ 8] ^ t[3], 55); + st[ 8] = rol64(st[16] ^ t[1], 45); + st[16] = rol64(st[ 5] ^ t[0], 36); + st[ 5] = rol64(st[ 3] ^ t[3], 28); + st[ 3] = rol64(st[18] ^ t[3], 21); + st[18] = rol64(st[17] ^ t[2], 15); + st[17] = rol64(st[11] ^ t[1], 10); + st[11] = rol64(st[ 7] ^ t[2], 6); + st[ 7] = rol64(st[10] ^ t[0], 3); + st[10] = rol64( tt ^ t[1], 1); + + /* Chi */ + bc[ 0] = ~st[ 1] & st[ 2]; + bc[ 1] = ~st[ 2] & st[ 3]; + bc[ 2] = ~st[ 3] & st[ 4]; + bc[ 3] = ~st[ 4] & st[ 0]; + bc[ 4] = ~st[ 0] & st[ 1]; + st[ 0] ^= bc[ 0]; + st[ 1] ^= bc[ 1]; + st[ 2] ^= bc[ 2]; + st[ 3] ^= bc[ 3]; + st[ 4] ^= bc[ 4]; + + bc[ 0] = ~st[ 6] & st[ 7]; + bc[ 1] = ~st[ 7] & st[ 8]; + bc[ 2] = ~st[ 8] & st[ 9]; + bc[ 3] = ~st[ 9] & st[ 5]; + bc[ 4] = ~st[ 5] & st[ 6]; + st[ 5] ^= bc[ 0]; + st[ 6] ^= bc[ 1]; + st[ 7] ^= bc[ 2]; + st[ 8] ^= bc[ 3]; + st[ 9] ^= bc[ 4]; + + bc[ 0] = ~st[11] & st[12]; + bc[ 1] = ~st[12] & st[13]; + bc[ 2] = ~st[13] & st[14]; + bc[ 3] = ~st[14] & st[10]; + bc[ 4] = ~st[10] & st[11]; + st[10] ^= bc[ 0]; + st[11] ^= bc[ 1]; + st[12] ^= bc[ 2]; + st[13] ^= bc[ 3]; + st[14] ^= bc[ 4]; + + bc[ 0] = ~st[16] & st[17]; + bc[ 1] = ~st[17] & st[18]; + bc[ 2] = ~st[18] & st[19]; + bc[ 3] = ~st[19] & st[15]; + bc[ 4] = ~st[15] & st[16]; + st[15] ^= bc[ 0]; + st[16] ^= bc[ 1]; + st[17] ^= bc[ 2]; + st[18] ^= bc[ 3]; + st[19] ^= bc[ 4]; + + bc[ 0] = ~st[21] & st[22]; + bc[ 1] = ~st[22] & st[23]; + bc[ 2] = ~st[23] & st[24]; + bc[ 3] = ~st[24] & st[20]; + bc[ 4] = ~st[20] & st[21]; + st[20] ^= bc[ 0]; + st[21] ^= bc[ 1]; + st[22] ^= bc[ 2]; + st[23] ^= bc[ 3]; + st[24] ^= bc[ 4]; +} + +/* Generic implementation of the Keccak-f[1600] permutation */ +static void sha3_keccakf_generic(struct sha3_state *state) +{ + /* + * Temporarily convert the state words from little-endian to native- + * endian so that they can be operated on. Note that on little-endian + * machines this conversion is a no-op and is optimized out. 
+ */ + + for (int i = 0; i < ARRAY_SIZE(state->words); i++) + state->native_words[i] = le64_to_cpu(state->words[i]); + + for (int round = 0; round < SHA3_KECCAK_ROUNDS; round++) { + sha3_keccakf_one_round_generic(state->native_words); + /* Iota */ + state->native_words[0] ^= sha3_keccakf_rndc[round]; + } + + for (int i = 0; i < ARRAY_SIZE(state->words); i++) + state->words[i] = cpu_to_le64(state->native_words[i]); +} + +/* + * Generic implementation of absorbing the given nonzero number of full blocks + * into the sponge function Keccak[r=8*block_size, c=1600-8*block_size]. + */ +static void __maybe_unused +sha3_absorb_blocks_generic(struct sha3_state *state, const u8 *data, + size_t nblocks, size_t block_size) +{ + do { + for (size_t i = 0; i < block_size; i += 8) + state->words[i / 8] ^= get_unaligned((__le64 *)&data[i]); + sha3_keccakf_generic(state); + data += block_size; + } while (--nblocks); +} + +#ifdef CONFIG_CRYPTO_LIB_SHA3_ARCH +#include "sha3.h" /* $(SRCARCH)/sha3.h */ +#else +#define sha3_keccakf sha3_keccakf_generic +#define sha3_absorb_blocks sha3_absorb_blocks_generic +#endif + +void __sha3_update(struct __sha3_ctx *ctx, const u8 *in, size_t in_len) +{ + const size_t block_size = ctx->block_size; + size_t absorb_offset = ctx->absorb_offset; + + /* Warn if squeezing has already begun. */ + WARN_ON_ONCE(absorb_offset >= block_size); + + if (absorb_offset && absorb_offset + in_len >= block_size) { + crypto_xor(&ctx->state.bytes[absorb_offset], in, + block_size - absorb_offset); + in += block_size - absorb_offset; + in_len -= block_size - absorb_offset; + sha3_keccakf(&ctx->state); + absorb_offset = 0; + } + + if (in_len >= block_size) { + size_t nblocks = in_len / block_size; + + sha3_absorb_blocks(&ctx->state, in, nblocks, block_size); + in += nblocks * block_size; + in_len -= nblocks * block_size; + } + + if (in_len) { + crypto_xor(&ctx->state.bytes[absorb_offset], in, in_len); + absorb_offset += in_len; + } + ctx->absorb_offset = absorb_offset; +} +EXPORT_SYMBOL_GPL(__sha3_update); + +void sha3_final(struct sha3_ctx *sha3_ctx, u8 *out) +{ + struct __sha3_ctx *ctx = &sha3_ctx->ctx; + + ctx->state.bytes[ctx->absorb_offset] ^= 0x06; + ctx->state.bytes[ctx->block_size - 1] ^= 0x80; + sha3_keccakf(&ctx->state); + memcpy(out, ctx->state.bytes, ctx->digest_size); + sha3_zeroize_ctx(sha3_ctx); +} +EXPORT_SYMBOL_GPL(sha3_final); + +void shake_squeeze(struct shake_ctx *shake_ctx, u8 *out, size_t out_len) +{ + struct __sha3_ctx *ctx = &shake_ctx->ctx; + const size_t block_size = ctx->block_size; + size_t squeeze_offset = ctx->squeeze_offset; + + if (ctx->absorb_offset < block_size) { + /* First squeeze: */ + + /* Add the domain separation suffix and padding. */ + ctx->state.bytes[ctx->absorb_offset] ^= 0x1f; + ctx->state.bytes[block_size - 1] ^= 0x80; + + /* Indicate that squeezing has begun. */ + ctx->absorb_offset = block_size; + + /* + * Indicate that no output is pending yet, i.e. sha3_keccakf() + * will need to be called before the first copy. 
+ */ + squeeze_offset = block_size; + } + while (out_len) { + if (squeeze_offset == block_size) { + sha3_keccakf(&ctx->state); + squeeze_offset = 0; + } + size_t copy = min(out_len, block_size - squeeze_offset); + + memcpy(out, &ctx->state.bytes[squeeze_offset], copy); + out += copy; + out_len -= copy; + squeeze_offset += copy; + } + ctx->squeeze_offset = squeeze_offset; +} +EXPORT_SYMBOL_GPL(shake_squeeze); + +void sha3_224(const u8 *in, size_t in_len, u8 out[SHA3_224_DIGEST_SIZE]) +{ + struct sha3_ctx ctx; + + sha3_224_init(&ctx); + sha3_update(&ctx, in, in_len); + sha3_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(sha3_224); + +void sha3_256(const u8 *in, size_t in_len, u8 out[SHA3_256_DIGEST_SIZE]) +{ + struct sha3_ctx ctx; + + sha3_256_init(&ctx); + sha3_update(&ctx, in, in_len); + sha3_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(sha3_256); + +void sha3_384(const u8 *in, size_t in_len, u8 out[SHA3_384_DIGEST_SIZE]) +{ + struct sha3_ctx ctx; + + sha3_384_init(&ctx); + sha3_update(&ctx, in, in_len); + sha3_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(sha3_384); + +void sha3_512(const u8 *in, size_t in_len, u8 out[SHA3_512_DIGEST_SIZE]) +{ + struct sha3_ctx ctx; + + sha3_512_init(&ctx); + sha3_update(&ctx, in, in_len); + sha3_final(&ctx, out); +} +EXPORT_SYMBOL_GPL(sha3_512); + +void shake128(const u8 *in, size_t in_len, u8 *out, size_t out_len) +{ + struct shake_ctx ctx; + + shake128_init(&ctx); + shake_update(&ctx, in, in_len); + shake_squeeze(&ctx, out, out_len); + shake_zeroize_ctx(&ctx); +} +EXPORT_SYMBOL_GPL(shake128); + +void shake256(const u8 *in, size_t in_len, u8 *out, size_t out_len) +{ + struct shake_ctx ctx; + + shake256_init(&ctx); + shake_update(&ctx, in, in_len); + shake_squeeze(&ctx, out, out_len); + shake_zeroize_ctx(&ctx); +} +EXPORT_SYMBOL_GPL(shake256); + +#ifdef sha3_mod_init_arch +static int __init sha3_mod_init(void) +{ + sha3_mod_init_arch(); + return 0; +} +subsys_initcall(sha3_mod_init); + +static void __exit sha3_mod_exit(void) +{ +} +module_exit(sha3_mod_exit); +#endif + +MODULE_DESCRIPTION("SHA-3 library functions"); +MODULE_LICENSE("GPL"); From c0db39e253ebca9dea57e8885450ff0a0a6d6155 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 25 Oct 2025 22:50:21 -0700 Subject: [PATCH 13/37] lib/crypto: sha3: Move SHA3 Iota step mapping into round function In crypto/sha3_generic.c, the keccakf() function calls keccakf_round() to do four of Keccak-f's five step mappings. However, it does not do the Iota step mapping - presumably because that is dependent on round number, whereas Theta, Rho, Pi and Chi are not. Note that the keccakf_round() function needs to be explicitly non-inlined on certain architectures as gcc's produced output will (or used to) use over 1KiB of stack space if inlined. Now, this code was copied more or less verbatim into lib/crypto/sha3.c, so that has the same aesthetic issue. Fix this there by passing the round number into sha3_keccakf_one_round_generic() and doing the Iota step mapping there. crypto/sha3_generic.c is left untouched as that will be converted to use lib/crypto/sha3.c at some point. 
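An aside on why Iota is the odd one out: the 24 entries of sha3_keccakf_rndc[] are not magic numbers but outputs of the degree-8 LFSR defined in FIPS 202 section 3.2.5. The following standalone program (an editor's illustration, not part of this patch) regenerates the table; its first two outputs, 0x0000000000000001 and 0x0000000000008082, match the head of sha3_keccakf_rndc[] above.

    #include <stdint.h>
    #include <stdio.h>

    /* rc(t) from FIPS 202: output bit of an 8-bit LFSR stepped t times */
    static int rc_bit(int t)
    {
            uint8_t r = 1;

            for (int i = 0; i < t % 255; i++)
                    r = (r << 1) ^ ((r >> 7) * 0x71); /* x^8+x^6+x^5+x^4+1 */
            return r & 1;
    }

    int main(void)
    {
            /* RC[round] has bit (2^j - 1) set to rc(j + 7*round), j = 0..6 */
            for (int round = 0; round < 24; round++) {
                    uint64_t rc = 0;

                    for (int j = 0; j <= 6; j++)
                            rc |= (uint64_t)rc_bit(j + 7 * round) << ((1 << j) - 1);
                    printf("0x%016llxULL,\n", (unsigned long long)rc);
            }
            return 0;
    }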
Suggested-by: Eric Biggers Signed-off-by: David Howells Tested-by: Harald Freudenberger Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251026055032.1413733-5-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/sha3.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/crypto/sha3.c b/lib/crypto/sha3.c index 56d8353f9c5b..2102c2ecac96 100644 --- a/lib/crypto/sha3.c +++ b/lib/crypto/sha3.c @@ -48,7 +48,7 @@ static const u64 sha3_keccakf_rndc[SHA3_KECCAK_ROUNDS] = { /* * Perform a single round of Keccak mixing. */ -static SHA3_INLINE void sha3_keccakf_one_round_generic(u64 st[25]) +static SHA3_INLINE void sha3_keccakf_one_round_generic(u64 st[25], int round) { u64 t[5], tt, bc[5]; @@ -149,6 +149,9 @@ static SHA3_INLINE void sha3_keccakf_one_round_generic(u64 st[25]) st[22] ^= bc[ 2]; st[23] ^= bc[ 3]; st[24] ^= bc[ 4]; + + /* Iota */ + st[0] ^= sha3_keccakf_rndc[round]; } /* Generic implementation of the Keccak-f[1600] permutation */ @@ -163,11 +166,8 @@ static void sha3_keccakf_generic(struct sha3_state *state) for (int i = 0; i < ARRAY_SIZE(state->words); i++) state->native_words[i] = le64_to_cpu(state->words[i]); - for (int round = 0; round < SHA3_KECCAK_ROUNDS; round++) { - sha3_keccakf_one_round_generic(state->native_words); - /* Iota */ - state->native_words[0] ^= sha3_keccakf_rndc[round]; - } + for (int round = 0; round < SHA3_KECCAK_ROUNDS; round++) + sha3_keccakf_one_round_generic(state->native_words, round); for (int i = 0; i < ARRAY_SIZE(state->words); i++) state->words[i] = cpu_to_le64(state->native_words[i]); From 6fa873641c0bdfa849130a81aa7339ccfd42b52a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 25 Oct 2025 22:50:24 -0700 Subject: [PATCH 14/37] lib/crypto: sha3: Add FIPS cryptographic algorithm self-test Since the SHA-3 algorithms are FIPS-approved, add the boot-time self-test which is apparently required. This closely follows the corresponding SHA-1, SHA-256, and SHA-512 tests. 
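For anyone wanting to double-check the new test vector outside the kernel, here is a minimal userspace sketch (editor's illustration, not part of this patch; assumes OpenSSL 1.1.1+, which provides EVP_sha3_256()) that hashes the same 16-byte input that gen-fips-testvecs.py uses:

    #include <openssl/evp.h>
    #include <stdio.h>

    int main(void)
    {
            /* "fips test data" is 14 bytes; the literal pads it to 16 with NULs */
            static const unsigned char data[16] = "fips test data\0";
            unsigned char md[32];
            unsigned int md_len;

            if (!EVP_Digest(data, sizeof(data), md, &md_len, EVP_sha3_256(), NULL))
                    return 1;
            for (unsigned int i = 0; i < md_len; i++)
                    printf("0x%02x,%c", md[i], (i % 8 == 7) ? '\n' : ' ');
            return 0;
    }

Build with -lcrypto; the output should match the fips_test_sha3_256_value bytes added to fips.h below.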
Tested-by: Harald Freudenberger Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251026055032.1413733-8-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/fips.h | 7 +++++++ lib/crypto/sha3.c | 17 ++++++++++++++++- scripts/crypto/gen-fips-testvecs.py | 4 ++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/lib/crypto/fips.h b/lib/crypto/fips.h index 78a1bdd33a15..023410c2e0db 100644 --- a/lib/crypto/fips.h +++ b/lib/crypto/fips.h @@ -36,3 +36,10 @@ static const u8 fips_test_hmac_sha512_value[] __initconst __maybe_unused = { 0x57, 0x0b, 0x15, 0x38, 0x95, 0xd8, 0xa3, 0x81, 0xba, 0xb3, 0x15, 0x37, 0x5c, 0x6d, 0x57, 0x2b, }; + +static const u8 fips_test_sha3_256_value[] __initconst __maybe_unused = { + 0x77, 0xc4, 0x8b, 0x69, 0x70, 0x5f, 0x0a, 0xb1, + 0xb1, 0xa5, 0x82, 0x0a, 0x22, 0x2b, 0x49, 0x31, + 0xba, 0x9b, 0xb6, 0xaa, 0x32, 0xa7, 0x97, 0x00, + 0x98, 0xdb, 0xff, 0xe7, 0xc6, 0xde, 0xb5, 0x82, +}; diff --git a/lib/crypto/sha3.c b/lib/crypto/sha3.c index 2102c2ecac96..8434f9bff138 100644 --- a/lib/crypto/sha3.c +++ b/lib/crypto/sha3.c @@ -17,6 +17,7 @@ #include #include #include +#include "fips.h" /* * On some 32-bit architectures, such as h8300, GCC ends up using over 1 KB of @@ -341,10 +342,24 @@ void shake256(const u8 *in, size_t in_len, u8 *out, size_t out_len) } EXPORT_SYMBOL_GPL(shake256); -#ifdef sha3_mod_init_arch +#if defined(sha3_mod_init_arch) || defined(CONFIG_CRYPTO_FIPS) static int __init sha3_mod_init(void) { +#ifdef sha3_mod_init_arch sha3_mod_init_arch(); +#endif + if (fips_enabled) { + /* + * FIPS cryptographic algorithm self-test. As per the FIPS + * Implementation Guidance, testing any SHA-3 algorithm + * satisfies the test requirement for all of them. + */ + u8 hash[SHA3_256_DIGEST_SIZE]; + + sha3_256(fips_test_data, sizeof(fips_test_data), hash); + if (memcmp(fips_test_sha3_256_value, hash, sizeof(hash)) != 0) + panic("sha3: FIPS self-test failed\n"); + } return 0; } subsys_initcall(sha3_mod_init); diff --git a/scripts/crypto/gen-fips-testvecs.py b/scripts/crypto/gen-fips-testvecs.py index 2956f88b764a..db873f88619a 100755 --- a/scripts/crypto/gen-fips-testvecs.py +++ b/scripts/crypto/gen-fips-testvecs.py @@ -5,6 +5,7 @@ # # Copyright 2025 Google LLC +import hashlib import hmac fips_test_data = b"fips test data\0\0" @@ -30,3 +31,6 @@ for alg in 'sha1', 'sha256', 'sha512': ctx = hmac.new(fips_test_key, digestmod=alg) ctx.update(fips_test_data) print_static_u8_array_definition(f'fips_test_hmac_{alg}_value', ctx.digest()) + +print_static_u8_array_definition(f'fips_test_sha3_256_value', + hashlib.sha3_256(fips_test_data).digest()) From be755eb2b021495e1935813abaf9e1f1de1d8c78 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 25 Oct 2025 22:50:25 -0700 Subject: [PATCH 15/37] crypto: arm64/sha3 - Update sha3_ce_transform() to prepare for library - Use size_t lengths, to match the library. - Pass the block size instead of digest size, and add support for the block size that SHAKE128 uses. This allows the code to be used with SHAKE128 and SHAKE256, which don't have the concept of a digest size. SHAKE256 has the same block size as SHA3-256, but SHAKE128 has a unique block size. Thus, there are now 5 supported block sizes. Don't bother changing the "glue" code arm64_sha3_update() too much, as it gets deleted when the SHA-3 code is migrated into lib/crypto/ anyway. 
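To make the five block sizes less of a lookup table in the reader's head: the Keccak state is 1600 bits = 200 bytes, and each variant's rate (block size) is 200 minus twice its capacity parameter in bytes, where that parameter is the digest size for the SHA-3 hashes and the security strength for the SHAKEs. A standalone sketch (editor's illustration, not part of this patch):

    #include <stdio.h>

    int main(void)
    {
            static const struct {
                    const char *name;
                    int c; /* digest size (SHA-3) or security strength (SHAKE), in bytes */
            } v[] = {
                    { "SHA3-224", 28 }, { "SHA3-256", 32 }, { "SHA3-384", 48 },
                    { "SHA3-512", 64 }, { "SHAKE128", 16 }, { "SHAKE256", 32 },
            };

            for (int i = 0; i < 6; i++)
                    printf("%s: 200 - 2*%d = %d\n",
                           v[i].name, v[i].c, 200 - 2 * v[i].c);
            return 0;
    }

This prints 144, 136, 104, 72, 168, and 136: five distinct sizes, with SHAKE256 sharing 136 with SHA3-256, exactly as the comment in the assembly below assumes.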
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251026055032.1413733-9-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm64/crypto/sha3-ce-core.S | 67 ++++++++++++++++---------------- arch/arm64/crypto/sha3-ce-glue.c | 11 +++--- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/arch/arm64/crypto/sha3-ce-core.S b/arch/arm64/crypto/sha3-ce-core.S index 9c77313f5a60..b62bd714839b 100644 --- a/arch/arm64/crypto/sha3-ce-core.S +++ b/arch/arm64/crypto/sha3-ce-core.S @@ -37,7 +37,11 @@ .endm /* - * int sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size) + * size_t sha3_ce_transform(struct sha3_state *state, const u8 *data, + * size_t nblocks, size_t block_size) + * + * block_size is assumed to be one of 72 (SHA3-512), 104 (SHA3-384), 136 + * (SHA3-256 and SHAKE256), 144 (SHA3-224), or 168 (SHAKE128). */ .text SYM_FUNC_START(sha3_ce_transform) @@ -51,58 +55,55 @@ SYM_FUNC_START(sha3_ce_transform) ld1 {v20.1d-v23.1d}, [x8], #32 ld1 {v24.1d}, [x8] -0: sub w2, w2, #1 +0: sub x2, x2, #1 mov w8, #24 adr_l x9, .Lsha3_rcon /* load input */ ld1 {v25.8b-v28.8b}, [x1], #32 - ld1 {v29.8b-v31.8b}, [x1], #24 + ld1 {v29.8b}, [x1], #8 eor v0.8b, v0.8b, v25.8b eor v1.8b, v1.8b, v26.8b eor v2.8b, v2.8b, v27.8b eor v3.8b, v3.8b, v28.8b eor v4.8b, v4.8b, v29.8b - eor v5.8b, v5.8b, v30.8b - eor v6.8b, v6.8b, v31.8b - - tbnz x3, #6, 2f // SHA3-512 ld1 {v25.8b-v28.8b}, [x1], #32 - ld1 {v29.8b-v30.8b}, [x1], #16 - eor v7.8b, v7.8b, v25.8b - eor v8.8b, v8.8b, v26.8b - eor v9.8b, v9.8b, v27.8b - eor v10.8b, v10.8b, v28.8b - eor v11.8b, v11.8b, v29.8b - eor v12.8b, v12.8b, v30.8b + eor v5.8b, v5.8b, v25.8b + eor v6.8b, v6.8b, v26.8b + eor v7.8b, v7.8b, v27.8b + eor v8.8b, v8.8b, v28.8b + cmp x3, #72 + b.eq 3f /* SHA3-512 (block_size=72)? */ - tbnz x3, #4, 1f // SHA3-384 or SHA3-224 + ld1 {v25.8b-v28.8b}, [x1], #32 + eor v9.8b, v9.8b, v25.8b + eor v10.8b, v10.8b, v26.8b + eor v11.8b, v11.8b, v27.8b + eor v12.8b, v12.8b, v28.8b + cmp x3, #104 + b.eq 3f /* SHA3-384 (block_size=104)? */ - // SHA3-256 ld1 {v25.8b-v28.8b}, [x1], #32 eor v13.8b, v13.8b, v25.8b eor v14.8b, v14.8b, v26.8b eor v15.8b, v15.8b, v27.8b eor v16.8b, v16.8b, v28.8b - b 3f + cmp x3, #144 + b.lt 3f /* SHA3-256 or SHAKE256 (block_size=136)? */ + b.eq 2f /* SHA3-224 (block_size=144)? */ -1: tbz x3, #2, 3f // bit 2 cleared? 
SHA-384 - - // SHA3-224 + /* SHAKE128 (block_size=168) */ ld1 {v25.8b-v28.8b}, [x1], #32 - ld1 {v29.8b}, [x1], #8 - eor v13.8b, v13.8b, v25.8b - eor v14.8b, v14.8b, v26.8b - eor v15.8b, v15.8b, v27.8b - eor v16.8b, v16.8b, v28.8b - eor v17.8b, v17.8b, v29.8b + eor v17.8b, v17.8b, v25.8b + eor v18.8b, v18.8b, v26.8b + eor v19.8b, v19.8b, v27.8b + eor v20.8b, v20.8b, v28.8b b 3f - - // SHA3-512 -2: ld1 {v25.8b-v26.8b}, [x1], #16 - eor v7.8b, v7.8b, v25.8b - eor v8.8b, v8.8b, v26.8b +2: + /* SHA3-224 (block_size=144) */ + ld1 {v25.8b}, [x1], #8 + eor v17.8b, v17.8b, v25.8b 3: sub w8, w8, #1 @@ -185,7 +186,7 @@ SYM_FUNC_START(sha3_ce_transform) cbnz w8, 3b cond_yield 4f, x8, x9 - cbnz w2, 0b + cbnz x2, 0b /* save state */ 4: st1 { v0.1d- v3.1d}, [x0], #32 @@ -195,7 +196,7 @@ SYM_FUNC_START(sha3_ce_transform) st1 {v16.1d-v19.1d}, [x0], #32 st1 {v20.1d-v23.1d}, [x0], #32 st1 {v24.1d}, [x0] - mov w0, w2 + mov x0, x2 ret SYM_FUNC_END(sha3_ce_transform) diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c index f5c830234933..250f4fb76b47 100644 --- a/arch/arm64/crypto/sha3-ce-glue.c +++ b/arch/arm64/crypto/sha3-ce-glue.c @@ -28,18 +28,17 @@ MODULE_ALIAS_CRYPTO("sha3-256"); MODULE_ALIAS_CRYPTO("sha3-384"); MODULE_ALIAS_CRYPTO("sha3-512"); -asmlinkage int sha3_ce_transform(u64 *st, const u8 *data, int blocks, - int md_len); +asmlinkage size_t sha3_ce_transform(struct sha3_state *state, const u8 *data, + size_t nblocks, size_t block_size); static int arm64_sha3_update(struct shash_desc *desc, const u8 *data, unsigned int len) { struct sha3_state *sctx = shash_desc_ctx(desc); struct crypto_shash *tfm = desc->tfm; - unsigned int bs, ds; + unsigned int bs; int blocks; - ds = crypto_shash_digestsize(tfm); bs = crypto_shash_blocksize(tfm); blocks = len / bs; len -= blocks * bs; @@ -47,7 +46,7 @@ static int arm64_sha3_update(struct shash_desc *desc, const u8 *data, int rem; kernel_neon_begin(); - rem = sha3_ce_transform(sctx->st, data, blocks, ds); + rem = sha3_ce_transform(sctx, data, blocks, bs); kernel_neon_end(); data += (blocks - rem) * bs; blocks = rem; @@ -74,7 +73,7 @@ static int sha3_finup(struct shash_desc *desc, const u8 *src, unsigned int len, block[bs - 1] |= 0x80; kernel_neon_begin(); - sha3_ce_transform(sctx->st, block, 1, ds); + sha3_ce_transform(sctx, block, 1, bs); kernel_neon_end(); memzero_explicit(block , sizeof(block)); From 1e29a750572a25200fcea995d91e5f6448f340c0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 25 Oct 2025 22:50:26 -0700 Subject: [PATCH 16/37] lib/crypto: arm64/sha3: Migrate optimized code into library Instead of exposing the arm64-optimized SHA-3 code via arm64-specific crypto_shash algorithms, instead just implement the sha3_absorb_blocks() and sha3_keccakf() library functions. This is much simpler, it makes the SHA-3 library functions be arm64-optimized, and it fixes the longstanding issue where the arm64-optimized SHA-3 code was disabled by default. SHA-3 still remains available through crypto_shash, but individual architectures no longer need to handle it. Note: to see the diff from arch/arm64/crypto/sha3-ce-glue.c to lib/crypto/arm64/sha3.h, view this commit with 'git show -M10'. 
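For reference, a caller's-eye sketch of what this migration buys (editor's illustration, using the library API introduced earlier in this series). After this patch, both call styles transparently use the ARMv8.2 Crypto Extensions when available, with no arch-specific code on the caller's side:

    #include <crypto/sha3.h>

    static void example(const u8 *msg, size_t msg_len)
    {
            u8 digest[SHA3_256_DIGEST_SIZE];
            struct sha3_ctx ctx;

            /* One-shot */
            sha3_256(msg, msg_len, digest);

            /* Incremental */
            sha3_256_init(&ctx);
            sha3_update(&ctx, msg, msg_len);
            sha3_final(&ctx, digest); /* also zeroizes the context */
    }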
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251026055032.1413733-10-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm64/configs/defconfig | 2 +- arch/arm64/crypto/Kconfig | 11 -- arch/arm64/crypto/Makefile | 3 - arch/arm64/crypto/sha3-ce-glue.c | 150 ------------------ lib/crypto/Kconfig | 5 + lib/crypto/Makefile | 5 + .../crypto/arm64}/sha3-ce-core.S | 0 lib/crypto/arm64/sha3.h | 62 ++++++++ 8 files changed, 73 insertions(+), 165 deletions(-) delete mode 100644 arch/arm64/crypto/sha3-ce-glue.c rename {arch/arm64/crypto => lib/crypto/arm64}/sha3-ce-core.S (100%) create mode 100644 lib/crypto/arm64/sha3.h diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index e3a2d37bd104..20dd3a39faea 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -1783,10 +1783,10 @@ CONFIG_CRYPTO_CHACHA20=m CONFIG_CRYPTO_BENCHMARK=m CONFIG_CRYPTO_ECHAINIV=y CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_ANSI_CPRNG=y CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_GHASH_ARM64_CE=y -CONFIG_CRYPTO_SHA3_ARM64=m CONFIG_CRYPTO_SM3_ARM64_CE=m CONFIG_CRYPTO_AES_ARM64_CE_BLK=y CONFIG_CRYPTO_AES_ARM64_BS=m diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 91f3093eee6a..376d6b50743f 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -25,17 +25,6 @@ config CRYPTO_NHPOLY1305_NEON Architecture: arm64 using: - NEON (Advanced SIMD) extensions -config CRYPTO_SHA3_ARM64 - tristate "Hash functions: SHA-3 (ARMv8.2 Crypto Extensions)" - depends on KERNEL_MODE_NEON - select CRYPTO_HASH - select CRYPTO_SHA3 - help - SHA-3 secure hash algorithms (FIPS 202) - - Architecture: arm64 using: - - ARMv8.2 Crypto Extensions - config CRYPTO_SM3_NEON tristate "Hash functions: SM3 (NEON)" depends on KERNEL_MODE_NEON diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index a8b2cdbe202c..fd3d590fa113 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -5,9 +5,6 @@ # Copyright (C) 2014 Linaro Ltd # -obj-$(CONFIG_CRYPTO_SHA3_ARM64) += sha3-ce.o -sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o - obj-$(CONFIG_CRYPTO_SM3_NEON) += sm3-neon.o sm3-neon-y := sm3-neon-glue.o sm3-neon-core.o diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c deleted file mode 100644 index 250f4fb76b47..000000000000 --- a/arch/arm64/crypto/sha3-ce-glue.c +++ /dev/null @@ -1,150 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * sha3-ce-glue.c - core SHA-3 transform using v8.2 Crypto Extensions - * - * Copyright (C) 2018 Linaro Ltd - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -MODULE_DESCRIPTION("SHA3 secure hash using ARMv8 Crypto Extensions"); -MODULE_AUTHOR("Ard Biesheuvel "); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("sha3-224"); -MODULE_ALIAS_CRYPTO("sha3-256"); -MODULE_ALIAS_CRYPTO("sha3-384"); -MODULE_ALIAS_CRYPTO("sha3-512"); - -asmlinkage size_t sha3_ce_transform(struct sha3_state *state, const u8 *data, - size_t nblocks, size_t block_size); - -static int arm64_sha3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct sha3_state *sctx = shash_desc_ctx(desc); - struct crypto_shash *tfm = desc->tfm; - unsigned int bs; - int blocks; - - bs = crypto_shash_blocksize(tfm); - blocks = len / bs; - len -= blocks * bs; - do { - int rem; - - kernel_neon_begin(); - rem = sha3_ce_transform(sctx, data, blocks, bs); - kernel_neon_end(); - data += (blocks - rem) * bs; - blocks = rem; - } while (blocks); - return len; -} - -static int sha3_finup(struct shash_desc *desc, const u8 *src, unsigned int len, - u8 *out) -{ - struct sha3_state *sctx = shash_desc_ctx(desc); - struct crypto_shash *tfm = desc->tfm; - __le64 *digest = (__le64 *)out; - u8 block[SHA3_224_BLOCK_SIZE]; - unsigned int bs, ds; - int i; - - ds = crypto_shash_digestsize(tfm); - bs = crypto_shash_blocksize(tfm); - memcpy(block, src, len); - - block[len++] = 0x06; - memset(block + len, 0, bs - len); - block[bs - 1] |= 0x80; - - kernel_neon_begin(); - sha3_ce_transform(sctx, block, 1, bs); - kernel_neon_end(); - memzero_explicit(block , sizeof(block)); - - for (i = 0; i < ds / 8; i++) - put_unaligned_le64(sctx->st[i], digest++); - - if (ds & 4) - put_unaligned_le32(sctx->st[i], (__le32 *)digest); - - return 0; -} - -static struct shash_alg algs[] = { { - .digestsize = SHA3_224_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = arm64_sha3_update, - .finup = sha3_finup, - .descsize = SHA3_STATE_SIZE, - .base.cra_name = "sha3-224", - .base.cra_driver_name = "sha3-224-ce", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_224_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - .base.cra_priority = 200, -}, { - .digestsize = SHA3_256_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = arm64_sha3_update, - .finup = sha3_finup, - .descsize = SHA3_STATE_SIZE, - .base.cra_name = "sha3-256", - .base.cra_driver_name = "sha3-256-ce", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_256_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - .base.cra_priority = 200, -}, { - .digestsize = SHA3_384_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = arm64_sha3_update, - .finup = sha3_finup, - .descsize = SHA3_STATE_SIZE, - .base.cra_name = "sha3-384", - .base.cra_driver_name = "sha3-384-ce", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_384_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - .base.cra_priority = 200, -}, { - .digestsize = SHA3_512_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = arm64_sha3_update, - .finup = sha3_finup, - .descsize = SHA3_STATE_SIZE, - .base.cra_name = "sha3-512", - .base.cra_driver_name = "sha3-512-ce", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_512_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, - .base.cra_priority = 200, -} }; - -static int __init sha3_neon_mod_init(void) -{ - return crypto_register_shashes(algs, ARRAY_SIZE(algs)); -} - -static void __exit sha3_neon_mod_fini(void) -{ - crypto_unregister_shashes(algs, 
ARRAY_SIZE(algs)); -} - -module_cpu_feature_match(SHA3, sha3_neon_mod_init); -module_exit(sha3_neon_mod_fini); diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index a05f5a349cd8..587490ca6565 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -202,6 +202,11 @@ config CRYPTO_LIB_SHA3 The SHA3 library functions. Select this if your module uses any of the functions from . +config CRYPTO_LIB_SHA3_ARCH + bool + depends on CRYPTO_LIB_SHA3 && !UML + default y if ARM64 && KERNEL_MODE_NEON + config CRYPTO_LIB_SM3 tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 0cfdb511f32b..5515e73bfd5e 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -281,6 +281,11 @@ endif # CONFIG_CRYPTO_LIB_SHA512_ARCH obj-$(CONFIG_CRYPTO_LIB_SHA3) += libsha3.o libsha3-y := sha3.o +ifeq ($(CONFIG_CRYPTO_LIB_SHA3_ARCH),y) +CFLAGS_sha3.o += -I$(src)/$(SRCARCH) +libsha3-$(CONFIG_ARM64) += arm64/sha3-ce-core.o +endif # CONFIG_CRYPTO_LIB_SHA3_ARCH + ################################################################################ obj-$(CONFIG_MPILIB) += mpi/ diff --git a/arch/arm64/crypto/sha3-ce-core.S b/lib/crypto/arm64/sha3-ce-core.S similarity index 100% rename from arch/arm64/crypto/sha3-ce-core.S rename to lib/crypto/arm64/sha3-ce-core.S diff --git a/lib/crypto/arm64/sha3.h b/lib/crypto/arm64/sha3.h new file mode 100644 index 000000000000..6dd5183056da --- /dev/null +++ b/lib/crypto/arm64/sha3.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2018 Linaro Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3); + +asmlinkage size_t sha3_ce_transform(struct sha3_state *state, const u8 *data, + size_t nblocks, size_t block_size); + +static void sha3_absorb_blocks(struct sha3_state *state, const u8 *data, + size_t nblocks, size_t block_size) +{ + if (static_branch_likely(&have_sha3) && likely(may_use_simd())) { + do { + size_t rem; + + kernel_neon_begin(); + rem = sha3_ce_transform(state, data, nblocks, + block_size); + kernel_neon_end(); + data += (nblocks - rem) * block_size; + nblocks = rem; + } while (nblocks); + } else { + sha3_absorb_blocks_generic(state, data, nblocks, block_size); + } +} + +static void sha3_keccakf(struct sha3_state *state) +{ + if (static_branch_likely(&have_sha3) && likely(may_use_simd())) { + /* + * Passing zeroes into sha3_ce_transform() gives the plain + * Keccak-f permutation, which is what we want here. Any + * supported block size may be used. Use SHA3_512_BLOCK_SIZE + * since it's the shortest. + */ + static const u8 zeroes[SHA3_512_BLOCK_SIZE]; + + kernel_neon_begin(); + sha3_ce_transform(state, zeroes, 1, sizeof(zeroes)); + kernel_neon_end(); + } else { + sha3_keccakf_generic(state); + } +} + +#define sha3_mod_init_arch sha3_mod_init_arch +static void sha3_mod_init_arch(void) +{ + if (cpu_have_named_feature(SHA3)) + static_branch_enable(&have_sha3); +} From 04171105d33ae77945791a637d23b76789f3bef9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 25 Oct 2025 22:50:27 -0700 Subject: [PATCH 17/37] lib/crypto: s390/sha3: Add optimized Keccak functions Implement sha3_absorb_blocks() and sha3_keccakf() using the hardware- accelerated SHA-3 support in Message-Security-Assist Extension 6. This accelerates the SHA3-224, SHA3-256, SHA3-384, SHA3-512, and SHAKE256 library functions. 
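Since SHAKE256 rides on the same KIMD function as SHA3-256, the accelerated path also covers extendable-output use. A caller-side sketch of that flow (editor's illustration, using the shake_* API from lib/crypto/sha3.c added earlier in this series):

    #include <crypto/sha3.h>

    static void example_xof(const u8 *seed, size_t seed_len,
                            u8 *out, size_t out_len)
    {
            struct shake_ctx ctx;

            shake256_init(&ctx);
            shake_update(&ctx, seed, seed_len);
            /* Output length is arbitrary, and squeezing may be done in pieces. */
            shake_squeeze(&ctx, out, out_len / 2);
            shake_squeeze(&ctx, out + out_len / 2, out_len - out_len / 2);
            shake_zeroize_ctx(&ctx);
    }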
Note that arch/s390/crypto/ already has SHA-3 code that uses this extension, but it is exposed only via crypto_shash. This commit brings the same acceleration to the SHA-3 library. The arch/s390/crypto/ version will become redundant and be removed in later changes. Tested-by: Harald Freudenberger Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251026055032.1413733-11-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/Kconfig | 1 + lib/crypto/s390/sha3.h | 88 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100644 lib/crypto/s390/sha3.h diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 587490ca6565..7445054fc0ad 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -206,6 +206,7 @@ config CRYPTO_LIB_SHA3_ARCH bool depends on CRYPTO_LIB_SHA3 && !UML default y if ARM64 && KERNEL_MODE_NEON + default y if S390 config CRYPTO_LIB_SM3 tristate diff --git a/lib/crypto/s390/sha3.h b/lib/crypto/s390/sha3.h new file mode 100644 index 000000000000..668e53da93d2 --- /dev/null +++ b/lib/crypto/s390/sha3.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * SHA-3 optimized using the CP Assist for Cryptographic Functions (CPACF) + * + * Copyright 2025 Google LLC + */ +#include +#include + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3); + +static void sha3_absorb_blocks(struct sha3_state *state, const u8 *data, + size_t nblocks, size_t block_size) +{ + if (static_branch_likely(&have_sha3)) { + /* + * Note that KIMD assumes little-endian order of the state + * words. sha3_state already uses that order, though, so + * there's no need for a byteswap. + */ + switch (block_size) { + case SHA3_224_BLOCK_SIZE: + cpacf_kimd(CPACF_KIMD_SHA3_224, state, + data, nblocks * block_size); + return; + case SHA3_256_BLOCK_SIZE: + /* + * This case handles both SHA3-256 and SHAKE256, since + * they have the same block size. + */ + cpacf_kimd(CPACF_KIMD_SHA3_256, state, + data, nblocks * block_size); + return; + case SHA3_384_BLOCK_SIZE: + cpacf_kimd(CPACF_KIMD_SHA3_384, state, + data, nblocks * block_size); + return; + case SHA3_512_BLOCK_SIZE: + cpacf_kimd(CPACF_KIMD_SHA3_512, state, + data, nblocks * block_size); + return; + } + } + sha3_absorb_blocks_generic(state, data, nblocks, block_size); +} + +static void sha3_keccakf(struct sha3_state *state) +{ + if (static_branch_likely(&have_sha3)) { + /* + * Passing zeroes into any of CPACF_KIMD_SHA3_* gives the plain + * Keccak-f permutation, which is what we want here. Use + * SHA3-512 since it has the smallest block size. + */ + static const u8 zeroes[SHA3_512_BLOCK_SIZE]; + + cpacf_kimd(CPACF_KIMD_SHA3_512, state, zeroes, sizeof(zeroes)); + } else { + sha3_keccakf_generic(state); + } +} + +#define sha3_mod_init_arch sha3_mod_init_arch +static void sha3_mod_init_arch(void) +{ + int num_present = 0; + int num_possible = 0; + + if (!cpu_have_feature(S390_CPU_FEATURE_MSA)) + return; + /* + * Since all the SHA-3 functions are in Message-Security-Assist + * Extension 6, just treat them as all or nothing. This way we need + * only one static_key. 
+ */ +#define QUERY(opcode, func) \ + ({ num_present += !!cpacf_query_func(opcode, func); num_possible++; }) + QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_224); + QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_256); + QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_384); + QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_512); +#undef QUERY + + if (num_present == num_possible) + static_branch_enable(&have_sha3); + else if (num_present != 0) + pr_warn("Unsupported combination of SHA-3 facilities\n"); +} From 0354d3c1f1b8628e60eceb304b6d2ef75eea6f41 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 25 Oct 2025 22:50:28 -0700 Subject: [PATCH 18/37] lib/crypto: sha3: Support arch overrides of one-shot digest functions Add support for architecture-specific overrides of sha3_224(), sha3_256(), sha3_384(), and sha3_512(). This will be used to implement these functions more efficiently on s390 than is possible via the usual init + update + final flow. Reviewed-by: Ard Biesheuvel Tested-by: Harald Freudenberger Link: https://lore.kernel.org/r/20251026055032.1413733-12-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/sha3.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/lib/crypto/sha3.c b/lib/crypto/sha3.c index 8434f9bff138..32b7074de792 100644 --- a/lib/crypto/sha3.c +++ b/lib/crypto/sha3.c @@ -280,10 +280,41 @@ void shake_squeeze(struct shake_ctx *shake_ctx, u8 *out, size_t out_len) } EXPORT_SYMBOL_GPL(shake_squeeze); +#ifndef sha3_224_arch +static inline bool sha3_224_arch(const u8 *in, size_t in_len, + u8 out[SHA3_224_DIGEST_SIZE]) +{ + return false; +} +#endif +#ifndef sha3_256_arch +static inline bool sha3_256_arch(const u8 *in, size_t in_len, + u8 out[SHA3_256_DIGEST_SIZE]) +{ + return false; +} +#endif +#ifndef sha3_384_arch +static inline bool sha3_384_arch(const u8 *in, size_t in_len, + u8 out[SHA3_384_DIGEST_SIZE]) +{ + return false; +} +#endif +#ifndef sha3_512_arch +static inline bool sha3_512_arch(const u8 *in, size_t in_len, + u8 out[SHA3_512_DIGEST_SIZE]) +{ + return false; +} +#endif + void sha3_224(const u8 *in, size_t in_len, u8 out[SHA3_224_DIGEST_SIZE]) { struct sha3_ctx ctx; + if (sha3_224_arch(in, in_len, out)) + return; sha3_224_init(&ctx); sha3_update(&ctx, in, in_len); sha3_final(&ctx, out); @@ -294,6 +325,8 @@ void sha3_256(const u8 *in, size_t in_len, u8 out[SHA3_256_DIGEST_SIZE]) { struct sha3_ctx ctx; + if (sha3_256_arch(in, in_len, out)) + return; sha3_256_init(&ctx); sha3_update(&ctx, in, in_len); sha3_final(&ctx, out); @@ -304,6 +337,8 @@ void sha3_384(const u8 *in, size_t in_len, u8 out[SHA3_384_DIGEST_SIZE]) { struct sha3_ctx ctx; + if (sha3_384_arch(in, in_len, out)) + return; sha3_384_init(&ctx); sha3_update(&ctx, in, in_len); sha3_final(&ctx, out); @@ -314,6 +349,8 @@ void sha3_512(const u8 *in, size_t in_len, u8 out[SHA3_512_DIGEST_SIZE]) { struct sha3_ctx ctx; + if (sha3_512_arch(in, in_len, out)) + return; sha3_512_init(&ctx); sha3_update(&ctx, in, in_len); sha3_final(&ctx, out); From 862445d3b9e74f58360a7a89787da4dca783e6dd Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 25 Oct 2025 22:50:29 -0700 Subject: [PATCH 19/37] lib/crypto: s390/sha3: Add optimized one-shot SHA-3 digest functions Some z/Architecture processors can compute a SHA-3 digest in a single instruction. arch/s390/crypto/ already uses this capability to optimize the SHA-3 crypto_shash algorithms. Use this capability to implement the sha3_224(), sha3_256(), sha3_384(), and sha3_512() library functions too. 
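These one-shot functions plug into the sha3_*_arch() hooks added by the previous patch. Restating that contract as a sketch for a hypothetical architecture (every my_arch_* name below is illustrative, not a real API): the arch header defines both the function and a same-named macro, which compiles out the generic return-false stub; returning false at runtime still falls back to the generic init/update/final flow.

    /* lib/crypto/<arch>/sha3.h -- hypothetical port, for illustration only */
    #define sha3_256_arch sha3_256_arch
    static bool sha3_256_arch(const u8 *in, size_t in_len,
                              u8 out[SHA3_256_DIGEST_SIZE])
    {
            if (!my_arch_has_sha3_insn())   /* illustrative capability check */
                    return false;           /* generic flow takes over */
            my_arch_sha3_256_digest(in, in_len, out); /* illustrative */
            return true;
    }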
SHA3-256 benchmark results provided by Harald Freudenberger (https://lore.kernel.org/r/4188d18bfcc8a64941c5ebd8de10ede2@linux.ibm.com/) on a z/Architecture machine with "facility 86" (MSA level 12): Length (bytes) Before (MB/s) After (MB/s) ============== ============= ============ 16 212 225 64 820 915 256 1850 3350 1024 5400 8300 4096 11200 11300 Note: the original data from Harald was given in the form of a graph for each length, showing the distribution of throughputs from 500 runs. I guesstimated the peak of each one. Harald also reported that the generic SHA-3 code was at most 259 MB/s (https://lore.kernel.org/r/c39f6b6c110def0095e5da5becc12085@linux.ibm.com/). So as expected, the earlier commit that optimized sha3_absorb_blocks() and sha3_keccakf() is the more important one; it optimized the Keccak permutation which is the most performance-critical part of SHA-3. Still, this additional commit does notably improve performance further on some lengths. Reviewed-by: Ard Biesheuvel Tested-by: Harald Freudenberger Link: https://lore.kernel.org/r/20251026055032.1413733-13-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/s390/sha3.h | 67 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/lib/crypto/s390/sha3.h b/lib/crypto/s390/sha3.h index 668e53da93d2..85471404775a 100644 --- a/lib/crypto/s390/sha3.h +++ b/lib/crypto/s390/sha3.h @@ -8,6 +8,7 @@ #include static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_sha3_init_optim); static void sha3_absorb_blocks(struct sha3_state *state, const u8 *data, size_t nblocks, size_t block_size) @@ -60,6 +61,61 @@ static void sha3_keccakf(struct sha3_state *state) } } +static inline bool s390_sha3(int func, const u8 *in, size_t in_len, + u8 *out, size_t out_len) +{ + struct sha3_state state; + + if (!static_branch_likely(&have_sha3)) + return false; + + if (static_branch_likely(&have_sha3_init_optim)) + func |= CPACF_KLMD_NIP | CPACF_KLMD_DUFOP; + else + memset(&state, 0, sizeof(state)); + + cpacf_klmd(func, &state, in, in_len); + + if (static_branch_likely(&have_sha3_init_optim)) + kmsan_unpoison_memory(&state, out_len); + + memcpy(out, &state, out_len); + memzero_explicit(&state, sizeof(state)); + return true; +} + +#define sha3_224_arch sha3_224_arch +static bool sha3_224_arch(const u8 *in, size_t in_len, + u8 out[SHA3_224_DIGEST_SIZE]) +{ + return s390_sha3(CPACF_KLMD_SHA3_224, in, in_len, + out, SHA3_224_DIGEST_SIZE); +} + +#define sha3_256_arch sha3_256_arch +static bool sha3_256_arch(const u8 *in, size_t in_len, + u8 out[SHA3_256_DIGEST_SIZE]) +{ + return s390_sha3(CPACF_KLMD_SHA3_256, in, in_len, + out, SHA3_256_DIGEST_SIZE); +} + +#define sha3_384_arch sha3_384_arch +static bool sha3_384_arch(const u8 *in, size_t in_len, + u8 out[SHA3_384_DIGEST_SIZE]) +{ + return s390_sha3(CPACF_KLMD_SHA3_384, in, in_len, + out, SHA3_384_DIGEST_SIZE); +} + +#define sha3_512_arch sha3_512_arch +static bool sha3_512_arch(const u8 *in, size_t in_len, + u8 out[SHA3_512_DIGEST_SIZE]) +{ + return s390_sha3(CPACF_KLMD_SHA3_512, in, in_len, + out, SHA3_512_DIGEST_SIZE); +} + #define sha3_mod_init_arch sha3_mod_init_arch static void sha3_mod_init_arch(void) { @@ -79,10 +135,17 @@ static void sha3_mod_init_arch(void) QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_256); QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_384); QUERY(CPACF_KIMD, CPACF_KIMD_SHA3_512); + QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_224); + QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_256); + QUERY(CPACF_KLMD, 
CPACF_KLMD_SHA3_384); + QUERY(CPACF_KLMD, CPACF_KLMD_SHA3_512); #undef QUERY - if (num_present == num_possible) + if (num_present == num_possible) { static_branch_enable(&have_sha3); - else if (num_present != 0) + if (test_facility(86)) + static_branch_enable(&have_sha3_init_optim); + } else if (num_present != 0) { pr_warn("Unsupported combination of SHA-3 facilities\n"); + } } From d280d4d56a13b271ce6af9524feb1af1aa307606 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 25 Oct 2025 22:50:30 -0700 Subject: [PATCH 20/37] crypto: jitterentropy - Use default sha3 implementation Make jitterentropy use "sha3-256" instead of "sha3-256-generic", as the ability to explicitly request the generic code is going away. It's not worth providing a special generic API just for jitterentropy. There are many other solutions available to it, such as doing more iterations or using a more effective jitter collection method. Moreover, the status quo is that SHA-3 is quite slow anyway. Currently only arm64 and s390 have architecture-optimized SHA-3 code. I'm not familiar with the performance of the s390 one, but the arm64 one isn't actually that much faster than the generic code anyway. Note that jitterentropy should just use the library API instead of crypto_shash. But that belongs in a separate change later. Reviewed-by: Ard Biesheuvel Tested-by: Harald Freudenberger Link: https://lore.kernel.org/r/20251026055032.1413733-14-ebiggers@kernel.org Signed-off-by: Eric Biggers --- crypto/jitterentropy-kcapi.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/crypto/jitterentropy-kcapi.c b/crypto/jitterentropy-kcapi.c index a53de7affe8d..7c880cf34c52 100644 --- a/crypto/jitterentropy-kcapi.c +++ b/crypto/jitterentropy-kcapi.c @@ -48,7 +48,7 @@ #include "jitterentropy.h" -#define JENT_CONDITIONING_HASH "sha3-256-generic" +#define JENT_CONDITIONING_HASH "sha3-256" /*************************************************************************** * Helper function @@ -230,15 +230,7 @@ static int jent_kcapi_init(struct crypto_tfm *tfm) spin_lock_init(&rng->jent_lock); - /* - * Use SHA3-256 as conditioner. We allocate only the generic - * implementation as we are not interested in high-performance. The - * execution time of the SHA3 operation is measured and adds to the - * Jitter RNG's unpredictable behavior. If we have a slower hash - * implementation, the execution timing variations are larger. When - * using a fast implementation, we would need to call it more often - * as its variations are lower. - */ + /* Use SHA3-256 as conditioner */ hash = crypto_alloc_shash(JENT_CONDITIONING_HASH, 0, 0); if (IS_ERR(hash)) { pr_err("Cannot allocate conditioning digest\n"); From f1799d17285ca99243328cd92133a9f84ee3a593 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 25 Oct 2025 22:50:31 -0700 Subject: [PATCH 21/37] crypto: sha3 - Reimplement using library API Replace sha3_generic.c with a new file sha3.c which implements the SHA-3 crypto_shash algorithms on top of the SHA-3 library API. Change the driver name suffix from "-generic" to "-lib" to reflect that these algorithms now just use the (possibly arch-optimized) library. This closely mirrors crypto/{md5,sha1,sha256,sha512,blake2b}.c. Implement export_core and import_core, since crypto/hmac.c expects these to be present. (Note that there is no security purpose in wrapping SHA-3 with HMAC. HMAC was designed for older algorithms that don't resist length extension attacks. 
But since someone could be using "hmac(sha3-*)" via crypto_shash anyway, keep supporting it for now.) Reviewed-by: Ard Biesheuvel Tested-by: Harald Freudenberger Link: https://lore.kernel.org/r/20251026055032.1413733-15-ebiggers@kernel.org Signed-off-by: Eric Biggers --- crypto/Kconfig | 1 + crypto/Makefile | 2 +- crypto/sha3.c | 166 ++++++++++++++++++++++++ crypto/sha3_generic.c | 290 ------------------------------------------ crypto/testmgr.c | 8 ++ include/crypto/sha3.h | 6 - 6 files changed, 176 insertions(+), 297 deletions(-) create mode 100644 crypto/sha3.c delete mode 100644 crypto/sha3_generic.c diff --git a/crypto/Kconfig b/crypto/Kconfig index 0a7e74ac870b..57b85e903cf0 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -1006,6 +1006,7 @@ config CRYPTO_SHA512 config CRYPTO_SHA3 tristate "SHA-3" select CRYPTO_HASH + select CRYPTO_LIB_SHA3 help SHA-3 secure hash algorithms (FIPS 202, ISO/IEC 10118-3) diff --git a/crypto/Makefile b/crypto/Makefile index 5b02ca2cb04e..0388ff8d219d 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -78,7 +78,7 @@ obj-$(CONFIG_CRYPTO_RMD160) += rmd160.o obj-$(CONFIG_CRYPTO_SHA1) += sha1.o obj-$(CONFIG_CRYPTO_SHA256) += sha256.o obj-$(CONFIG_CRYPTO_SHA512) += sha512.o -obj-$(CONFIG_CRYPTO_SHA3) += sha3_generic.o +obj-$(CONFIG_CRYPTO_SHA3) += sha3.o obj-$(CONFIG_CRYPTO_SM3_GENERIC) += sm3_generic.o obj-$(CONFIG_CRYPTO_STREEBOG) += streebog_generic.o obj-$(CONFIG_CRYPTO_WP512) += wp512.o diff --git a/crypto/sha3.c b/crypto/sha3.c new file mode 100644 index 000000000000..8f364979ec89 --- /dev/null +++ b/crypto/sha3.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Crypto API support for SHA-3 + * (https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf) + */ +#include +#include +#include +#include + +#define SHA3_CTX(desc) ((struct sha3_ctx *)shash_desc_ctx(desc)) + +static int crypto_sha3_224_init(struct shash_desc *desc) +{ + sha3_224_init(SHA3_CTX(desc)); + return 0; +} + +static int crypto_sha3_256_init(struct shash_desc *desc) +{ + sha3_256_init(SHA3_CTX(desc)); + return 0; +} + +static int crypto_sha3_384_init(struct shash_desc *desc) +{ + sha3_384_init(SHA3_CTX(desc)); + return 0; +} + +static int crypto_sha3_512_init(struct shash_desc *desc) +{ + sha3_512_init(SHA3_CTX(desc)); + return 0; +} + +static int crypto_sha3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + sha3_update(SHA3_CTX(desc), data, len); + return 0; +} + +static int crypto_sha3_final(struct shash_desc *desc, u8 *out) +{ + sha3_final(SHA3_CTX(desc), out); + return 0; +} + +static int crypto_sha3_224_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, u8 *out) +{ + sha3_224(data, len, out); + return 0; +} + +static int crypto_sha3_256_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, u8 *out) +{ + sha3_256(data, len, out); + return 0; +} + +static int crypto_sha3_384_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, u8 *out) +{ + sha3_384(data, len, out); + return 0; +} + +static int crypto_sha3_512_digest(struct shash_desc *desc, + const u8 *data, unsigned int len, u8 *out) +{ + sha3_512(data, len, out); + return 0; +} + +static int crypto_sha3_export_core(struct shash_desc *desc, void *out) +{ + memcpy(out, SHA3_CTX(desc), sizeof(struct sha3_ctx)); + return 0; +} + +static int crypto_sha3_import_core(struct shash_desc *desc, const void *in) +{ + memcpy(SHA3_CTX(desc), in, sizeof(struct sha3_ctx)); + return 0; +} + +static struct shash_alg algs[] = { { + .digestsize = 
SHA3_224_DIGEST_SIZE, + .init = crypto_sha3_224_init, + .update = crypto_sha3_update, + .final = crypto_sha3_final, + .digest = crypto_sha3_224_digest, + .export_core = crypto_sha3_export_core, + .import_core = crypto_sha3_import_core, + .descsize = sizeof(struct sha3_ctx), + .base.cra_name = "sha3-224", + .base.cra_driver_name = "sha3-224-lib", + .base.cra_blocksize = SHA3_224_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}, { + .digestsize = SHA3_256_DIGEST_SIZE, + .init = crypto_sha3_256_init, + .update = crypto_sha3_update, + .final = crypto_sha3_final, + .digest = crypto_sha3_256_digest, + .export_core = crypto_sha3_export_core, + .import_core = crypto_sha3_import_core, + .descsize = sizeof(struct sha3_ctx), + .base.cra_name = "sha3-256", + .base.cra_driver_name = "sha3-256-lib", + .base.cra_blocksize = SHA3_256_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}, { + .digestsize = SHA3_384_DIGEST_SIZE, + .init = crypto_sha3_384_init, + .update = crypto_sha3_update, + .final = crypto_sha3_final, + .digest = crypto_sha3_384_digest, + .export_core = crypto_sha3_export_core, + .import_core = crypto_sha3_import_core, + .descsize = sizeof(struct sha3_ctx), + .base.cra_name = "sha3-384", + .base.cra_driver_name = "sha3-384-lib", + .base.cra_blocksize = SHA3_384_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +}, { + .digestsize = SHA3_512_DIGEST_SIZE, + .init = crypto_sha3_512_init, + .update = crypto_sha3_update, + .final = crypto_sha3_final, + .digest = crypto_sha3_512_digest, + .export_core = crypto_sha3_export_core, + .import_core = crypto_sha3_import_core, + .descsize = sizeof(struct sha3_ctx), + .base.cra_name = "sha3-512", + .base.cra_driver_name = "sha3-512-lib", + .base.cra_blocksize = SHA3_512_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, +} }; + +static int __init crypto_sha3_mod_init(void) +{ + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); +} +module_init(crypto_sha3_mod_init); + +static void __exit crypto_sha3_mod_exit(void) +{ + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} +module_exit(crypto_sha3_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Crypto API support for SHA-3"); + +MODULE_ALIAS_CRYPTO("sha3-224"); +MODULE_ALIAS_CRYPTO("sha3-224-lib"); +MODULE_ALIAS_CRYPTO("sha3-256"); +MODULE_ALIAS_CRYPTO("sha3-256-lib"); +MODULE_ALIAS_CRYPTO("sha3-384"); +MODULE_ALIAS_CRYPTO("sha3-384-lib"); +MODULE_ALIAS_CRYPTO("sha3-512"); +MODULE_ALIAS_CRYPTO("sha3-512-lib"); diff --git a/crypto/sha3_generic.c b/crypto/sha3_generic.c deleted file mode 100644 index 41d1e506e6de..000000000000 --- a/crypto/sha3_generic.c +++ /dev/null @@ -1,290 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Cryptographic API. - * - * SHA-3, as specified in - * https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf - * - * SHA-3 code by Jeff Garzik - * Ard Biesheuvel - */ -#include -#include -#include -#include -#include -#include - -/* - * On some 32-bit architectures (h8300), GCC ends up using - * over 1 KB of stack if we inline the round calculation into the loop - * in keccakf(). On the other hand, on 64-bit architectures with plenty - * of [64-bit wide] general purpose registers, not inlining it severely - * hurts performance. So let's use 64-bitness as a heuristic to decide - * whether to inline or not. 
- */ -#ifdef CONFIG_64BIT -#define SHA3_INLINE inline -#else -#define SHA3_INLINE noinline -#endif - -#define KECCAK_ROUNDS 24 - -static const u64 keccakf_rndc[24] = { - 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL, - 0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL, - 0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL, - 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL, - 0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL, - 0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL, - 0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL, - 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL -}; - -/* update the state with given number of rounds */ - -static SHA3_INLINE void keccakf_round(u64 st[25]) -{ - u64 t[5], tt, bc[5]; - - /* Theta */ - bc[0] = st[0] ^ st[5] ^ st[10] ^ st[15] ^ st[20]; - bc[1] = st[1] ^ st[6] ^ st[11] ^ st[16] ^ st[21]; - bc[2] = st[2] ^ st[7] ^ st[12] ^ st[17] ^ st[22]; - bc[3] = st[3] ^ st[8] ^ st[13] ^ st[18] ^ st[23]; - bc[4] = st[4] ^ st[9] ^ st[14] ^ st[19] ^ st[24]; - - t[0] = bc[4] ^ rol64(bc[1], 1); - t[1] = bc[0] ^ rol64(bc[2], 1); - t[2] = bc[1] ^ rol64(bc[3], 1); - t[3] = bc[2] ^ rol64(bc[4], 1); - t[4] = bc[3] ^ rol64(bc[0], 1); - - st[0] ^= t[0]; - - /* Rho Pi */ - tt = st[1]; - st[ 1] = rol64(st[ 6] ^ t[1], 44); - st[ 6] = rol64(st[ 9] ^ t[4], 20); - st[ 9] = rol64(st[22] ^ t[2], 61); - st[22] = rol64(st[14] ^ t[4], 39); - st[14] = rol64(st[20] ^ t[0], 18); - st[20] = rol64(st[ 2] ^ t[2], 62); - st[ 2] = rol64(st[12] ^ t[2], 43); - st[12] = rol64(st[13] ^ t[3], 25); - st[13] = rol64(st[19] ^ t[4], 8); - st[19] = rol64(st[23] ^ t[3], 56); - st[23] = rol64(st[15] ^ t[0], 41); - st[15] = rol64(st[ 4] ^ t[4], 27); - st[ 4] = rol64(st[24] ^ t[4], 14); - st[24] = rol64(st[21] ^ t[1], 2); - st[21] = rol64(st[ 8] ^ t[3], 55); - st[ 8] = rol64(st[16] ^ t[1], 45); - st[16] = rol64(st[ 5] ^ t[0], 36); - st[ 5] = rol64(st[ 3] ^ t[3], 28); - st[ 3] = rol64(st[18] ^ t[3], 21); - st[18] = rol64(st[17] ^ t[2], 15); - st[17] = rol64(st[11] ^ t[1], 10); - st[11] = rol64(st[ 7] ^ t[2], 6); - st[ 7] = rol64(st[10] ^ t[0], 3); - st[10] = rol64( tt ^ t[1], 1); - - /* Chi */ - bc[ 0] = ~st[ 1] & st[ 2]; - bc[ 1] = ~st[ 2] & st[ 3]; - bc[ 2] = ~st[ 3] & st[ 4]; - bc[ 3] = ~st[ 4] & st[ 0]; - bc[ 4] = ~st[ 0] & st[ 1]; - st[ 0] ^= bc[ 0]; - st[ 1] ^= bc[ 1]; - st[ 2] ^= bc[ 2]; - st[ 3] ^= bc[ 3]; - st[ 4] ^= bc[ 4]; - - bc[ 0] = ~st[ 6] & st[ 7]; - bc[ 1] = ~st[ 7] & st[ 8]; - bc[ 2] = ~st[ 8] & st[ 9]; - bc[ 3] = ~st[ 9] & st[ 5]; - bc[ 4] = ~st[ 5] & st[ 6]; - st[ 5] ^= bc[ 0]; - st[ 6] ^= bc[ 1]; - st[ 7] ^= bc[ 2]; - st[ 8] ^= bc[ 3]; - st[ 9] ^= bc[ 4]; - - bc[ 0] = ~st[11] & st[12]; - bc[ 1] = ~st[12] & st[13]; - bc[ 2] = ~st[13] & st[14]; - bc[ 3] = ~st[14] & st[10]; - bc[ 4] = ~st[10] & st[11]; - st[10] ^= bc[ 0]; - st[11] ^= bc[ 1]; - st[12] ^= bc[ 2]; - st[13] ^= bc[ 3]; - st[14] ^= bc[ 4]; - - bc[ 0] = ~st[16] & st[17]; - bc[ 1] = ~st[17] & st[18]; - bc[ 2] = ~st[18] & st[19]; - bc[ 3] = ~st[19] & st[15]; - bc[ 4] = ~st[15] & st[16]; - st[15] ^= bc[ 0]; - st[16] ^= bc[ 1]; - st[17] ^= bc[ 2]; - st[18] ^= bc[ 3]; - st[19] ^= bc[ 4]; - - bc[ 0] = ~st[21] & st[22]; - bc[ 1] = ~st[22] & st[23]; - bc[ 2] = ~st[23] & st[24]; - bc[ 3] = ~st[24] & st[20]; - bc[ 4] = ~st[20] & st[21]; - st[20] ^= bc[ 0]; - st[21] ^= bc[ 1]; - st[22] ^= bc[ 2]; - st[23] ^= bc[ 3]; - st[24] ^= bc[ 4]; -} - -static void keccakf(u64 st[25]) -{ - int round; - - 
for (round = 0; round < KECCAK_ROUNDS; round++) { - keccakf_round(st); - /* Iota */ - st[0] ^= keccakf_rndc[round]; - } -} - -int crypto_sha3_init(struct shash_desc *desc) -{ - struct sha3_state *sctx = shash_desc_ctx(desc); - - memset(sctx->st, 0, sizeof(sctx->st)); - return 0; -} -EXPORT_SYMBOL(crypto_sha3_init); - -static int crypto_sha3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - unsigned int rsiz = crypto_shash_blocksize(desc->tfm); - struct sha3_state *sctx = shash_desc_ctx(desc); - unsigned int rsizw = rsiz / 8; - - do { - int i; - - for (i = 0; i < rsizw; i++) - sctx->st[i] ^= get_unaligned_le64(data + 8 * i); - keccakf(sctx->st); - - data += rsiz; - len -= rsiz; - } while (len >= rsiz); - return len; -} - -static int crypto_sha3_finup(struct shash_desc *desc, const u8 *src, - unsigned int len, u8 *out) -{ - unsigned int digest_size = crypto_shash_digestsize(desc->tfm); - unsigned int rsiz = crypto_shash_blocksize(desc->tfm); - struct sha3_state *sctx = shash_desc_ctx(desc); - __le64 block[SHA3_224_BLOCK_SIZE / 8] = {}; - __le64 *digest = (__le64 *)out; - unsigned int rsizw = rsiz / 8; - u8 *p; - int i; - - p = memcpy(block, src, len); - p[len++] = 0x06; - p[rsiz - 1] |= 0x80; - - for (i = 0; i < rsizw; i++) - sctx->st[i] ^= le64_to_cpu(block[i]); - memzero_explicit(block, sizeof(block)); - - keccakf(sctx->st); - - for (i = 0; i < digest_size / 8; i++) - put_unaligned_le64(sctx->st[i], digest++); - - if (digest_size & 4) - put_unaligned_le32(sctx->st[i], (__le32 *)digest); - - return 0; -} - -static struct shash_alg algs[] = { { - .digestsize = SHA3_224_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = crypto_sha3_update, - .finup = crypto_sha3_finup, - .descsize = SHA3_STATE_SIZE, - .base.cra_name = "sha3-224", - .base.cra_driver_name = "sha3-224-generic", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_224_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -}, { - .digestsize = SHA3_256_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = crypto_sha3_update, - .finup = crypto_sha3_finup, - .descsize = SHA3_STATE_SIZE, - .base.cra_name = "sha3-256", - .base.cra_driver_name = "sha3-256-generic", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_256_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -}, { - .digestsize = SHA3_384_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = crypto_sha3_update, - .finup = crypto_sha3_finup, - .descsize = SHA3_STATE_SIZE, - .base.cra_name = "sha3-384", - .base.cra_driver_name = "sha3-384-generic", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_384_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -}, { - .digestsize = SHA3_512_DIGEST_SIZE, - .init = crypto_sha3_init, - .update = crypto_sha3_update, - .finup = crypto_sha3_finup, - .descsize = SHA3_STATE_SIZE, - .base.cra_name = "sha3-512", - .base.cra_driver_name = "sha3-512-generic", - .base.cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .base.cra_blocksize = SHA3_512_BLOCK_SIZE, - .base.cra_module = THIS_MODULE, -} }; - -static int __init sha3_generic_mod_init(void) -{ - return crypto_register_shashes(algs, ARRAY_SIZE(algs)); -} - -static void __exit sha3_generic_mod_fini(void) -{ - crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); -} - -module_init(sha3_generic_mod_init); -module_exit(sha3_generic_mod_fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA-3 Secure Hash Algorithm"); - -MODULE_ALIAS_CRYPTO("sha3-224"); -MODULE_ALIAS_CRYPTO("sha3-224-generic"); 
-MODULE_ALIAS_CRYPTO("sha3-256"); -MODULE_ALIAS_CRYPTO("sha3-256-generic"); -MODULE_ALIAS_CRYPTO("sha3-384"); -MODULE_ALIAS_CRYPTO("sha3-384-generic"); -MODULE_ALIAS_CRYPTO("sha3-512"); -MODULE_ALIAS_CRYPTO("sha3-512-generic"); diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 3ab7adc1cdce..90d06c3ec967 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -5104,6 +5104,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "hmac(sha3-224)", + .generic_driver = "hmac(sha3-224-lib)", .test = alg_test_hash, .fips_allowed = 1, .suite = { @@ -5111,6 +5112,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "hmac(sha3-256)", + .generic_driver = "hmac(sha3-256-lib)", .test = alg_test_hash, .fips_allowed = 1, .suite = { @@ -5118,6 +5120,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "hmac(sha3-384)", + .generic_driver = "hmac(sha3-384-lib)", .test = alg_test_hash, .fips_allowed = 1, .suite = { @@ -5125,6 +5128,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "hmac(sha3-512)", + .generic_driver = "hmac(sha3-512-lib)", .test = alg_test_hash, .fips_allowed = 1, .suite = { @@ -5478,6 +5482,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "sha3-224", + .generic_driver = "sha3-224-lib", .test = alg_test_hash, .fips_allowed = 1, .suite = { @@ -5485,6 +5490,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "sha3-256", + .generic_driver = "sha3-256-lib", .test = alg_test_hash, .fips_allowed = 1, .suite = { @@ -5492,6 +5498,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "sha3-384", + .generic_driver = "sha3-384-lib", .test = alg_test_hash, .fips_allowed = 1, .suite = { @@ -5499,6 +5506,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "sha3-512", + .generic_driver = "sha3-512-lib", .test = alg_test_hash, .fips_allowed = 1, .suite = { diff --git a/include/crypto/sha3.h b/include/crypto/sha3.h index c0c468ee099e..c9e4182ff74f 100644 --- a/include/crypto/sha3.h +++ b/include/crypto/sha3.h @@ -37,10 +37,6 @@ #define SHA3_STATE_SIZE 200 -struct shash_desc; - -int crypto_sha3_init(struct shash_desc *desc); - /* * State for the Keccak-f[1600] permutation: 25 64-bit words. * @@ -52,8 +48,6 @@ int crypto_sha3_init(struct shash_desc *desc); */ struct sha3_state { union { - u64 st[SHA3_STATE_SIZE / 8]; /* temporarily retained for compatibility purposes */ - __le64 words[SHA3_STATE_SIZE / 8]; u8 bytes[SHA3_STATE_SIZE]; From 496df7cd649dcb0437d3dbde5231bd5dcd77d2ec Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 25 Oct 2025 22:50:32 -0700 Subject: [PATCH 22/37] crypto: s390/sha3 - Remove superseded SHA-3 code The SHA-3 library now utilizes the same s390 SHA-3 acceleration capabilities as the arch/s390/crypto/ SHA-3 crypto_shash algorithms. Moreover, crypto/sha3.c now uses the SHA-3 library. The result is that all SHA-3 APIs are now s390-accelerated without any need for the old SHA-3 code in arch/s390/crypto/. Remove this superseded code. Also update the s390 defconfig and debug_defconfig files to enable CONFIG_CRYPTO_SHA3 instead of CONFIG_CRYPTO_SHA3_256_S390 and CONFIG_CRYPTO_SHA3_512_S390. This makes it so that the s390-optimized SHA-3 continues to be built when either of these defconfigs is used. 
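A quick userspace smoke test that "sha3-256" still resolves after the removal (editor's sketch, not part of this patch; needs CONFIG_CRYPTO_USER_API_HASH, which both defconfigs above build as modules):

    #include <linux/if_alg.h>
    #include <stdio.h>
    #include <sys/socket.h>
    #include <unistd.h>

    int main(void)
    {
            struct sockaddr_alg sa = {
                    .salg_family = AF_ALG,
                    .salg_type   = "hash",
                    .salg_name   = "sha3-256", /* now served by sha3-256-lib */
            };
            unsigned char digest[32];
            int tfm, op;

            tfm = socket(AF_ALG, SOCK_SEQPACKET, 0);
            if (tfm < 0 || bind(tfm, (struct sockaddr *)&sa, sizeof(sa)) != 0)
                    return 1;
            op = accept(tfm, NULL, NULL);
            if (op < 0 || write(op, "abc", 3) != 3 ||
                read(op, digest, sizeof(digest)) != (ssize_t)sizeof(digest))
                    return 1;
            for (int i = 0; i < 32; i++)
                    printf("%02x", digest[i]);
            putchar('\n');
            return 0;
    }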
Tested-by: Harald Freudenberger Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251026055032.1413733-16-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/s390/configs/debug_defconfig | 3 +- arch/s390/configs/defconfig | 3 +- arch/s390/crypto/Kconfig | 20 ---- arch/s390/crypto/Makefile | 2 - arch/s390/crypto/sha.h | 51 ---------- arch/s390/crypto/sha3_256_s390.c | 157 ------------------------------ arch/s390/crypto/sha3_512_s390.c | 157 ------------------------------ arch/s390/crypto/sha_common.c | 117 ---------------------- 8 files changed, 2 insertions(+), 508 deletions(-) delete mode 100644 arch/s390/crypto/sha.h delete mode 100644 arch/s390/crypto/sha3_256_s390.c delete mode 100644 arch/s390/crypto/sha3_512_s390.c delete mode 100644 arch/s390/crypto/sha_common.c diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index b31c1df90257..5fdfebcfd50f 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -792,6 +792,7 @@ CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_SM3_GENERIC=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_XCBC=m @@ -805,8 +806,6 @@ CONFIG_CRYPTO_USER_API_HASH=m CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m -CONFIG_CRYPTO_SHA3_256_S390=m -CONFIG_CRYPTO_SHA3_512_S390=m CONFIG_CRYPTO_GHASH_S390=m CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_DES_S390=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 161dad7ef211..7bac3f53a95b 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -776,6 +776,7 @@ CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_SHA3=m CONFIG_CRYPTO_SM3_GENERIC=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_XCBC=m @@ -790,8 +791,6 @@ CONFIG_CRYPTO_USER_API_HASH=m CONFIG_CRYPTO_USER_API_SKCIPHER=m CONFIG_CRYPTO_USER_API_RNG=m CONFIG_CRYPTO_USER_API_AEAD=m -CONFIG_CRYPTO_SHA3_256_S390=m -CONFIG_CRYPTO_SHA3_512_S390=m CONFIG_CRYPTO_GHASH_S390=m CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_DES_S390=m diff --git a/arch/s390/crypto/Kconfig b/arch/s390/crypto/Kconfig index 03f73fbd38b6..f838ca055f6d 100644 --- a/arch/s390/crypto/Kconfig +++ b/arch/s390/crypto/Kconfig @@ -2,26 +2,6 @@ menu "Accelerated Cryptographic Algorithms for CPU (s390)" -config CRYPTO_SHA3_256_S390 - tristate "Hash functions: SHA3-224 and SHA3-256" - select CRYPTO_HASH - help - SHA3-224 and SHA3-256 secure hash algorithms (FIPS 202) - - Architecture: s390 - - It is available as of z14. - -config CRYPTO_SHA3_512_S390 - tristate "Hash functions: SHA3-384 and SHA3-512" - select CRYPTO_HASH - help - SHA3-384 and SHA3-512 secure hash algorithms (FIPS 202) - - Architecture: s390 - - It is available as of z14. 
- config CRYPTO_GHASH_S390 tristate "Hash functions: GHASH" select CRYPTO_HASH diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile index 998f4b656b18..387a229e1038 100644 --- a/arch/s390/crypto/Makefile +++ b/arch/s390/crypto/Makefile @@ -3,8 +3,6 @@ # Cryptographic API # -obj-$(CONFIG_CRYPTO_SHA3_256_S390) += sha3_256_s390.o sha_common.o -obj-$(CONFIG_CRYPTO_SHA3_512_S390) += sha3_512_s390.o sha_common.o obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o obj-$(CONFIG_CRYPTO_PAES_S390) += paes_s390.o diff --git a/arch/s390/crypto/sha.h b/arch/s390/crypto/sha.h deleted file mode 100644 index b9cd9572dd35..000000000000 --- a/arch/s390/crypto/sha.h +++ /dev/null @@ -1,51 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * Cryptographic API. - * - * s390 generic implementation of the SHA Secure Hash Algorithms. - * - * Copyright IBM Corp. 2007 - * Author(s): Jan Glauber (jang@de.ibm.com) - */ -#ifndef _CRYPTO_ARCH_S390_SHA_H -#define _CRYPTO_ARCH_S390_SHA_H - -#include -#include -#include -#include -#include - -/* must be big enough for the largest SHA variant */ -#define CPACF_MAX_PARMBLOCK_SIZE SHA3_STATE_SIZE -#define SHA_MAX_BLOCK_SIZE SHA3_224_BLOCK_SIZE - -struct s390_sha_ctx { - u64 count; /* message length in bytes */ - union { - u32 state[CPACF_MAX_PARMBLOCK_SIZE / sizeof(u32)]; - struct { - u64 state[SHA512_DIGEST_SIZE / sizeof(u64)]; - u64 count_hi; - } sha512; - struct { - __le64 state[SHA3_STATE_SIZE / sizeof(u64)]; - } sha3; - }; - int func; /* KIMD function to use */ - bool first_message_part; -}; - -struct shash_desc; - -int s390_sha_update_blocks(struct shash_desc *desc, const u8 *data, - unsigned int len); -int s390_sha_finup(struct shash_desc *desc, const u8 *src, unsigned int len, - u8 *out); - -static inline void __check_s390_sha_ctx_size(void) -{ - BUILD_BUG_ON(S390_SHA_CTX_SIZE != sizeof(struct s390_sha_ctx)); -} - -#endif diff --git a/arch/s390/crypto/sha3_256_s390.c b/arch/s390/crypto/sha3_256_s390.c deleted file mode 100644 index 7415d56649a5..000000000000 --- a/arch/s390/crypto/sha3_256_s390.c +++ /dev/null @@ -1,157 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Cryptographic API. - * - * s390 implementation of the SHA256 and SHA224 Secure Hash Algorithm. - * - * s390 Version: - * Copyright IBM Corp. 
2019 - * Author(s): Joerg Schmidbauer (jschmidb@de.ibm.com) - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sha.h" - -static int s390_sha3_256_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sctx->first_message_part = test_facility(86); - if (!sctx->first_message_part) - memset(sctx->state, 0, sizeof(sctx->state)); - sctx->count = 0; - sctx->func = CPACF_KIMD_SHA3_256; - - return 0; -} - -static int sha3_256_export(struct shash_desc *desc, void *out) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - union { - u8 *u8; - u64 *u64; - } p = { .u8 = out }; - int i; - - if (sctx->first_message_part) { - memset(out, 0, SHA3_STATE_SIZE); - return 0; - } - for (i = 0; i < SHA3_STATE_SIZE / 8; i++) - put_unaligned(le64_to_cpu(sctx->sha3.state[i]), p.u64++); - return 0; -} - -static int sha3_256_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - union { - const u8 *u8; - const u64 *u64; - } p = { .u8 = in }; - int i; - - for (i = 0; i < SHA3_STATE_SIZE / 8; i++) - sctx->sha3.state[i] = cpu_to_le64(get_unaligned(p.u64++)); - sctx->count = 0; - sctx->first_message_part = 0; - sctx->func = CPACF_KIMD_SHA3_256; - - return 0; -} - -static int sha3_224_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sha3_256_import(desc, in); - sctx->func = CPACF_KIMD_SHA3_224; - return 0; -} - -static struct shash_alg sha3_256_alg = { - .digestsize = SHA3_256_DIGEST_SIZE, /* = 32 */ - .init = s390_sha3_256_init, - .update = s390_sha_update_blocks, - .finup = s390_sha_finup, - .export = sha3_256_export, - .import = sha3_256_import, - .descsize = S390_SHA_CTX_SIZE, - .statesize = SHA3_STATE_SIZE, - .base = { - .cra_name = "sha3-256", - .cra_driver_name = "sha3-256-s390", - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA3_256_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int s390_sha3_224_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - s390_sha3_256_init(desc); - sctx->func = CPACF_KIMD_SHA3_224; - return 0; -} - -static struct shash_alg sha3_224_alg = { - .digestsize = SHA3_224_DIGEST_SIZE, - .init = s390_sha3_224_init, - .update = s390_sha_update_blocks, - .finup = s390_sha_finup, - .export = sha3_256_export, /* same as for 256 */ - .import = sha3_224_import, /* function code different! 
*/ - .descsize = S390_SHA_CTX_SIZE, - .statesize = SHA3_STATE_SIZE, - .base = { - .cra_name = "sha3-224", - .cra_driver_name = "sha3-224-s390", - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA3_224_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -static int __init sha3_256_s390_init(void) -{ - int ret; - - if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA3_256)) - return -ENODEV; - - ret = crypto_register_shash(&sha3_256_alg); - if (ret < 0) - goto out; - - ret = crypto_register_shash(&sha3_224_alg); - if (ret < 0) - crypto_unregister_shash(&sha3_256_alg); -out: - return ret; -} - -static void __exit sha3_256_s390_fini(void) -{ - crypto_unregister_shash(&sha3_224_alg); - crypto_unregister_shash(&sha3_256_alg); -} - -module_cpu_feature_match(S390_CPU_FEATURE_MSA, sha3_256_s390_init); -module_exit(sha3_256_s390_fini); - -MODULE_ALIAS_CRYPTO("sha3-256"); -MODULE_ALIAS_CRYPTO("sha3-224"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA3-256 and SHA3-224 Secure Hash Algorithm"); diff --git a/arch/s390/crypto/sha3_512_s390.c b/arch/s390/crypto/sha3_512_s390.c deleted file mode 100644 index ff6ee5584400..000000000000 --- a/arch/s390/crypto/sha3_512_s390.c +++ /dev/null @@ -1,157 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Cryptographic API. - * - * s390 implementation of the SHA512 and SHA384 Secure Hash Algorithm. - * - * Copyright IBM Corp. 2019 - * Author(s): Joerg Schmidbauer (jschmidb@de.ibm.com) - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sha.h" - -static int s390_sha3_512_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sctx->first_message_part = test_facility(86); - if (!sctx->first_message_part) - memset(sctx->state, 0, sizeof(sctx->state)); - sctx->count = 0; - sctx->func = CPACF_KIMD_SHA3_512; - - return 0; -} - -static int sha3_512_export(struct shash_desc *desc, void *out) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - union { - u8 *u8; - u64 *u64; - } p = { .u8 = out }; - int i; - - if (sctx->first_message_part) { - memset(out, 0, SHA3_STATE_SIZE); - return 0; - } - for (i = 0; i < SHA3_STATE_SIZE / 8; i++) - put_unaligned(le64_to_cpu(sctx->sha3.state[i]), p.u64++); - return 0; -} - -static int sha3_512_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - union { - const u8 *u8; - const u64 *u64; - } p = { .u8 = in }; - int i; - - for (i = 0; i < SHA3_STATE_SIZE / 8; i++) - sctx->sha3.state[i] = cpu_to_le64(get_unaligned(p.u64++)); - sctx->count = 0; - sctx->first_message_part = 0; - sctx->func = CPACF_KIMD_SHA3_512; - - return 0; -} - -static int sha3_384_import(struct shash_desc *desc, const void *in) -{ - struct s390_sha_ctx *sctx = shash_desc_ctx(desc); - - sha3_512_import(desc, in); - sctx->func = CPACF_KIMD_SHA3_384; - return 0; -} - -static struct shash_alg sha3_512_alg = { - .digestsize = SHA3_512_DIGEST_SIZE, - .init = s390_sha3_512_init, - .update = s390_sha_update_blocks, - .finup = s390_sha_finup, - .export = sha3_512_export, - .import = sha3_512_import, - .descsize = S390_SHA_CTX_SIZE, - .statesize = SHA3_STATE_SIZE, - .base = { - .cra_name = "sha3-512", - .cra_driver_name = "sha3-512-s390", - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA3_512_BLOCK_SIZE, - .cra_module = THIS_MODULE, - } -}; - -MODULE_ALIAS_CRYPTO("sha3-512"); - -static int s390_sha3_384_init(struct shash_desc *desc) -{ - struct s390_sha_ctx *sctx = 
shash_desc_ctx(desc); - - s390_sha3_512_init(desc); - sctx->func = CPACF_KIMD_SHA3_384; - return 0; -} - -static struct shash_alg sha3_384_alg = { - .digestsize = SHA3_384_DIGEST_SIZE, - .init = s390_sha3_384_init, - .update = s390_sha_update_blocks, - .finup = s390_sha_finup, - .export = sha3_512_export, /* same as for 512 */ - .import = sha3_384_import, /* function code different! */ - .descsize = S390_SHA_CTX_SIZE, - .statesize = SHA3_STATE_SIZE, - .base = { - .cra_name = "sha3-384", - .cra_driver_name = "sha3-384-s390", - .cra_priority = 300, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = SHA3_384_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct s390_sha_ctx), - .cra_module = THIS_MODULE, - } -}; - -MODULE_ALIAS_CRYPTO("sha3-384"); - -static int __init init(void) -{ - int ret; - - if (!cpacf_query_func(CPACF_KIMD, CPACF_KIMD_SHA3_512)) - return -ENODEV; - ret = crypto_register_shash(&sha3_512_alg); - if (ret < 0) - goto out; - ret = crypto_register_shash(&sha3_384_alg); - if (ret < 0) - crypto_unregister_shash(&sha3_512_alg); -out: - return ret; -} - -static void __exit fini(void) -{ - crypto_unregister_shash(&sha3_512_alg); - crypto_unregister_shash(&sha3_384_alg); -} - -module_cpu_feature_match(S390_CPU_FEATURE_MSA, init); -module_exit(fini); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("SHA3-512 and SHA3-384 Secure Hash Algorithm"); diff --git a/arch/s390/crypto/sha_common.c b/arch/s390/crypto/sha_common.c deleted file mode 100644 index d6f839618794..000000000000 --- a/arch/s390/crypto/sha_common.c +++ /dev/null @@ -1,117 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Cryptographic API. - * - * s390 generic implementation of the SHA Secure Hash Algorithms. - * - * Copyright IBM Corp. 2007 - * Author(s): Jan Glauber (jang@de.ibm.com) - */ - -#include -#include -#include -#include -#include "sha.h" - -int s390_sha_update_blocks(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - unsigned int bsize = crypto_shash_blocksize(desc->tfm); - struct s390_sha_ctx *ctx = shash_desc_ctx(desc); - unsigned int n; - int fc; - - fc = ctx->func; - if (ctx->first_message_part) - fc |= CPACF_KIMD_NIP; - - /* process as many blocks as possible */ - n = (len / bsize) * bsize; - ctx->count += n; - switch (ctx->func) { - case CPACF_KLMD_SHA_512: - case CPACF_KLMD_SHA3_384: - if (ctx->count < n) - ctx->sha512.count_hi++; - break; - } - cpacf_kimd(fc, ctx->state, data, n); - ctx->first_message_part = 0; - return len - n; -} -EXPORT_SYMBOL_GPL(s390_sha_update_blocks); - -static int s390_crypto_shash_parmsize(int func) -{ - switch (func) { - case CPACF_KLMD_SHA_1: - return 20; - case CPACF_KLMD_SHA_256: - return 32; - case CPACF_KLMD_SHA_512: - return 64; - case CPACF_KLMD_SHA3_224: - case CPACF_KLMD_SHA3_256: - case CPACF_KLMD_SHA3_384: - case CPACF_KLMD_SHA3_512: - return 200; - default: - return -EINVAL; - } -} - -int s390_sha_finup(struct shash_desc *desc, const u8 *src, unsigned int len, - u8 *out) -{ - struct s390_sha_ctx *ctx = shash_desc_ctx(desc); - int mbl_offset, fc; - u64 bits; - - ctx->count += len; - - bits = ctx->count * 8; - mbl_offset = s390_crypto_shash_parmsize(ctx->func); - if (mbl_offset < 0) - return -EINVAL; - - mbl_offset = mbl_offset / sizeof(u32); - - /* set total msg bit length (mbl) in CPACF parmblock */ - switch (ctx->func) { - case CPACF_KLMD_SHA_512: - /* The SHA512 parmblock has a 128-bit mbl field. 
*/ - if (ctx->count < len) - ctx->sha512.count_hi++; - ctx->sha512.count_hi <<= 3; - ctx->sha512.count_hi |= ctx->count >> 61; - mbl_offset += sizeof(u64) / sizeof(u32); - fallthrough; - case CPACF_KLMD_SHA_1: - case CPACF_KLMD_SHA_256: - memcpy(ctx->state + mbl_offset, &bits, sizeof(bits)); - break; - case CPACF_KLMD_SHA3_224: - case CPACF_KLMD_SHA3_256: - case CPACF_KLMD_SHA3_384: - case CPACF_KLMD_SHA3_512: - break; - default: - return -EINVAL; - } - - fc = ctx->func; - fc |= test_facility(86) ? CPACF_KLMD_DUFOP : 0; - if (ctx->first_message_part) - fc |= CPACF_KLMD_NIP; - cpacf_klmd(fc, ctx->state, src, len); - - /* copy digest to out */ - memcpy(out, ctx->state, crypto_shash_digestsize(desc->tfm)); - - return 0; -} -EXPORT_SYMBOL_GPL(s390_sha_finup); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("s390 SHA cipher common functions"); From b8b816ec046922ef362120c14009516b92a26385 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 1 Nov 2025 19:15:53 -0700 Subject: [PATCH 23/37] lib/crypto: arm/blake2s: Fix some comments Fix the indices in some comments in blake2s-core.S. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251102021553.176587-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/arm/blake2s-core.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/crypto/arm/blake2s-core.S b/lib/crypto/arm/blake2s-core.S index 14eb7c18a836..933f0558b7cd 100644 --- a/lib/crypto/arm/blake2s-core.S +++ b/lib/crypto/arm/blake2s-core.S @@ -115,7 +115,7 @@ // Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9] // are in r0..r9. The stack pointer points to 8 bytes of scratch space for -// spilling v[8..9], then to v[9..15], then to the message block. r10-r12 and +// spilling v[8..9], then to v[10..15], then to the message block. r10-r12 and // r14 are free to use. The macro arguments s0-s15 give the order in which the // message words are used in this round. // @@ -209,18 +209,18 @@ ENTRY(blake2s_compress) .Lcopy_block_done: str r1, [sp, #68] // Update message pointer - // Calculate v[8..15]. Push v[9..15] onto the stack, and leave space + // Calculate v[8..15]. Push v[10..15] onto the stack, and leave space // for spilling v[8..9]. Leave v[8..9] in r8-r9. mov r14, r0 // r14 = ctx adr r12, .Lblake2s_IV ldmia r12!, {r8-r9} // load IV[0..1] __ldrd r0, r1, r14, 40 // load f[0..1] - ldm r12, {r2-r7} // load IV[3..7] + ldm r12, {r2-r7} // load IV[2..7] eor r4, r4, r10 // v[12] = IV[4] ^ t[0] eor r5, r5, r11 // v[13] = IV[5] ^ t[1] eor r6, r6, r0 // v[14] = IV[6] ^ f[0] eor r7, r7, r1 // v[15] = IV[7] ^ f[1] - push {r2-r7} // push v[9..15] + push {r2-r7} // push v[10..15] sub sp, sp, #8 // leave space for v[8..9] // Load h[0..7] == v[0..7]. From 95ce85de0b8c8a1192c692cbd5948e4dcbc1563f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 1 Nov 2025 18:48:06 -0700 Subject: [PATCH 24/37] lib/crypto: arm, arm64: Drop filenames from file comments Remove self-references to filenames from assembly files in lib/crypto/arm/ and lib/crypto/arm64/. This follows the recommended practice and eliminates an outdated reference to sha2-ce-core.S. 
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251102014809.170713-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/arm/sha1-armv7-neon.S | 2 +- lib/crypto/arm/sha1-ce-core.S | 2 +- lib/crypto/arm/sha256-ce.S | 2 +- lib/crypto/arm64/sha1-ce-core.S | 2 +- lib/crypto/arm64/sha256-ce.S | 2 +- lib/crypto/arm64/sha3-ce-core.S | 2 +- lib/crypto/arm64/sha512-ce-core.S | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/crypto/arm/sha1-armv7-neon.S b/lib/crypto/arm/sha1-armv7-neon.S index 6edba3ab62e8..a0323fa5c58a 100644 --- a/lib/crypto/arm/sha1-armv7-neon.S +++ b/lib/crypto/arm/sha1-armv7-neon.S @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function +/* ARM/NEON accelerated SHA-1 transform function * * Copyright © 2013-2014 Jussi Kivilinna */ diff --git a/lib/crypto/arm/sha1-ce-core.S b/lib/crypto/arm/sha1-ce-core.S index 2de40dd25e47..7d6b2631ca8d 100644 --- a/lib/crypto/arm/sha1-ce-core.S +++ b/lib/crypto/arm/sha1-ce-core.S @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions + * SHA-1 secure hash using ARMv8 Crypto Extensions * * Copyright (C) 2015 Linaro Ltd. * Author: Ard Biesheuvel diff --git a/lib/crypto/arm/sha256-ce.S b/lib/crypto/arm/sha256-ce.S index 7481ac8e6c0d..144ee805f64a 100644 --- a/lib/crypto/arm/sha256-ce.S +++ b/lib/crypto/arm/sha256-ce.S @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * sha256-ce.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions + * SHA-224/256 secure hash using ARMv8 Crypto Extensions * * Copyright (C) 2015 Linaro Ltd. * Author: Ard Biesheuvel diff --git a/lib/crypto/arm64/sha1-ce-core.S b/lib/crypto/arm64/sha1-ce-core.S index 21efbbafd7d6..8fbd4767f0f0 100644 --- a/lib/crypto/arm64/sha1-ce-core.S +++ b/lib/crypto/arm64/sha1-ce-core.S @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions + * SHA-1 secure hash using ARMv8 Crypto Extensions * * Copyright (C) 2014 Linaro Ltd */ diff --git a/lib/crypto/arm64/sha256-ce.S b/lib/crypto/arm64/sha256-ce.S index 410174ba5237..e4bfe42a61a9 100644 --- a/lib/crypto/arm64/sha256-ce.S +++ b/lib/crypto/arm64/sha256-ce.S @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions + * Core SHA-224/SHA-256 transform using v8 Crypto Extensions * * Copyright (C) 2014 Linaro Ltd */ diff --git a/lib/crypto/arm64/sha3-ce-core.S b/lib/crypto/arm64/sha3-ce-core.S index b62bd714839b..ace90b506490 100644 --- a/lib/crypto/arm64/sha3-ce-core.S +++ b/lib/crypto/arm64/sha3-ce-core.S @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * sha3-ce-core.S - core SHA-3 transform using v8.2 Crypto Extensions + * Core SHA-3 transform using v8.2 Crypto Extensions * * Copyright (C) 2018 Linaro Ltd * diff --git a/lib/crypto/arm64/sha512-ce-core.S b/lib/crypto/arm64/sha512-ce-core.S index 22f1ded89bc8..ffd51acfd1ee 100644 --- a/lib/crypto/arm64/sha512-ce-core.S +++ b/lib/crypto/arm64/sha512-ce-core.S @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * sha512-ce-core.S - core SHA-384/SHA-512 transform using v8 Crypto Extensions + * Core SHA-384/SHA-512 transform using v8 Crypto Extensions * * Copyright (C) 2018 Linaro Ltd * From 2f22115709fc7ebcfa40af3367a508fbbd2f71e9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 2 Nov 2025 15:42:04 
-0800
Subject: [PATCH 25/37] lib/crypto: x86/blake2s: Fix 32-bit arg treated as 64-bit

In the C code, the 'inc' argument to the assembly functions
blake2s_compress_ssse3() and blake2s_compress_avx512() is declared with type
u32, matching blake2s_compress(). The assembly code then reads it from the
64-bit %rcx. However, the ABI doesn't guarantee zero-extension to 64 bits,
nor do gcc or clang guarantee it. Therefore, fix these functions to read
this argument from the 32-bit %ecx.

In theory, this bug could have caused the wrong 'inc' value to be used,
causing incorrect BLAKE2s hashes. In practice, probably not: I've fixed
essentially this same bug in many other assembly files too, but there's
never been a real report of it having caused a problem. On x86_64, all
writes to 32-bit registers are zero-extended to 64 bits. That results in
zero-extension in nearly all situations. I've only been able to demonstrate
a lack of zero-extension with a somewhat contrived example involving
truncation, e.g. when the C code has a u64 variable holding
0x1234567800000040 and passes it as a u32 expecting it to be truncated to
0x40 (64). But that's not what the real code does, of course.

Fixes: ed0356eda153 ("crypto: blake2s - x86_64 SIMD implementation")
Cc: stable@vger.kernel.org
Reviewed-by: Ard Biesheuvel
Link: https://lore.kernel.org/r/20251102234209.62133-2-ebiggers@kernel.org
Signed-off-by: Eric Biggers
---
 lib/crypto/x86/blake2s-core.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/crypto/x86/blake2s-core.S b/lib/crypto/x86/blake2s-core.S
index ef8e9f427aab..093e7814f387 100644
--- a/lib/crypto/x86/blake2s-core.S
+++ b/lib/crypto/x86/blake2s-core.S
@@ -52,7 +52,7 @@ SYM_FUNC_START(blake2s_compress_ssse3)
 	movdqa	ROT16(%rip),%xmm12
 	movdqa	ROR328(%rip),%xmm13
 	movdqu	0x20(%rdi),%xmm14
-	movq	%rcx,%xmm15
+	movd	%ecx,%xmm15
 	leaq	SIGMA+0xa0(%rip),%r8
 	jmp	.Lbeginofloop
 	.align	32
@@ -176,7 +176,7 @@ SYM_FUNC_START(blake2s_compress_avx512)
 	vmovdqu	(%rdi),%xmm0
 	vmovdqu	0x10(%rdi),%xmm1
 	vmovdqu	0x20(%rdi),%xmm4
-	vmovq	%rcx,%xmm5
+	vmovd	%ecx,%xmm5
 	vmovdqa	IV(%rip),%xmm14
 	vmovdqa	IV+16(%rip),%xmm15
 	jmp	.Lblake2s_compress_avx512_mainloop

From c19bdf24cc274e96006267173d664df2ef4b13db Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Sun, 2 Nov 2025 15:42:05 -0800
Subject: [PATCH 26/37] lib/crypto: x86/blake2s: Drop check for nblocks == 0

Since blake2s_compress() is always passed nblocks != 0, remove the
unnecessary check for nblocks == 0 from blake2s_compress_ssse3(). Note that
this makes it consistent with blake2s_compress_avx512() in the same file as
well as the arm32 blake2s_compress().
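As an illustrative sketch (a hypothetical caller shape, not the actual
lib/crypto/blake2s.c code), the C side only enters the assembly with at
least one full block queued up, which is why the assembly-side check was
dead code:

	/* Hypothetical caller shape; not the actual lib/crypto/blake2s.c code. */
	size_t nblocks = len / BLAKE2S_BLOCK_SIZE;

	if (nblocks)	/* so the asm may assume nblocks != 0 */
		blake2s_compress(state, data, nblocks, BLAKE2S_BLOCK_SIZE);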
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251102234209.62133-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/x86/blake2s-core.S | 3 --- 1 file changed, 3 deletions(-) diff --git a/lib/crypto/x86/blake2s-core.S b/lib/crypto/x86/blake2s-core.S index 093e7814f387..aee13b97cc34 100644 --- a/lib/crypto/x86/blake2s-core.S +++ b/lib/crypto/x86/blake2s-core.S @@ -45,8 +45,6 @@ SIGMA2: .text SYM_FUNC_START(blake2s_compress_ssse3) - testq %rdx,%rdx - je .Lendofloop movdqu (%rdi),%xmm0 movdqu 0x10(%rdi),%xmm1 movdqa ROT16(%rip),%xmm12 @@ -168,7 +166,6 @@ SYM_FUNC_START(blake2s_compress_ssse3) movdqu %xmm0,(%rdi) movdqu %xmm1,0x10(%rdi) movdqu %xmm14,0x20(%rdi) -.Lendofloop: RET SYM_FUNC_END(blake2s_compress_ssse3) From 83c1a867c9999b3bb83e3da8f22eec9ec77a523c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 2 Nov 2025 15:42:06 -0800 Subject: [PATCH 27/37] lib/crypto: x86/blake2s: Use local labels for data Following the usual practice, prefix the names of the data labels with ".L" so that the assembler treats them as truly local. This more clearly expresses the intent and is less error-prone. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251102234209.62133-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/x86/blake2s-core.S | 45 ++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/lib/crypto/x86/blake2s-core.S b/lib/crypto/x86/blake2s-core.S index aee13b97cc34..14e487559c09 100644 --- a/lib/crypto/x86/blake2s-core.S +++ b/lib/crypto/x86/blake2s-core.S @@ -6,19 +6,25 @@ #include -.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32 +.section .rodata.cst32.iv, "aM", @progbits, 32 .align 32 -IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667 +.Liv: + .octa 0xA54FF53A3C6EF372BB67AE856A09E667 .octa 0x5BE0CD191F83D9AB9B05688C510E527F -.section .rodata.cst16.ROT16, "aM", @progbits, 16 + +.section .rodata.cst16.ror16, "aM", @progbits, 16 .align 16 -ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302 -.section .rodata.cst16.ROR328, "aM", @progbits, 16 +.Lror16: + .octa 0x0D0C0F0E09080B0A0504070601000302 + +.section .rodata.cst16.ror8, "aM", @progbits, 16 .align 16 -ROR328: .octa 0x0C0F0E0D080B0A090407060500030201 -.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160 +.Lror8: + .octa 0x0C0F0E0D080B0A090407060500030201 + +.section .rodata.cst64.sigma, "aM", @progbits, 160 .align 64 -SIGMA: +.Lsigma: .byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 .byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7 .byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1 @@ -29,9 +35,10 @@ SIGMA: .byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6 .byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4 .byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12 -.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 160 + +.section .rodata.cst64.sigma2, "aM", @progbits, 160 .align 64 -SIGMA2: +.Lsigma2: .byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 .byte 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7 .byte 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9 @@ -47,21 +54,21 @@ SIGMA2: SYM_FUNC_START(blake2s_compress_ssse3) movdqu (%rdi),%xmm0 movdqu 0x10(%rdi),%xmm1 - movdqa ROT16(%rip),%xmm12 - movdqa ROR328(%rip),%xmm13 + movdqa .Lror16(%rip),%xmm12 + movdqa .Lror8(%rip),%xmm13 movdqu 0x20(%rdi),%xmm14 movd %ecx,%xmm15 - leaq SIGMA+0xa0(%rip),%r8 + leaq .Lsigma+0xa0(%rip),%r8 jmp .Lbeginofloop .align 32 .Lbeginofloop: movdqa %xmm0,%xmm10 movdqa %xmm1,%xmm11 paddq 
%xmm15,%xmm14 - movdqa IV(%rip),%xmm2 + movdqa .Liv(%rip),%xmm2 movdqa %xmm14,%xmm3 - pxor IV+0x10(%rip),%xmm3 - leaq SIGMA(%rip),%rcx + pxor .Liv+0x10(%rip),%xmm3 + leaq .Lsigma(%rip),%rcx .Lroundloop: movzbl (%rcx),%eax movd (%rsi,%rax,4),%xmm4 @@ -174,8 +181,8 @@ SYM_FUNC_START(blake2s_compress_avx512) vmovdqu 0x10(%rdi),%xmm1 vmovdqu 0x20(%rdi),%xmm4 vmovd %ecx,%xmm5 - vmovdqa IV(%rip),%xmm14 - vmovdqa IV+16(%rip),%xmm15 + vmovdqa .Liv(%rip),%xmm14 + vmovdqa .Liv+16(%rip),%xmm15 jmp .Lblake2s_compress_avx512_mainloop .align 32 .Lblake2s_compress_avx512_mainloop: @@ -187,7 +194,7 @@ SYM_FUNC_START(blake2s_compress_avx512) vmovdqu (%rsi),%ymm6 vmovdqu 0x20(%rsi),%ymm7 addq $0x40,%rsi - leaq SIGMA2(%rip),%rax + leaq .Lsigma2(%rip),%rax movb $0xa,%cl .Lblake2s_compress_avx512_roundloop: vpmovzxbd (%rax),%ymm8 From a7acd77ebd7f17b07a6ab2ca1dd1e4d487bdfa80 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 2 Nov 2025 15:42:07 -0800 Subject: [PATCH 28/37] lib/crypto: x86/blake2s: Improve readability Various cleanups for readability. No change to the generated code: - Add some comments - Add #defines for arguments - Rename some labels - Use decimal constants instead of hex where it makes sense. (The pshufd immediates intentionally remain as hex.) - Add blank lines when there's a logical break The round loop still could use some work, but this is at least a start. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251102234209.62133-5-ebiggers@kernel.org Signed-off-by: Eric Biggers --- lib/crypto/x86/blake2s-core.S | 231 ++++++++++++++++++++-------------- 1 file changed, 134 insertions(+), 97 deletions(-) diff --git a/lib/crypto/x86/blake2s-core.S b/lib/crypto/x86/blake2s-core.S index 14e487559c09..f805a49c590d 100644 --- a/lib/crypto/x86/blake2s-core.S +++ b/lib/crypto/x86/blake2s-core.S @@ -50,34 +50,52 @@ .byte 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14 .byte 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9 +#define CTX %rdi +#define DATA %rsi +#define NBLOCKS %rdx +#define INC %ecx + .text +// +// void blake2s_compress_ssse3(struct blake2s_ctx *ctx, +// const u8 *data, size_t nblocks, u32 inc); +// +// Only the first three fields of struct blake2s_ctx are used: +// u32 h[8]; (inout) +// u32 t[2]; (inout) +// u32 f[2]; (in) +// SYM_FUNC_START(blake2s_compress_ssse3) - movdqu (%rdi),%xmm0 - movdqu 0x10(%rdi),%xmm1 + movdqu (CTX),%xmm0 // Load h[0..3] + movdqu 16(CTX),%xmm1 // Load h[4..7] movdqa .Lror16(%rip),%xmm12 movdqa .Lror8(%rip),%xmm13 - movdqu 0x20(%rdi),%xmm14 - movd %ecx,%xmm15 - leaq .Lsigma+0xa0(%rip),%r8 - jmp .Lbeginofloop + movdqu 32(CTX),%xmm14 // Load t and f + movd INC,%xmm15 // Load inc + leaq .Lsigma+160(%rip),%r8 + jmp .Lssse3_mainloop + .align 32 -.Lbeginofloop: - movdqa %xmm0,%xmm10 - movdqa %xmm1,%xmm11 - paddq %xmm15,%xmm14 - movdqa .Liv(%rip),%xmm2 +.Lssse3_mainloop: + // Main loop: each iteration processes one 64-byte block. + movdqa %xmm0,%xmm10 // Save h[0..3] and let v[0..3] = h[0..3] + movdqa %xmm1,%xmm11 // Save h[4..7] and let v[4..7] = h[4..7] + paddq %xmm15,%xmm14 // t += inc (64-bit addition) + movdqa .Liv(%rip),%xmm2 // v[8..11] = iv[0..3] movdqa %xmm14,%xmm3 - pxor .Liv+0x10(%rip),%xmm3 + pxor .Liv+16(%rip),%xmm3 // v[12..15] = iv[4..7] ^ [t, f] leaq .Lsigma(%rip),%rcx -.Lroundloop: + +.Lssse3_roundloop: + // Round loop: each iteration does 1 round (of 10 rounds total). 
movzbl (%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - movzbl 0x1(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - movzbl 0x2(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - movzbl 0x3(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 + movd (DATA,%rax,4),%xmm4 + movzbl 1(%rcx),%eax + movd (DATA,%rax,4),%xmm5 + movzbl 2(%rcx),%eax + movd (DATA,%rax,4),%xmm6 + movzbl 3(%rcx),%eax + movd (DATA,%rax,4),%xmm7 punpckldq %xmm5,%xmm4 punpckldq %xmm7,%xmm6 punpcklqdq %xmm6,%xmm4 @@ -88,17 +106,17 @@ SYM_FUNC_START(blake2s_compress_ssse3) paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm8 - psrld $0xc,%xmm1 - pslld $0x14,%xmm8 + psrld $12,%xmm1 + pslld $20,%xmm8 por %xmm8,%xmm1 - movzbl 0x4(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - movzbl 0x5(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - movzbl 0x6(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - movzbl 0x7(%rcx),%eax - movd (%rsi,%rax,4),%xmm4 + movzbl 4(%rcx),%eax + movd (DATA,%rax,4),%xmm5 + movzbl 5(%rcx),%eax + movd (DATA,%rax,4),%xmm6 + movzbl 6(%rcx),%eax + movd (DATA,%rax,4),%xmm7 + movzbl 7(%rcx),%eax + movd (DATA,%rax,4),%xmm4 punpckldq %xmm6,%xmm5 punpckldq %xmm4,%xmm7 punpcklqdq %xmm7,%xmm5 @@ -109,20 +127,20 @@ SYM_FUNC_START(blake2s_compress_ssse3) paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm8 - psrld $0x7,%xmm1 - pslld $0x19,%xmm8 + psrld $7,%xmm1 + pslld $25,%xmm8 por %xmm8,%xmm1 pshufd $0x93,%xmm0,%xmm0 pshufd $0x4e,%xmm3,%xmm3 pshufd $0x39,%xmm2,%xmm2 - movzbl 0x8(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 - movzbl 0x9(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - movzbl 0xa(%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - movzbl 0xb(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 + movzbl 8(%rcx),%eax + movd (DATA,%rax,4),%xmm6 + movzbl 9(%rcx),%eax + movd (DATA,%rax,4),%xmm7 + movzbl 10(%rcx),%eax + movd (DATA,%rax,4),%xmm4 + movzbl 11(%rcx),%eax + movd (DATA,%rax,4),%xmm5 punpckldq %xmm7,%xmm6 punpckldq %xmm5,%xmm4 punpcklqdq %xmm4,%xmm6 @@ -133,17 +151,17 @@ SYM_FUNC_START(blake2s_compress_ssse3) paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm8 - psrld $0xc,%xmm1 - pslld $0x14,%xmm8 + psrld $12,%xmm1 + pslld $20,%xmm8 por %xmm8,%xmm1 - movzbl 0xc(%rcx),%eax - movd (%rsi,%rax,4),%xmm7 - movzbl 0xd(%rcx),%eax - movd (%rsi,%rax,4),%xmm4 - movzbl 0xe(%rcx),%eax - movd (%rsi,%rax,4),%xmm5 - movzbl 0xf(%rcx),%eax - movd (%rsi,%rax,4),%xmm6 + movzbl 12(%rcx),%eax + movd (DATA,%rax,4),%xmm7 + movzbl 13(%rcx),%eax + movd (DATA,%rax,4),%xmm4 + movzbl 14(%rcx),%eax + movd (DATA,%rax,4),%xmm5 + movzbl 15(%rcx),%eax + movd (DATA,%rax,4),%xmm6 punpckldq %xmm4,%xmm7 punpckldq %xmm6,%xmm5 punpcklqdq %xmm5,%xmm7 @@ -154,52 +172,68 @@ SYM_FUNC_START(blake2s_compress_ssse3) paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm8 - psrld $0x7,%xmm1 - pslld $0x19,%xmm8 + psrld $7,%xmm1 + pslld $25,%xmm8 por %xmm8,%xmm1 pshufd $0x39,%xmm0,%xmm0 pshufd $0x4e,%xmm3,%xmm3 pshufd $0x93,%xmm2,%xmm2 - addq $0x10,%rcx + addq $16,%rcx cmpq %r8,%rcx - jnz .Lroundloop + jnz .Lssse3_roundloop + + // Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15] pxor %xmm2,%xmm0 pxor %xmm3,%xmm1 pxor %xmm10,%xmm0 pxor %xmm11,%xmm1 - addq $0x40,%rsi - decq %rdx - jnz .Lbeginofloop - movdqu %xmm0,(%rdi) - movdqu %xmm1,0x10(%rdi) - movdqu %xmm14,0x20(%rdi) + addq $64,DATA + decq NBLOCKS + jnz .Lssse3_mainloop + + movdqu %xmm0,(CTX) // Store new h[0..3] + movdqu %xmm1,16(CTX) // Store new h[4..7] + movdqu %xmm14,32(CTX) // Store new t and f RET SYM_FUNC_END(blake2s_compress_ssse3) +// +// void blake2s_compress_avx512(struct blake2s_ctx *ctx, +// const u8 *data, size_t nblocks, u32 inc); +// +// Only the first three fields of struct blake2s_ctx 
are used: +// u32 h[8]; (inout) +// u32 t[2]; (inout) +// u32 f[2]; (in) +// SYM_FUNC_START(blake2s_compress_avx512) - vmovdqu (%rdi),%xmm0 - vmovdqu 0x10(%rdi),%xmm1 - vmovdqu 0x20(%rdi),%xmm4 - vmovd %ecx,%xmm5 - vmovdqa .Liv(%rip),%xmm14 - vmovdqa .Liv+16(%rip),%xmm15 - jmp .Lblake2s_compress_avx512_mainloop -.align 32 -.Lblake2s_compress_avx512_mainloop: - vmovdqa %xmm0,%xmm10 - vmovdqa %xmm1,%xmm11 - vpaddq %xmm5,%xmm4,%xmm4 - vmovdqa %xmm14,%xmm2 - vpxor %xmm15,%xmm4,%xmm3 - vmovdqu (%rsi),%ymm6 - vmovdqu 0x20(%rsi),%ymm7 - addq $0x40,%rsi + vmovdqu (CTX),%xmm0 // Load h[0..3] + vmovdqu 16(CTX),%xmm1 // Load h[4..7] + vmovdqu 32(CTX),%xmm4 // Load t and f + vmovd INC,%xmm5 // Load inc + vmovdqa .Liv(%rip),%xmm14 // Load iv[0..3] + vmovdqa .Liv+16(%rip),%xmm15 // Load iv[4..7] + jmp .Lavx512_mainloop + + .align 32 +.Lavx512_mainloop: + // Main loop: each iteration processes one 64-byte block. + vmovdqa %xmm0,%xmm10 // Save h[0..3] and let v[0..3] = h[0..3] + vmovdqa %xmm1,%xmm11 // Save h[4..7] and let v[4..7] = h[4..7] + vpaddq %xmm5,%xmm4,%xmm4 // t += inc (64-bit addition) + vmovdqa %xmm14,%xmm2 // v[8..11] = iv[0..3] + vpxor %xmm15,%xmm4,%xmm3 // v[12..15] = iv[4..7] ^ [t, f] + vmovdqu (DATA),%ymm6 // Load first 8 data words + vmovdqu 32(DATA),%ymm7 // Load second 8 data words + addq $64,DATA leaq .Lsigma2(%rip),%rax - movb $0xa,%cl -.Lblake2s_compress_avx512_roundloop: + movb $10,%cl // Set num rounds remaining + +.Lavx512_roundloop: + // Round loop: each iteration does 1 round (of 10 rounds total). vpmovzxbd (%rax),%ymm8 - vpmovzxbd 0x8(%rax),%ymm9 - addq $0x10,%rax + vpmovzxbd 8(%rax),%ymm9 + addq $16,%rax vpermi2d %ymm7,%ymm6,%ymm8 vpermi2d %ymm7,%ymm6,%ymm9 vmovdqa %ymm8,%ymm6 @@ -207,50 +241,53 @@ SYM_FUNC_START(blake2s_compress_avx512) vpaddd %xmm8,%xmm0,%xmm0 vpaddd %xmm1,%xmm0,%xmm0 vpxor %xmm0,%xmm3,%xmm3 - vprord $0x10,%xmm3,%xmm3 + vprord $16,%xmm3,%xmm3 vpaddd %xmm3,%xmm2,%xmm2 vpxor %xmm2,%xmm1,%xmm1 - vprord $0xc,%xmm1,%xmm1 - vextracti128 $0x1,%ymm8,%xmm8 + vprord $12,%xmm1,%xmm1 + vextracti128 $1,%ymm8,%xmm8 vpaddd %xmm8,%xmm0,%xmm0 vpaddd %xmm1,%xmm0,%xmm0 vpxor %xmm0,%xmm3,%xmm3 - vprord $0x8,%xmm3,%xmm3 + vprord $8,%xmm3,%xmm3 vpaddd %xmm3,%xmm2,%xmm2 vpxor %xmm2,%xmm1,%xmm1 - vprord $0x7,%xmm1,%xmm1 + vprord $7,%xmm1,%xmm1 vpshufd $0x93,%xmm0,%xmm0 vpshufd $0x4e,%xmm3,%xmm3 vpshufd $0x39,%xmm2,%xmm2 vpaddd %xmm9,%xmm0,%xmm0 vpaddd %xmm1,%xmm0,%xmm0 vpxor %xmm0,%xmm3,%xmm3 - vprord $0x10,%xmm3,%xmm3 + vprord $16,%xmm3,%xmm3 vpaddd %xmm3,%xmm2,%xmm2 vpxor %xmm2,%xmm1,%xmm1 - vprord $0xc,%xmm1,%xmm1 - vextracti128 $0x1,%ymm9,%xmm9 + vprord $12,%xmm1,%xmm1 + vextracti128 $1,%ymm9,%xmm9 vpaddd %xmm9,%xmm0,%xmm0 vpaddd %xmm1,%xmm0,%xmm0 vpxor %xmm0,%xmm3,%xmm3 - vprord $0x8,%xmm3,%xmm3 + vprord $8,%xmm3,%xmm3 vpaddd %xmm3,%xmm2,%xmm2 vpxor %xmm2,%xmm1,%xmm1 - vprord $0x7,%xmm1,%xmm1 + vprord $7,%xmm1,%xmm1 vpshufd $0x39,%xmm0,%xmm0 vpshufd $0x4e,%xmm3,%xmm3 vpshufd $0x93,%xmm2,%xmm2 decb %cl - jne .Lblake2s_compress_avx512_roundloop + jne .Lavx512_roundloop + + // Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15] vpxor %xmm10,%xmm0,%xmm0 vpxor %xmm11,%xmm1,%xmm1 vpxor %xmm2,%xmm0,%xmm0 vpxor %xmm3,%xmm1,%xmm1 - decq %rdx - jne .Lblake2s_compress_avx512_mainloop - vmovdqu %xmm0,(%rdi) - vmovdqu %xmm1,0x10(%rdi) - vmovdqu %xmm4,0x20(%rdi) + decq NBLOCKS + jne .Lavx512_mainloop + + vmovdqu %xmm0,(CTX) // Store new h[0..3] + vmovdqu %xmm1,16(CTX) // Store new h[4..7] + vmovdqu %xmm4,32(CTX) // Store new t and f vzeroupper RET SYM_FUNC_END(blake2s_compress_avx512) From 
cd5528621abb01664a477392cd3e76be2ef6296b Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Sun, 2 Nov 2025 15:42:08 -0800
Subject: [PATCH 29/37] lib/crypto: x86/blake2s: Avoid writing back unchanged 'f' value

Just before returning, blake2s_compress_ssse3() and
blake2s_compress_avx512() store updated values to the 'h', 't', and 'f'
fields of struct blake2s_ctx. But 'f' is always unchanged (which is
correct; only the C code changes it). So, there's no need to write to 'f'.
Use 64-bit stores (movq and vmovq) instead of 128-bit stores (movdqu and
vmovdqu) so that only 't' is written.

Reviewed-by: Ard Biesheuvel
Link: https://lore.kernel.org/r/20251102234209.62133-6-ebiggers@kernel.org
Signed-off-by: Eric Biggers
---
 lib/crypto/x86/blake2s-core.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/crypto/x86/blake2s-core.S b/lib/crypto/x86/blake2s-core.S
index f805a49c590d..869064f6ac16 100644
--- a/lib/crypto/x86/blake2s-core.S
+++ b/lib/crypto/x86/blake2s-core.S
@@ -193,7 +193,7 @@
 
 	movdqu	%xmm0,(CTX)	// Store new h[0..3]
 	movdqu	%xmm1,16(CTX)	// Store new h[4..7]
-	movdqu	%xmm14,32(CTX)	// Store new t and f
+	movq	%xmm14,32(CTX)	// Store new t (f is unchanged)
 	RET
 SYM_FUNC_END(blake2s_compress_ssse3)
 
@@ -287,7 +287,7 @@
 
 	vmovdqu	%xmm0,(CTX)	// Store new h[0..3]
 	vmovdqu	%xmm1,16(CTX)	// Store new h[4..7]
-	vmovdqu	%xmm4,32(CTX)	// Store new t and f
+	vmovq	%xmm4,32(CTX)	// Store new t (f is unchanged)
 	vzeroupper
 	RET
 SYM_FUNC_END(blake2s_compress_avx512)

From 8ba60c5914f25a44f10189c6919a737b199f6dbf Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Sun, 2 Nov 2025 15:42:09 -0800
Subject: [PATCH 30/37] lib/crypto: x86/blake2s: Use vpternlogd for 3-input XORs

AVX-512 supports 3-input XORs via the vpternlogd (or vpternlogq)
instruction with immediate 0x96. This approach, vs. the alternative of two
vpxor instructions, is already used in the CRC, AES-GCM, and AES-XTS code,
since it reduces the instruction count and is faster on some CPUs. Make
blake2s_compress_avx512() take advantage of it too.

Reviewed-by: Ard Biesheuvel
Link: https://lore.kernel.org/r/20251102234209.62133-7-ebiggers@kernel.org
Signed-off-by: Eric Biggers
---
 lib/crypto/x86/blake2s-core.S | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/lib/crypto/x86/blake2s-core.S b/lib/crypto/x86/blake2s-core.S
index 869064f6ac16..7b1d98ca7482 100644
--- a/lib/crypto/x86/blake2s-core.S
+++ b/lib/crypto/x86/blake2s-core.S
@@ -278,10 +278,8 @@ SYM_FUNC_START(blake2s_compress_avx512)
 	jne	.Lavx512_roundloop
 
 	// Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
-	vpxor	%xmm10,%xmm0,%xmm0
-	vpxor	%xmm11,%xmm1,%xmm1
-	vpxor	%xmm2,%xmm0,%xmm0
-	vpxor	%xmm3,%xmm1,%xmm1
+	vpternlogd	$0x96,%xmm10,%xmm2,%xmm0
+	vpternlogd	$0x96,%xmm11,%xmm3,%xmm1
 	decq	NBLOCKS
 	jne	.Lavx512_mainloop
 

From e1c360849794c2e638cff5486e4ee256568dd3b3 Mon Sep 17 00:00:00 2001
From: Eric Biggers
Date: Sun, 9 Nov 2025 15:47:16 -0800
Subject: [PATCH 31/37] crypto: polyval - Rename conflicting functions

Rename polyval_init() and polyval_update(), in preparation for adding
library functions with the same name to <crypto/polyval.h>. Note that
polyval-generic.c will be removed later, as it will be superseded by the
library. This commit just keeps the kernel building for the initial
introduction of the library.
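To illustrate the collision being avoided (a sketch only; the library
declarations themselves land in the next patch), the two sets of functions
would otherwise share names despite having unrelated prototypes:

	/* <crypto/polyval.h>: new library API */
	void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len);

	/* crypto/polyval-generic.c: shash callback, hence the rename */
	static int polyval_generic_update(struct shash_desc *desc,
					  const u8 *src, unsigned int srclen);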
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251109234726.638437-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- crypto/polyval-generic.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/crypto/polyval-generic.c b/crypto/polyval-generic.c index db8adb56e4ca..fe5b01a4000d 100644 --- a/crypto/polyval-generic.c +++ b/crypto/polyval-generic.c @@ -99,7 +99,7 @@ static int polyval_setkey(struct crypto_shash *tfm, return 0; } -static int polyval_init(struct shash_desc *desc) +static int polyval_generic_init(struct shash_desc *desc) { struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); @@ -108,8 +108,8 @@ static int polyval_init(struct shash_desc *desc) return 0; } -static int polyval_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) +static int polyval_generic_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) { struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); const struct polyval_tfm_ctx *ctx = crypto_shash_ctx(desc->tfm); @@ -135,7 +135,7 @@ static int polyval_finup(struct shash_desc *desc, const u8 *src, u8 tmp[POLYVAL_BLOCK_SIZE] = {}; memcpy(tmp, src, len); - polyval_update(desc, tmp, POLYVAL_BLOCK_SIZE); + polyval_generic_update(desc, tmp, POLYVAL_BLOCK_SIZE); } copy_and_reverse(dst, dctx->buffer); return 0; @@ -166,8 +166,8 @@ static void polyval_exit_tfm(struct crypto_shash *tfm) static struct shash_alg polyval_alg = { .digestsize = POLYVAL_DIGEST_SIZE, - .init = polyval_init, - .update = polyval_update, + .init = polyval_generic_init, + .update = polyval_generic_update, .finup = polyval_finup, .setkey = polyval_setkey, .export = polyval_export, From 3d176751e541362ff40c2478d6a2de41f8c62318 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 9 Nov 2025 15:47:17 -0800 Subject: [PATCH 32/37] lib/crypto: polyval: Add POLYVAL library Add support for POLYVAL to lib/crypto/. This will replace the polyval crypto_shash algorithm and its use in the hctr2 template, simplifying the code and reducing overhead. Specifically, this commit introduces the POLYVAL library API and a generic implementation of it. Later commits will migrate the existing architecture-optimized implementations of POLYVAL into lib/crypto/ and add a KUnit test suite. I've also rewritten the generic implementation completely, using a more modern approach instead of the traditional table-based approach. It's now constant-time, requires no precomputation or dynamic memory allocations, decreases the per-key memory usage from 4096 bytes to 16 bytes, and is faster than the old polyval-generic even on bulk data reusing the same key (at least on x86_64, where I measured 15% faster). We should do this for GHASH too, but for now just do it for POLYVAL. 
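As a usage sketch of the new API (mirroring the kernel-doc in the header
below; the wrapper function polyval_demo() is hypothetical, everything else
is as declared in <crypto/polyval.h>):

	/* Sketch: compute POLYVAL over a message with a raw 16-byte key H. */
	static void polyval_demo(const u8 raw_key[POLYVAL_BLOCK_SIZE],
				 const u8 *data, size_t len,
				 u8 out[POLYVAL_BLOCK_SIZE])
	{
		struct polyval_key key;
		struct polyval_ctx ctx;

		polyval_preparekey(&key, raw_key);

		/* One-shot: */
		polyval(&key, data, len, out);

		/* Equivalent incremental form; 'len' need not be block-aligned: */
		polyval_init(&ctx, &key);	/* ctx keeps a pointer to key */
		polyval_update(&ctx, data, len);
		polyval_final(&ctx, out);	/* also zeroizes ctx */
	}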
Reviewed-by: Ard Biesheuvel Tested-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251109234726.638437-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/polyval.h | 171 +++++++++++++++++++++- lib/crypto/Kconfig | 10 ++ lib/crypto/Makefile | 8 + lib/crypto/polyval.c | 307 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 493 insertions(+), 3 deletions(-) create mode 100644 lib/crypto/polyval.c diff --git a/include/crypto/polyval.h b/include/crypto/polyval.h index d2e63743e592..5ba4c248cad1 100644 --- a/include/crypto/polyval.h +++ b/include/crypto/polyval.h @@ -1,14 +1,179 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * Common values for the Polyval hash algorithm + * POLYVAL library API * - * Copyright 2021 Google LLC + * Copyright 2025 Google LLC */ #ifndef _CRYPTO_POLYVAL_H #define _CRYPTO_POLYVAL_H +#include +#include + #define POLYVAL_BLOCK_SIZE 16 #define POLYVAL_DIGEST_SIZE 16 +/** + * struct polyval_elem - An element of the POLYVAL finite field + * @bytes: View of the element as a byte array (unioned with @lo and @hi) + * @lo: The low 64 terms of the element's polynomial + * @hi: The high 64 terms of the element's polynomial + * + * This represents an element of the finite field GF(2^128), using the POLYVAL + * convention: little-endian byte order and natural bit order. + */ +struct polyval_elem { + union { + u8 bytes[POLYVAL_BLOCK_SIZE]; + struct { + __le64 lo; + __le64 hi; + }; + }; +}; + +/** + * struct polyval_key - Prepared key for POLYVAL + * + * This may contain just the raw key H, or it may contain precomputed key + * powers, depending on the platform's POLYVAL implementation. Use + * polyval_preparekey() to initialize this. + */ +struct polyval_key { +#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH +#error "Unhandled arch" +#else /* CONFIG_CRYPTO_LIB_POLYVAL_ARCH */ + /** @h: The hash key H */ + struct polyval_elem h; +#endif /* !CONFIG_CRYPTO_LIB_POLYVAL_ARCH */ +}; + +/** + * struct polyval_ctx - Context for computing a POLYVAL value + * @key: Pointer to the prepared POLYVAL key. The user of the API is + * responsible for ensuring that the key lives as long as the context. + * @acc: The accumulator + * @partial: Number of data bytes processed so far modulo POLYVAL_BLOCK_SIZE + */ +struct polyval_ctx { + const struct polyval_key *key; + struct polyval_elem acc; + size_t partial; +}; + +/** + * polyval_preparekey() - Prepare a POLYVAL key + * @key: (output) The key structure to initialize + * @raw_key: The raw hash key + * + * Initialize a POLYVAL key structure from a raw key. This may be a simple + * copy, or it may involve precomputing powers of the key, depending on the + * platform's POLYVAL implementation. + * + * Context: Any context. + */ +#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH +void polyval_preparekey(struct polyval_key *key, + const u8 raw_key[POLYVAL_BLOCK_SIZE]); + +#else +static inline void polyval_preparekey(struct polyval_key *key, + const u8 raw_key[POLYVAL_BLOCK_SIZE]) +{ + /* Just a simple copy, so inline it. */ + memcpy(key->h.bytes, raw_key, POLYVAL_BLOCK_SIZE); +} #endif + +/** + * polyval_init() - Initialize a POLYVAL context for a new message + * @ctx: The context to initialize + * @key: The key to use. Note that a pointer to the key is saved in the + * context, so the key must live at least as long as the context. 
+ */
+static inline void polyval_init(struct polyval_ctx *ctx,
+				const struct polyval_key *key)
+{
+	*ctx = (struct polyval_ctx){ .key = key };
+}
+
+/**
+ * polyval_import_blkaligned() - Import a POLYVAL accumulator value
+ * @ctx: The context to initialize
+ * @key: The key to import. Note that a pointer to the key is saved in the
+ *	context, so the key must live at least as long as the context.
+ * @acc: The accumulator value to import.
+ *
+ * This imports an accumulator that was saved by polyval_export_blkaligned().
+ * The same key must be used.
+ */
+static inline void
+polyval_import_blkaligned(struct polyval_ctx *ctx,
+			  const struct polyval_key *key,
+			  const struct polyval_elem *acc)
+{
+	*ctx = (struct polyval_ctx){ .key = key, .acc = *acc };
+}
+
+/**
+ * polyval_export_blkaligned() - Export a POLYVAL accumulator value
+ * @ctx: The context to export the accumulator value from
+ * @acc: (output) The exported accumulator value
+ *
+ * This exports the accumulator from a POLYVAL context. The number of data
+ * bytes processed so far must be a multiple of POLYVAL_BLOCK_SIZE.
+ */
+static inline void polyval_export_blkaligned(const struct polyval_ctx *ctx,
+					     struct polyval_elem *acc)
+{
+	*acc = ctx->acc;
+}
+
+/**
+ * polyval_update() - Update a POLYVAL context with message data
+ * @ctx: The context to update; must have been initialized
+ * @data: The message data
+ * @len: The data length in bytes. Doesn't need to be block-aligned.
+ *
+ * This can be called any number of times.
+ *
+ * Context: Any context.
+ */
+void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len);
+
+/**
+ * polyval_final() - Finish computing a POLYVAL value
+ * @ctx: The context to finalize
+ * @out: The output value
+ *
+ * If the total data length isn't a multiple of POLYVAL_BLOCK_SIZE, then the
+ * final block is automatically zero-padded.
+ *
+ * After finishing, this zeroizes @ctx. So the caller does not need to do it.
+ *
+ * Context: Any context.
+ */
+void polyval_final(struct polyval_ctx *ctx, u8 out[POLYVAL_BLOCK_SIZE]);
+
+/**
+ * polyval() - Compute a POLYVAL value
+ * @key: The prepared key
+ * @data: The message data
+ * @len: The data length in bytes. Doesn't need to be block-aligned.
+ * @out: The output value
+ *
+ * Context: Any context.
+ */
+static inline void polyval(const struct polyval_key *key,
+			   const u8 *data, size_t len,
+			   u8 out[POLYVAL_BLOCK_SIZE])
+{
+	struct polyval_ctx ctx;
+
+	polyval_init(&ctx, key);
+	polyval_update(&ctx, data, len);
+	polyval_final(&ctx, out);
+}
+
+#endif /* _CRYPTO_POLYVAL_H */
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 7445054fc0ad..6545f0e83b83 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -135,6 +135,16 @@ config CRYPTO_LIB_POLY1305_RSIZE
 	default 9 if ARM || ARM64
 	default 1
 
+config CRYPTO_LIB_POLYVAL
+	tristate
+	help
+	  The POLYVAL library functions. Select this if your module uses any of
+	  the functions from <crypto/polyval.h>.
+ +config CRYPTO_LIB_POLYVAL_ARCH + bool + depends on CRYPTO_LIB_POLYVAL && !UML + config CRYPTO_LIB_CHACHA20POLY1305 tristate select CRYPTO_LIB_CHACHA diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 5515e73bfd5e..055e44008805 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -198,6 +198,14 @@ clean-files += arm/poly1305-core.S \ ################################################################################ +obj-$(CONFIG_CRYPTO_LIB_POLYVAL) += libpolyval.o +libpolyval-y := polyval.o +ifeq ($(CONFIG_CRYPTO_LIB_POLYVAL_ARCH),y) +CFLAGS_polyval.o += -I$(src)/$(SRCARCH) +endif + +################################################################################ + obj-$(CONFIG_CRYPTO_LIB_SHA1) += libsha1.o libsha1-y := sha1.o ifeq ($(CONFIG_CRYPTO_LIB_SHA1_ARCH),y) diff --git a/lib/crypto/polyval.c b/lib/crypto/polyval.c new file mode 100644 index 000000000000..5796275f574a --- /dev/null +++ b/lib/crypto/polyval.c @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * POLYVAL library functions + * + * Copyright 2025 Google LLC + */ + +#include +#include +#include +#include +#include + +/* + * POLYVAL is an almost-XOR-universal hash function. Similar to GHASH, POLYVAL + * interprets the message as the coefficients of a polynomial in GF(2^128) and + * evaluates that polynomial at a secret point. POLYVAL has a simple + * mathematical relationship with GHASH, but it uses a better field convention + * which makes it easier and faster to implement. + * + * POLYVAL is not a cryptographic hash function, and it should be used only by + * algorithms that are specifically designed to use it. + * + * POLYVAL is specified by "AES-GCM-SIV: Nonce Misuse-Resistant Authenticated + * Encryption" (https://datatracker.ietf.org/doc/html/rfc8452) + * + * POLYVAL is also used by HCTR2. See "Length-preserving encryption with HCTR2" + * (https://eprint.iacr.org/2021/1441.pdf). + * + * This file provides a library API for POLYVAL. This API can delegate to + * either a generic implementation or an architecture-optimized implementation. + * + * For the generic implementation, we don't use the traditional table approach + * to GF(2^128) multiplication. That approach is not constant-time and requires + * a lot of memory. Instead, we use a different approach which emulates + * carryless multiplication using standard multiplications by spreading the data + * bits apart using "holes". This allows the carries to spill harmlessly. This + * approach is borrowed from BoringSSL, which in turn credits BearSSL's + * documentation (https://bearssl.org/constanttime.html#ghash-for-gcm) for the + * "holes" trick and a presentation by Shay Gueron + * (https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf) for the + * 256-bit => 128-bit reduction algorithm. + */ + +#ifdef CONFIG_ARCH_SUPPORTS_INT128 + +/* Do a 64 x 64 => 128 bit carryless multiplication. */ +static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi) +{ + /* + * With 64-bit multiplicands and one term every 4 bits, there would be + * up to 64 / 4 = 16 one bits per column when each multiplication is + * written out as a series of additions in the schoolbook manner. + * Unfortunately, that doesn't work since the value 16 is 1 too large to + * fit in 4 bits. Carries would sometimes overflow into the next term. + * + * Using one term every 5 bits would work. However, that would cost + * 5 x 5 = 25 multiplications instead of 4 x 4 = 16. 
+ * + * Instead, mask off 4 bits from one multiplicand, giving a max of 15 + * one bits per column. Then handle those 4 bits separately. + */ + u64 a0 = a & 0x1111111111111110; + u64 a1 = a & 0x2222222222222220; + u64 a2 = a & 0x4444444444444440; + u64 a3 = a & 0x8888888888888880; + + u64 b0 = b & 0x1111111111111111; + u64 b1 = b & 0x2222222222222222; + u64 b2 = b & 0x4444444444444444; + u64 b3 = b & 0x8888888888888888; + + /* Multiply the high 60 bits of @a by @b. */ + u128 c0 = (a0 * (u128)b0) ^ (a1 * (u128)b3) ^ + (a2 * (u128)b2) ^ (a3 * (u128)b1); + u128 c1 = (a0 * (u128)b1) ^ (a1 * (u128)b0) ^ + (a2 * (u128)b3) ^ (a3 * (u128)b2); + u128 c2 = (a0 * (u128)b2) ^ (a1 * (u128)b1) ^ + (a2 * (u128)b0) ^ (a3 * (u128)b3); + u128 c3 = (a0 * (u128)b3) ^ (a1 * (u128)b2) ^ + (a2 * (u128)b1) ^ (a3 * (u128)b0); + + /* Multiply the low 4 bits of @a by @b. */ + u64 e0 = -(a & 1) & b; + u64 e1 = -((a >> 1) & 1) & b; + u64 e2 = -((a >> 2) & 1) & b; + u64 e3 = -((a >> 3) & 1) & b; + u64 extra_lo = e0 ^ (e1 << 1) ^ (e2 << 2) ^ (e3 << 3); + u64 extra_hi = (e1 >> 63) ^ (e2 >> 62) ^ (e3 >> 61); + + /* Add all the intermediate products together. */ + *out_lo = (((u64)c0) & 0x1111111111111111) ^ + (((u64)c1) & 0x2222222222222222) ^ + (((u64)c2) & 0x4444444444444444) ^ + (((u64)c3) & 0x8888888888888888) ^ extra_lo; + *out_hi = (((u64)(c0 >> 64)) & 0x1111111111111111) ^ + (((u64)(c1 >> 64)) & 0x2222222222222222) ^ + (((u64)(c2 >> 64)) & 0x4444444444444444) ^ + (((u64)(c3 >> 64)) & 0x8888888888888888) ^ extra_hi; +} + +#else /* CONFIG_ARCH_SUPPORTS_INT128 */ + +/* Do a 32 x 32 => 64 bit carryless multiplication. */ +static u64 clmul32(u32 a, u32 b) +{ + /* + * With 32-bit multiplicands and one term every 4 bits, there are up to + * 32 / 4 = 8 one bits per column when each multiplication is written + * out as a series of additions in the schoolbook manner. The value 8 + * fits in 4 bits, so the carries don't overflow into the next term. + */ + u32 a0 = a & 0x11111111; + u32 a1 = a & 0x22222222; + u32 a2 = a & 0x44444444; + u32 a3 = a & 0x88888888; + + u32 b0 = b & 0x11111111; + u32 b1 = b & 0x22222222; + u32 b2 = b & 0x44444444; + u32 b3 = b & 0x88888888; + + u64 c0 = (a0 * (u64)b0) ^ (a1 * (u64)b3) ^ + (a2 * (u64)b2) ^ (a3 * (u64)b1); + u64 c1 = (a0 * (u64)b1) ^ (a1 * (u64)b0) ^ + (a2 * (u64)b3) ^ (a3 * (u64)b2); + u64 c2 = (a0 * (u64)b2) ^ (a1 * (u64)b1) ^ + (a2 * (u64)b0) ^ (a3 * (u64)b3); + u64 c3 = (a0 * (u64)b3) ^ (a1 * (u64)b2) ^ + (a2 * (u64)b1) ^ (a3 * (u64)b0); + + /* Add all the intermediate products together. */ + return (c0 & 0x1111111111111111) ^ + (c1 & 0x2222222222222222) ^ + (c2 & 0x4444444444444444) ^ + (c3 & 0x8888888888888888); +} + +/* Do a 64 x 64 => 128 bit carryless multiplication. */ +static void clmul64(u64 a, u64 b, u64 *out_lo, u64 *out_hi) +{ + u32 a_lo = (u32)a; + u32 a_hi = a >> 32; + u32 b_lo = (u32)b; + u32 b_hi = b >> 32; + + /* Karatsuba multiplication */ + u64 lo = clmul32(a_lo, b_lo); + u64 hi = clmul32(a_hi, b_hi); + u64 mi = clmul32(a_lo ^ a_hi, b_lo ^ b_hi) ^ lo ^ hi; + + *out_lo = lo ^ (mi << 32); + *out_hi = hi ^ (mi >> 32); +} +#endif /* !CONFIG_ARCH_SUPPORTS_INT128 */ + +/* Compute @a = @a * @b * x^-128 in the POLYVAL field. */ +static void __maybe_unused +polyval_mul_generic(struct polyval_elem *a, const struct polyval_elem *b) +{ + u64 c0, c1, c2, c3, mi0, mi1; + + /* + * Carryless-multiply @a by @b using Karatsuba multiplication. Store + * the 256-bit product in @c0 (low) through @c3 (high). 
+ */ + clmul64(le64_to_cpu(a->lo), le64_to_cpu(b->lo), &c0, &c1); + clmul64(le64_to_cpu(a->hi), le64_to_cpu(b->hi), &c2, &c3); + clmul64(le64_to_cpu(a->lo ^ a->hi), le64_to_cpu(b->lo ^ b->hi), + &mi0, &mi1); + mi0 ^= c0 ^ c2; + mi1 ^= c1 ^ c3; + c1 ^= mi0; + c2 ^= mi1; + + /* + * Cancel out the low 128 bits of the product by adding multiples of + * G(x) = x^128 + x^127 + x^126 + x^121 + 1. Do this in two steps, each + * of which cancels out 64 bits. Note that we break G(x) into three + * parts: 1, x^64 * (x^63 + x^62 + x^57), and x^128 * 1. + */ + + /* + * First, add G(x) times c0 as follows: + * + * (c0, c1, c2) = (0, + * c1 + (c0 * (x^63 + x^62 + x^57) mod x^64), + * c2 + c0 + floor((c0 * (x^63 + x^62 + x^57)) / x^64)) + */ + c1 ^= (c0 << 63) ^ (c0 << 62) ^ (c0 << 57); + c2 ^= c0 ^ (c0 >> 1) ^ (c0 >> 2) ^ (c0 >> 7); + + /* + * Second, add G(x) times the new c1: + * + * (c1, c2, c3) = (0, + * c2 + (c1 * (x^63 + x^62 + x^57) mod x^64), + * c3 + c1 + floor((c1 * (x^63 + x^62 + x^57)) / x^64)) + */ + c2 ^= (c1 << 63) ^ (c1 << 62) ^ (c1 << 57); + c3 ^= c1 ^ (c1 >> 1) ^ (c1 >> 2) ^ (c1 >> 7); + + /* Return (c2, c3). This implicitly multiplies by x^-128. */ + a->lo = cpu_to_le64(c2); + a->hi = cpu_to_le64(c3); +} + +static void __maybe_unused +polyval_blocks_generic(struct polyval_elem *acc, const struct polyval_elem *key, + const u8 *data, size_t nblocks) +{ + do { + acc->lo ^= get_unaligned((__le64 *)data); + acc->hi ^= get_unaligned((__le64 *)(data + 8)); + polyval_mul_generic(acc, key); + data += POLYVAL_BLOCK_SIZE; + } while (--nblocks); +} + +/* Include the arch-optimized implementation of POLYVAL, if one is available. */ +#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH +#include "polyval.h" /* $(SRCARCH)/polyval.h */ +void polyval_preparekey(struct polyval_key *key, + const u8 raw_key[POLYVAL_BLOCK_SIZE]) +{ + polyval_preparekey_arch(key, raw_key); +} +EXPORT_SYMBOL_GPL(polyval_preparekey); +#endif /* Else, polyval_preparekey() is an inline function. */ + +/* + * polyval_mul_generic() and polyval_blocks_generic() take the key as a + * polyval_elem rather than a polyval_key, so that arch-optimized + * implementations with a different key format can use it as a fallback (if they + * have H^1 stored somewhere in their struct). Thus, the following dispatch + * code is needed to pass the appropriate key argument. 
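+ *
+ * (The generic fallback needs only H^1: it multiplies the accumulator by
+ * the same key element once per block, regardless of how many key powers
+ * the arch-optimized implementation precomputes.)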
+ */ + +static void polyval_mul(struct polyval_ctx *ctx) +{ +#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH + polyval_mul_arch(&ctx->acc, ctx->key); +#else + polyval_mul_generic(&ctx->acc, &ctx->key->h); +#endif +} + +static void polyval_blocks(struct polyval_ctx *ctx, + const u8 *data, size_t nblocks) +{ +#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH + polyval_blocks_arch(&ctx->acc, ctx->key, data, nblocks); +#else + polyval_blocks_generic(&ctx->acc, &ctx->key->h, data, nblocks); +#endif +} + +void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len) +{ + if (unlikely(ctx->partial)) { + size_t n = min(len, POLYVAL_BLOCK_SIZE - ctx->partial); + + len -= n; + while (n--) + ctx->acc.bytes[ctx->partial++] ^= *data++; + if (ctx->partial < POLYVAL_BLOCK_SIZE) + return; + polyval_mul(ctx); + } + if (len >= POLYVAL_BLOCK_SIZE) { + size_t nblocks = len / POLYVAL_BLOCK_SIZE; + + polyval_blocks(ctx, data, nblocks); + data += len & ~(POLYVAL_BLOCK_SIZE - 1); + len &= POLYVAL_BLOCK_SIZE - 1; + } + for (size_t i = 0; i < len; i++) + ctx->acc.bytes[i] ^= data[i]; + ctx->partial = len; +} +EXPORT_SYMBOL_GPL(polyval_update); + +void polyval_final(struct polyval_ctx *ctx, u8 out[POLYVAL_BLOCK_SIZE]) +{ + if (unlikely(ctx->partial)) + polyval_mul(ctx); + memcpy(out, &ctx->acc, POLYVAL_BLOCK_SIZE); + memzero_explicit(ctx, sizeof(*ctx)); +} +EXPORT_SYMBOL_GPL(polyval_final); + +#ifdef polyval_mod_init_arch +static int __init polyval_mod_init(void) +{ + polyval_mod_init_arch(); + return 0; +} +subsys_initcall(polyval_mod_init); + +static void __exit polyval_mod_exit(void) +{ +} +module_exit(polyval_mod_exit); +#endif + +MODULE_DESCRIPTION("POLYVAL almost-XOR-universal hash function"); +MODULE_LICENSE("GPL"); From 37919e239ebb2cba573cca56292f7c39fa6d7415 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 9 Nov 2025 15:47:19 -0800 Subject: [PATCH 33/37] lib/crypto: arm64/polyval: Migrate optimized code into library Migrate the arm64 implementation of POLYVAL into lib/crypto/, wiring it up to the POLYVAL library interface. This makes the POLYVAL library be properly optimized on arm64. This drops the arm64 optimizations of polyval in the crypto_shash API. That's fine, since polyval will be removed from crypto_shash entirely since it is unneeded there. But even if it comes back, the crypto_shash API could just be implemented on top of the library API, as usual. Adjust the names and prototypes of the assembly functions to align more closely with the rest of the library code. 
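
For context, a minimal sketch of the resulting library usage, using only
calls that appear in this series (raw_key, data, and len are
placeholders; polyval_init() is the same helper the HCTR2 conversion
later in the series uses):

	struct polyval_key key;
	struct polyval_ctx ctx;
	u8 digest[POLYVAL_DIGEST_SIZE];

	polyval_preparekey(&key, raw_key);	/* 16-byte raw key H */
	polyval_init(&ctx, &key);
	polyval_update(&ctx, data, len);	/* any length; partial
						   blocks are buffered */
	polyval_final(&ctx, digest);		/* also wipes the ctx */
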
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251109234726.638437-5-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm64/crypto/Kconfig | 10 -- arch/arm64/crypto/Makefile | 3 - arch/arm64/crypto/polyval-ce-glue.c | 158 ------------------ include/crypto/polyval.h | 8 + lib/crypto/Kconfig | 1 + lib/crypto/Makefile | 1 + .../crypto/arm64}/polyval-ce-core.S | 38 ++--- lib/crypto/arm64/polyval.h | 82 +++++++++ 8 files changed, 110 insertions(+), 191 deletions(-) delete mode 100644 arch/arm64/crypto/polyval-ce-glue.c rename {arch/arm64/crypto => lib/crypto/arm64}/polyval-ce-core.S (92%) create mode 100644 lib/crypto/arm64/polyval.h diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig index 376d6b50743f..bdd276a6e540 100644 --- a/arch/arm64/crypto/Kconfig +++ b/arch/arm64/crypto/Kconfig @@ -47,16 +47,6 @@ config CRYPTO_SM3_ARM64_CE Architecture: arm64 using: - ARMv8.2 Crypto Extensions -config CRYPTO_POLYVAL_ARM64_CE - tristate "Hash functions: POLYVAL (ARMv8 Crypto Extensions)" - depends on KERNEL_MODE_NEON - select CRYPTO_POLYVAL - help - POLYVAL hash function for HCTR2 - - Architecture: arm64 using: - - ARMv8 Crypto Extensions - config CRYPTO_AES_ARM64 tristate "Ciphers: AES, modes: ECB, CBC, CTR, CTS, XCTR, XTS" select CRYPTO_AES diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile index fd3d590fa113..1e330aa08d3f 100644 --- a/arch/arm64/crypto/Makefile +++ b/arch/arm64/crypto/Makefile @@ -29,9 +29,6 @@ sm4-neon-y := sm4-neon-glue.o sm4-neon-core.o obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o -obj-$(CONFIG_CRYPTO_POLYVAL_ARM64_CE) += polyval-ce.o -polyval-ce-y := polyval-ce-glue.o polyval-ce-core.o - obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o diff --git a/arch/arm64/crypto/polyval-ce-glue.c b/arch/arm64/crypto/polyval-ce-glue.c deleted file mode 100644 index c4e653688ea0..000000000000 --- a/arch/arm64/crypto/polyval-ce-glue.c +++ /dev/null @@ -1,158 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Glue code for POLYVAL using ARMv8 Crypto Extensions - * - * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen - * Copyright (c) 2009 Intel Corp. - * Author: Huang Ying - * Copyright 2021 Google LLC - */ - -/* - * Glue code based on ghash-clmulni-intel_glue.c. - * - * This implementation of POLYVAL uses montgomery multiplication accelerated by - * ARMv8 Crypto Extensions instructions to implement the finite field operations. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define NUM_KEY_POWERS 8 - -struct polyval_tfm_ctx { - /* - * These powers must be in the order h^8, ..., h^1. 
- */ - u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE]; -}; - -struct polyval_desc_ctx { - u8 buffer[POLYVAL_BLOCK_SIZE]; -}; - -asmlinkage void pmull_polyval_update(const struct polyval_tfm_ctx *keys, - const u8 *in, size_t nblocks, u8 *accumulator); -asmlinkage void pmull_polyval_mul(u8 *op1, const u8 *op2); - -static void internal_polyval_update(const struct polyval_tfm_ctx *keys, - const u8 *in, size_t nblocks, u8 *accumulator) -{ - kernel_neon_begin(); - pmull_polyval_update(keys, in, nblocks, accumulator); - kernel_neon_end(); -} - -static void internal_polyval_mul(u8 *op1, const u8 *op2) -{ - kernel_neon_begin(); - pmull_polyval_mul(op1, op2); - kernel_neon_end(); -} - -static int polyval_arm64_setkey(struct crypto_shash *tfm, - const u8 *key, unsigned int keylen) -{ - struct polyval_tfm_ctx *tctx = crypto_shash_ctx(tfm); - int i; - - if (keylen != POLYVAL_BLOCK_SIZE) - return -EINVAL; - - memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE); - - for (i = NUM_KEY_POWERS-2; i >= 0; i--) { - memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE); - internal_polyval_mul(tctx->key_powers[i], - tctx->key_powers[i+1]); - } - - return 0; -} - -static int polyval_arm64_init(struct shash_desc *desc) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - - memset(dctx, 0, sizeof(*dctx)); - - return 0; -} - -static int polyval_arm64_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); - unsigned int nblocks; - - do { - /* allow rescheduling every 4K bytes */ - nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE; - internal_polyval_update(tctx, src, nblocks, dctx->buffer); - srclen -= nblocks * POLYVAL_BLOCK_SIZE; - src += nblocks * POLYVAL_BLOCK_SIZE; - } while (srclen >= POLYVAL_BLOCK_SIZE); - - return srclen; -} - -static int polyval_arm64_finup(struct shash_desc *desc, const u8 *src, - unsigned int len, u8 *dst) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - const struct polyval_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); - - if (len) { - crypto_xor(dctx->buffer, src, len); - internal_polyval_mul(dctx->buffer, - tctx->key_powers[NUM_KEY_POWERS-1]); - } - - memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE); - - return 0; -} - -static struct shash_alg polyval_alg = { - .digestsize = POLYVAL_DIGEST_SIZE, - .init = polyval_arm64_init, - .update = polyval_arm64_update, - .finup = polyval_arm64_finup, - .setkey = polyval_arm64_setkey, - .descsize = sizeof(struct polyval_desc_ctx), - .base = { - .cra_name = "polyval", - .cra_driver_name = "polyval-ce", - .cra_priority = 200, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = POLYVAL_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct polyval_tfm_ctx), - .cra_module = THIS_MODULE, - }, -}; - -static int __init polyval_ce_mod_init(void) -{ - return crypto_register_shash(&polyval_alg); -} - -static void __exit polyval_ce_mod_exit(void) -{ - crypto_unregister_shash(&polyval_alg); -} - -module_cpu_feature_match(PMULL, polyval_ce_mod_init) -module_exit(polyval_ce_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("POLYVAL hash function accelerated by ARMv8 Crypto Extensions"); -MODULE_ALIAS_CRYPTO("polyval"); -MODULE_ALIAS_CRYPTO("polyval-ce"); diff --git a/include/crypto/polyval.h b/include/crypto/polyval.h index 5ba4c248cad1..f8aaf4275fbd 100644 --- a/include/crypto/polyval.h +++ b/include/crypto/polyval.h @@ -39,10 +39,18 @@ struct polyval_elem { * This may contain just 
the raw key H, or it may contain precomputed key * powers, depending on the platform's POLYVAL implementation. Use * polyval_preparekey() to initialize this. + * + * By H^i we mean H^(i-1) * H * x^-128, with base case H^1 = H. I.e. the + * exponentiation repeats the POLYVAL dot operation, with its "extra" x^-128. */ struct polyval_key { #ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH +#ifdef CONFIG_ARM64 + /** @h_powers: Powers of the hash key H^8 through H^1 */ + struct polyval_elem h_powers[8]; +#else #error "Unhandled arch" +#endif #else /* CONFIG_CRYPTO_LIB_POLYVAL_ARCH */ /** @h: The hash key H */ struct polyval_elem h; diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 6545f0e83b83..430723994142 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -144,6 +144,7 @@ config CRYPTO_LIB_POLYVAL config CRYPTO_LIB_POLYVAL_ARCH bool depends on CRYPTO_LIB_POLYVAL && !UML + default y if ARM64 && KERNEL_MODE_NEON config CRYPTO_LIB_CHACHA20POLY1305 tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 055e44008805..2efa96afcb4b 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -202,6 +202,7 @@ obj-$(CONFIG_CRYPTO_LIB_POLYVAL) += libpolyval.o libpolyval-y := polyval.o ifeq ($(CONFIG_CRYPTO_LIB_POLYVAL_ARCH),y) CFLAGS_polyval.o += -I$(src)/$(SRCARCH) +libpolyval-$(CONFIG_ARM64) += arm64/polyval-ce-core.o endif ################################################################################ diff --git a/arch/arm64/crypto/polyval-ce-core.S b/lib/crypto/arm64/polyval-ce-core.S similarity index 92% rename from arch/arm64/crypto/polyval-ce-core.S rename to lib/crypto/arm64/polyval-ce-core.S index b5326540d2e3..7c731a044d02 100644 --- a/arch/arm64/crypto/polyval-ce-core.S +++ b/lib/crypto/arm64/polyval-ce-core.S @@ -27,10 +27,10 @@ #include #define STRIDE_BLOCKS 8 -KEY_POWERS .req x0 -MSG .req x1 -BLOCKS_LEFT .req x2 -ACCUMULATOR .req x3 +ACCUMULATOR .req x0 +KEY_POWERS .req x1 +MSG .req x2 +BLOCKS_LEFT .req x3 KEY_START .req x10 EXTRA_BYTES .req x11 TMP .req x13 @@ -300,15 +300,12 @@ GSTAR .req v24 .endm /* - * Perform montgomery multiplication in GF(2^128) and store result in op1. + * Computes a = a * b * x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1. * - * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1 - * If op1, op2 are in montgomery form, this computes the montgomery - * form of op1*op2. - * - * void pmull_polyval_mul(u8 *op1, const u8 *op2); + * void polyval_mul_pmull(struct polyval_elem *a, + * const struct polyval_elem *b); */ -SYM_FUNC_START(pmull_polyval_mul) +SYM_FUNC_START(polyval_mul_pmull) adr TMP, .Lgstar ld1 {GSTAR.2d}, [TMP] ld1 {v0.16b}, [x0] @@ -318,22 +315,23 @@ SYM_FUNC_START(pmull_polyval_mul) montgomery_reduction SUM st1 {SUM.16b}, [x0] ret -SYM_FUNC_END(pmull_polyval_mul) +SYM_FUNC_END(polyval_mul_pmull) /* * Perform polynomial evaluation as specified by POLYVAL. This computes: * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1} * where n=nblocks, h is the hash key, and m_i are the message blocks. * - * x0 - pointer to precomputed key powers h^8 ... h^1 - * x1 - pointer to message blocks - * x2 - number of blocks to hash - * x3 - pointer to accumulator + * x0 - pointer to accumulator + * x1 - pointer to precomputed key powers h^8 ... 
h^1 + * x2 - pointer to message blocks + * x3 - number of blocks to hash * - * void pmull_polyval_update(const struct polyval_ctx *ctx, const u8 *in, - * size_t nblocks, u8 *accumulator); + * void polyval_blocks_pmull(struct polyval_elem *acc, + * const struct polyval_key *key, + * const u8 *data, size_t nblocks); */ -SYM_FUNC_START(pmull_polyval_update) +SYM_FUNC_START(polyval_blocks_pmull) adr TMP, .Lgstar mov KEY_START, KEY_POWERS ld1 {GSTAR.2d}, [TMP] @@ -358,4 +356,4 @@ SYM_FUNC_START(pmull_polyval_update) .LskipPartial: st1 {SUM.16b}, [ACCUMULATOR] ret -SYM_FUNC_END(pmull_polyval_update) +SYM_FUNC_END(polyval_blocks_pmull) diff --git a/lib/crypto/arm64/polyval.h b/lib/crypto/arm64/polyval.h new file mode 100644 index 000000000000..2486e80750d0 --- /dev/null +++ b/lib/crypto/arm64/polyval.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * POLYVAL library functions, arm64 optimized + * + * Copyright 2025 Google LLC + */ +#include +#include +#include + +#define NUM_H_POWERS 8 + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pmull); + +asmlinkage void polyval_mul_pmull(struct polyval_elem *a, + const struct polyval_elem *b); +asmlinkage void polyval_blocks_pmull(struct polyval_elem *acc, + const struct polyval_key *key, + const u8 *data, size_t nblocks); + +static void polyval_preparekey_arch(struct polyval_key *key, + const u8 raw_key[POLYVAL_BLOCK_SIZE]) +{ + static_assert(ARRAY_SIZE(key->h_powers) == NUM_H_POWERS); + memcpy(&key->h_powers[NUM_H_POWERS - 1], raw_key, POLYVAL_BLOCK_SIZE); + if (static_branch_likely(&have_pmull) && may_use_simd()) { + kernel_neon_begin(); + for (int i = NUM_H_POWERS - 2; i >= 0; i--) { + key->h_powers[i] = key->h_powers[i + 1]; + polyval_mul_pmull(&key->h_powers[i], + &key->h_powers[NUM_H_POWERS - 1]); + } + kernel_neon_end(); + } else { + for (int i = NUM_H_POWERS - 2; i >= 0; i--) { + key->h_powers[i] = key->h_powers[i + 1]; + polyval_mul_generic(&key->h_powers[i], + &key->h_powers[NUM_H_POWERS - 1]); + } + } +} + +static void polyval_mul_arch(struct polyval_elem *acc, + const struct polyval_key *key) +{ + if (static_branch_likely(&have_pmull) && may_use_simd()) { + kernel_neon_begin(); + polyval_mul_pmull(acc, &key->h_powers[NUM_H_POWERS - 1]); + kernel_neon_end(); + } else { + polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]); + } +} + +static void polyval_blocks_arch(struct polyval_elem *acc, + const struct polyval_key *key, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_pmull) && may_use_simd()) { + do { + /* Allow rescheduling every 4 KiB. */ + size_t n = min_t(size_t, nblocks, + 4096 / POLYVAL_BLOCK_SIZE); + + kernel_neon_begin(); + polyval_blocks_pmull(acc, key, data, n); + kernel_neon_end(); + data += n * POLYVAL_BLOCK_SIZE; + nblocks -= n; + } while (nblocks); + } else { + polyval_blocks_generic(acc, &key->h_powers[NUM_H_POWERS - 1], + data, nblocks); + } +} + +#define polyval_mod_init_arch polyval_mod_init_arch +static void polyval_mod_init_arch(void) +{ + if (cpu_have_named_feature(PMULL)) + static_branch_enable(&have_pmull); +} From 4d8da35579daad0392d238460ed7e9629d49ca35 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 9 Nov 2025 15:47:20 -0800 Subject: [PATCH 34/37] lib/crypto: x86/polyval: Migrate optimized code into library Migrate the x86_64 implementation of POLYVAL into lib/crypto/, wiring it up to the POLYVAL library interface. This makes the POLYVAL library be properly optimized on x86_64. This drops the x86_64 optimizations of polyval in the crypto_shash API. 
That's fine, since polyval will be removed from crypto_shash entirely since it is unneeded there. But even if it comes back, the crypto_shash API could just be implemented on top of the library API, as usual. Adjust the names and prototypes of the assembly functions to align more closely with the rest of the library code. Also replace a movaps instruction with movups to remove the assumption that the key struct is 16-byte aligned. Users can still align the key if they want (and at least in this case, movups is just as fast as movaps), but it's inconvenient to require it. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251109234726.638437-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/x86/crypto/Kconfig | 10 - arch/x86/crypto/Makefile | 3 - arch/x86/crypto/polyval-clmulni_glue.c | 180 ------------------ include/crypto/polyval.h | 3 + lib/crypto/Kconfig | 1 + lib/crypto/Makefile | 1 + .../crypto/x86/polyval-pclmul-avx.S | 40 ++-- lib/crypto/x86/polyval.h | 83 ++++++++ 8 files changed, 107 insertions(+), 214 deletions(-) delete mode 100644 arch/x86/crypto/polyval-clmulni_glue.c rename arch/x86/crypto/polyval-clmulni_asm.S => lib/crypto/x86/polyval-pclmul-avx.S (91%) create mode 100644 lib/crypto/x86/polyval.h diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig index 48d3076b6053..3fd2423d3cf8 100644 --- a/arch/x86/crypto/Kconfig +++ b/arch/x86/crypto/Kconfig @@ -353,16 +353,6 @@ config CRYPTO_NHPOLY1305_AVX2 Architecture: x86_64 using: - AVX2 (Advanced Vector Extensions 2) -config CRYPTO_POLYVAL_CLMUL_NI - tristate "Hash functions: POLYVAL (CLMUL-NI)" - depends on 64BIT - select CRYPTO_POLYVAL - help - POLYVAL hash function for HCTR2 - - Architecture: x86_64 using: - - CLMUL-NI (carry-less multiplication new instructions) - config CRYPTO_SM3_AVX_X86_64 tristate "Hash functions: SM3 (AVX)" depends on 64BIT diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 2d30d5d36145..4a24dd38da50 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -52,9 +52,6 @@ aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \ obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o -obj-$(CONFIG_CRYPTO_POLYVAL_CLMUL_NI) += polyval-clmulni.o -polyval-clmulni-y := polyval-clmulni_asm.o polyval-clmulni_glue.o - obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o diff --git a/arch/x86/crypto/polyval-clmulni_glue.c b/arch/x86/crypto/polyval-clmulni_glue.c deleted file mode 100644 index 6b466867f91a..000000000000 --- a/arch/x86/crypto/polyval-clmulni_glue.c +++ /dev/null @@ -1,180 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Glue code for POLYVAL using PCMULQDQ-NI - * - * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen - * Copyright (c) 2009 Intel Corp. - * Author: Huang Ying - * Copyright 2021 Google LLC - */ - -/* - * Glue code based on ghash-clmulni-intel_glue.c. - * - * This implementation of POLYVAL uses montgomery multiplication - * accelerated by PCLMULQDQ-NI to implement the finite field - * operations. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define POLYVAL_ALIGN 16 -#define POLYVAL_ALIGN_ATTR __aligned(POLYVAL_ALIGN) -#define POLYVAL_ALIGN_EXTRA ((POLYVAL_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1)) -#define POLYVAL_CTX_SIZE (sizeof(struct polyval_tfm_ctx) + POLYVAL_ALIGN_EXTRA) -#define NUM_KEY_POWERS 8 - -struct polyval_tfm_ctx { - /* - * These powers must be in the order h^8, ..., h^1. - */ - u8 key_powers[NUM_KEY_POWERS][POLYVAL_BLOCK_SIZE] POLYVAL_ALIGN_ATTR; -}; - -struct polyval_desc_ctx { - u8 buffer[POLYVAL_BLOCK_SIZE]; -}; - -asmlinkage void clmul_polyval_update(const struct polyval_tfm_ctx *keys, - const u8 *in, size_t nblocks, u8 *accumulator); -asmlinkage void clmul_polyval_mul(u8 *op1, const u8 *op2); - -static inline struct polyval_tfm_ctx *polyval_tfm_ctx(struct crypto_shash *tfm) -{ - return PTR_ALIGN(crypto_shash_ctx(tfm), POLYVAL_ALIGN); -} - -static void internal_polyval_update(const struct polyval_tfm_ctx *keys, - const u8 *in, size_t nblocks, u8 *accumulator) -{ - kernel_fpu_begin(); - clmul_polyval_update(keys, in, nblocks, accumulator); - kernel_fpu_end(); -} - -static void internal_polyval_mul(u8 *op1, const u8 *op2) -{ - kernel_fpu_begin(); - clmul_polyval_mul(op1, op2); - kernel_fpu_end(); -} - -static int polyval_x86_setkey(struct crypto_shash *tfm, - const u8 *key, unsigned int keylen) -{ - struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(tfm); - int i; - - if (keylen != POLYVAL_BLOCK_SIZE) - return -EINVAL; - - memcpy(tctx->key_powers[NUM_KEY_POWERS-1], key, POLYVAL_BLOCK_SIZE); - - for (i = NUM_KEY_POWERS-2; i >= 0; i--) { - memcpy(tctx->key_powers[i], key, POLYVAL_BLOCK_SIZE); - internal_polyval_mul(tctx->key_powers[i], - tctx->key_powers[i+1]); - } - - return 0; -} - -static int polyval_x86_init(struct shash_desc *desc) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - - memset(dctx, 0, sizeof(*dctx)); - - return 0; -} - -static int polyval_x86_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm); - unsigned int nblocks; - - do { - /* Allow rescheduling every 4K bytes. 
*/ - nblocks = min(srclen, 4096U) / POLYVAL_BLOCK_SIZE; - internal_polyval_update(tctx, src, nblocks, dctx->buffer); - srclen -= nblocks * POLYVAL_BLOCK_SIZE; - src += nblocks * POLYVAL_BLOCK_SIZE; - } while (srclen >= POLYVAL_BLOCK_SIZE); - - return srclen; -} - -static int polyval_x86_finup(struct shash_desc *desc, const u8 *src, - unsigned int len, u8 *dst) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - const struct polyval_tfm_ctx *tctx = polyval_tfm_ctx(desc->tfm); - - if (len) { - crypto_xor(dctx->buffer, src, len); - internal_polyval_mul(dctx->buffer, - tctx->key_powers[NUM_KEY_POWERS-1]); - } - - memcpy(dst, dctx->buffer, POLYVAL_BLOCK_SIZE); - - return 0; -} - -static struct shash_alg polyval_alg = { - .digestsize = POLYVAL_DIGEST_SIZE, - .init = polyval_x86_init, - .update = polyval_x86_update, - .finup = polyval_x86_finup, - .setkey = polyval_x86_setkey, - .descsize = sizeof(struct polyval_desc_ctx), - .base = { - .cra_name = "polyval", - .cra_driver_name = "polyval-clmulni", - .cra_priority = 200, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = POLYVAL_BLOCK_SIZE, - .cra_ctxsize = POLYVAL_CTX_SIZE, - .cra_module = THIS_MODULE, - }, -}; - -__maybe_unused static const struct x86_cpu_id pcmul_cpu_id[] = { - X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL), - {} -}; -MODULE_DEVICE_TABLE(x86cpu, pcmul_cpu_id); - -static int __init polyval_clmulni_mod_init(void) -{ - if (!x86_match_cpu(pcmul_cpu_id)) - return -ENODEV; - - if (!boot_cpu_has(X86_FEATURE_AVX)) - return -ENODEV; - - return crypto_register_shash(&polyval_alg); -} - -static void __exit polyval_clmulni_mod_exit(void) -{ - crypto_unregister_shash(&polyval_alg); -} - -module_init(polyval_clmulni_mod_init); -module_exit(polyval_clmulni_mod_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("POLYVAL hash function accelerated by PCLMULQDQ-NI"); -MODULE_ALIAS_CRYPTO("polyval"); -MODULE_ALIAS_CRYPTO("polyval-clmulni"); diff --git a/include/crypto/polyval.h b/include/crypto/polyval.h index f8aaf4275fbd..b28b8ef11353 100644 --- a/include/crypto/polyval.h +++ b/include/crypto/polyval.h @@ -48,6 +48,9 @@ struct polyval_key { #ifdef CONFIG_ARM64 /** @h_powers: Powers of the hash key H^8 through H^1 */ struct polyval_elem h_powers[8]; +#elif defined(CONFIG_X86) + /** @h_powers: Powers of the hash key H^8 through H^1 */ + struct polyval_elem h_powers[8]; #else #error "Unhandled arch" #endif diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig index 430723994142..9d04b3771ce2 100644 --- a/lib/crypto/Kconfig +++ b/lib/crypto/Kconfig @@ -145,6 +145,7 @@ config CRYPTO_LIB_POLYVAL_ARCH bool depends on CRYPTO_LIB_POLYVAL && !UML default y if ARM64 && KERNEL_MODE_NEON + default y if X86_64 config CRYPTO_LIB_CHACHA20POLY1305 tristate diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 2efa96afcb4b..6580991f8e12 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -203,6 +203,7 @@ libpolyval-y := polyval.o ifeq ($(CONFIG_CRYPTO_LIB_POLYVAL_ARCH),y) CFLAGS_polyval.o += -I$(src)/$(SRCARCH) libpolyval-$(CONFIG_ARM64) += arm64/polyval-ce-core.o +libpolyval-$(CONFIG_X86) += x86/polyval-pclmul-avx.o endif ################################################################################ diff --git a/arch/x86/crypto/polyval-clmulni_asm.S b/lib/crypto/x86/polyval-pclmul-avx.S similarity index 91% rename from arch/x86/crypto/polyval-clmulni_asm.S rename to lib/crypto/x86/polyval-pclmul-avx.S index a6ebe4e7dd2b..7f739465ad35 100644 --- a/arch/x86/crypto/polyval-clmulni_asm.S +++ 
b/lib/crypto/x86/polyval-pclmul-avx.S @@ -36,10 +36,10 @@ #define MI %xmm14 #define SUM %xmm15 -#define KEY_POWERS %rdi -#define MSG %rsi -#define BLOCKS_LEFT %rdx -#define ACCUMULATOR %rcx +#define ACCUMULATOR %rdi +#define KEY_POWERS %rsi +#define MSG %rdx +#define BLOCKS_LEFT %rcx #define TMP %rax .section .rodata.cst16.gstar, "aM", @progbits, 16 @@ -234,7 +234,7 @@ movups (MSG), %xmm0 pxor SUM, %xmm0 - movaps (KEY_POWERS), %xmm1 + movups (KEY_POWERS), %xmm1 schoolbook1_noload dec BLOCKS_LEFT addq $16, MSG @@ -261,15 +261,12 @@ .endm /* - * Perform montgomery multiplication in GF(2^128) and store result in op1. + * Computes a = a * b * x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1. * - * Computes op1*op2*x^{-128} mod x^128 + x^127 + x^126 + x^121 + 1 - * If op1, op2 are in montgomery form, this computes the montgomery - * form of op1*op2. - * - * void clmul_polyval_mul(u8 *op1, const u8 *op2); + * void polyval_mul_pclmul_avx(struct polyval_elem *a, + * const struct polyval_elem *b); */ -SYM_FUNC_START(clmul_polyval_mul) +SYM_FUNC_START(polyval_mul_pclmul_avx) FRAME_BEGIN vmovdqa .Lgstar(%rip), GSTAR movups (%rdi), %xmm0 @@ -280,22 +277,23 @@ SYM_FUNC_START(clmul_polyval_mul) movups SUM, (%rdi) FRAME_END RET -SYM_FUNC_END(clmul_polyval_mul) +SYM_FUNC_END(polyval_mul_pclmul_avx) /* * Perform polynomial evaluation as specified by POLYVAL. This computes: * h^n * accumulator + h^n * m_0 + ... + h^1 * m_{n-1} * where n=nblocks, h is the hash key, and m_i are the message blocks. * - * rdi - pointer to precomputed key powers h^8 ... h^1 - * rsi - pointer to message blocks - * rdx - number of blocks to hash - * rcx - pointer to the accumulator + * rdi - pointer to the accumulator + * rsi - pointer to precomputed key powers h^8 ... h^1 + * rdx - pointer to message blocks + * rcx - number of blocks to hash * - * void clmul_polyval_update(const struct polyval_tfm_ctx *keys, - * const u8 *in, size_t nblocks, u8 *accumulator); + * void polyval_blocks_pclmul_avx(struct polyval_elem *acc, + * const struct polyval_key *key, + * const u8 *data, size_t nblocks); */ -SYM_FUNC_START(clmul_polyval_update) +SYM_FUNC_START(polyval_blocks_pclmul_avx) FRAME_BEGIN vmovdqa .Lgstar(%rip), GSTAR movups (ACCUMULATOR), SUM @@ -318,4 +316,4 @@ SYM_FUNC_START(clmul_polyval_update) movups SUM, (ACCUMULATOR) FRAME_END RET -SYM_FUNC_END(clmul_polyval_update) +SYM_FUNC_END(polyval_blocks_pclmul_avx) diff --git a/lib/crypto/x86/polyval.h b/lib/crypto/x86/polyval.h new file mode 100644 index 000000000000..ef8797521420 --- /dev/null +++ b/lib/crypto/x86/polyval.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * POLYVAL library functions, x86_64 optimized + * + * Copyright 2025 Google LLC + */ +#include +#include + +#define NUM_H_POWERS 8 + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmul_avx); + +asmlinkage void polyval_mul_pclmul_avx(struct polyval_elem *a, + const struct polyval_elem *b); +asmlinkage void polyval_blocks_pclmul_avx(struct polyval_elem *acc, + const struct polyval_key *key, + const u8 *data, size_t nblocks); + +static void polyval_preparekey_arch(struct polyval_key *key, + const u8 raw_key[POLYVAL_BLOCK_SIZE]) +{ + static_assert(ARRAY_SIZE(key->h_powers) == NUM_H_POWERS); + memcpy(&key->h_powers[NUM_H_POWERS - 1], raw_key, POLYVAL_BLOCK_SIZE); + if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) { + kernel_fpu_begin(); + for (int i = NUM_H_POWERS - 2; i >= 0; i--) { + key->h_powers[i] = key->h_powers[i + 1]; + polyval_mul_pclmul_avx( + &key->h_powers[i], + 
&key->h_powers[NUM_H_POWERS - 1]); + } + kernel_fpu_end(); + } else { + for (int i = NUM_H_POWERS - 2; i >= 0; i--) { + key->h_powers[i] = key->h_powers[i + 1]; + polyval_mul_generic(&key->h_powers[i], + &key->h_powers[NUM_H_POWERS - 1]); + } + } +} + +static void polyval_mul_arch(struct polyval_elem *acc, + const struct polyval_key *key) +{ + if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) { + kernel_fpu_begin(); + polyval_mul_pclmul_avx(acc, &key->h_powers[NUM_H_POWERS - 1]); + kernel_fpu_end(); + } else { + polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]); + } +} + +static void polyval_blocks_arch(struct polyval_elem *acc, + const struct polyval_key *key, + const u8 *data, size_t nblocks) +{ + if (static_branch_likely(&have_pclmul_avx) && irq_fpu_usable()) { + do { + /* Allow rescheduling every 4 KiB. */ + size_t n = min_t(size_t, nblocks, + 4096 / POLYVAL_BLOCK_SIZE); + + kernel_fpu_begin(); + polyval_blocks_pclmul_avx(acc, key, data, n); + kernel_fpu_end(); + data += n * POLYVAL_BLOCK_SIZE; + nblocks -= n; + } while (nblocks); + } else { + polyval_blocks_generic(acc, &key->h_powers[NUM_H_POWERS - 1], + data, nblocks); + } +} + +#define polyval_mod_init_arch polyval_mod_init_arch +static void polyval_mod_init_arch(void) +{ + if (boot_cpu_has(X86_FEATURE_PCLMULQDQ) && + boot_cpu_has(X86_FEATURE_AVX)) + static_branch_enable(&have_pclmul_avx); +} From d35abc0b1ddff64934dc6ab637f7b5ce1cd1d91f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 9 Nov 2025 15:47:21 -0800 Subject: [PATCH 35/37] crypto: hctr2 - Convert to use POLYVAL library The "hash function" in hctr2 is fixed at POLYVAL; it can never vary. Just use the POLYVAL library, which is much easier to use than the crypto_shash API. It's faster, uses fixed-size structs, and never fails (all the functions return void). Note that this eliminates the only known user of the polyval support in crypto_shash. A later commit will remove support for polyval from crypto_shash, given that the library API is sufficient. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251109234726.638437-7-ebiggers@kernel.org Signed-off-by: Eric Biggers --- crypto/Kconfig | 2 +- crypto/hctr2.c | 222 +++++++++++++---------------------------------- crypto/testmgr.c | 3 +- 3 files changed, 64 insertions(+), 163 deletions(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index 57b85e903cf0..805172f75bf1 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -696,7 +696,7 @@ config CRYPTO_ECB config CRYPTO_HCTR2 tristate "HCTR2" select CRYPTO_XCTR - select CRYPTO_POLYVAL + select CRYPTO_LIB_POLYVAL select CRYPTO_MANAGER help HCTR2 length-preserving encryption mode diff --git a/crypto/hctr2.c b/crypto/hctr2.c index c8932777bba8..f4cd6c29b4d3 100644 --- a/crypto/hctr2.c +++ b/crypto/hctr2.c @@ -17,7 +17,6 @@ */ #include -#include #include #include #include @@ -37,23 +36,14 @@ struct hctr2_instance_ctx { struct crypto_cipher_spawn blockcipher_spawn; struct crypto_skcipher_spawn xctr_spawn; - struct crypto_shash_spawn polyval_spawn; }; struct hctr2_tfm_ctx { struct crypto_cipher *blockcipher; struct crypto_skcipher *xctr; - struct crypto_shash *polyval; + struct polyval_key poly_key; + struct polyval_elem hashed_tweaklens[2]; u8 L[BLOCKCIPHER_BLOCK_SIZE]; - int hashed_tweak_offset; - /* - * This struct is allocated with extra space for two exported hash - * states. Since the hash state size is not known at compile-time, we - * can't add these to the struct directly. 
- * - * hashed_tweaklen_divisible; - * hashed_tweaklen_remainder; - */ }; struct hctr2_request_ctx { @@ -63,39 +53,17 @@ struct hctr2_request_ctx { struct scatterlist *bulk_part_src; struct scatterlist sg_src[2]; struct scatterlist sg_dst[2]; + struct polyval_elem hashed_tweak; /* - * Sub-request sizes are unknown at compile-time, so they need to go - * after the members with known sizes. + * skcipher sub-request size is unknown at compile-time, so it needs to + * go after the members with known sizes. */ union { - struct shash_desc hash_desc; + struct polyval_ctx poly_ctx; struct skcipher_request xctr_req; } u; - /* - * This struct is allocated with extra space for one exported hash - * state. Since the hash state size is not known at compile-time, we - * can't add it to the struct directly. - * - * hashed_tweak; - */ }; -static inline u8 *hctr2_hashed_tweaklen(const struct hctr2_tfm_ctx *tctx, - bool has_remainder) -{ - u8 *p = (u8 *)tctx + sizeof(*tctx); - - if (has_remainder) /* For messages not a multiple of block length */ - p += crypto_shash_statesize(tctx->polyval); - return p; -} - -static inline u8 *hctr2_hashed_tweak(const struct hctr2_tfm_ctx *tctx, - struct hctr2_request_ctx *rctx) -{ - return (u8 *)rctx + tctx->hashed_tweak_offset; -} - /* * The input data for each HCTR2 hash step begins with a 16-byte block that * contains the tweak length and a flag that indicates whether the input is evenly @@ -106,24 +74,23 @@ static inline u8 *hctr2_hashed_tweak(const struct hctr2_tfm_ctx *tctx, * * These precomputed hashes are stored in hctr2_tfm_ctx. */ -static int hctr2_hash_tweaklen(struct hctr2_tfm_ctx *tctx, bool has_remainder) +static void hctr2_hash_tweaklens(struct hctr2_tfm_ctx *tctx) { - SHASH_DESC_ON_STACK(shash, tfm->polyval); - __le64 tweak_length_block[2]; - int err; + struct polyval_ctx ctx; - shash->tfm = tctx->polyval; - memset(tweak_length_block, 0, sizeof(tweak_length_block)); + for (int has_remainder = 0; has_remainder < 2; has_remainder++) { + const __le64 tweak_length_block[2] = { + cpu_to_le64(TWEAK_SIZE * 8 * 2 + 2 + has_remainder), + }; - tweak_length_block[0] = cpu_to_le64(TWEAK_SIZE * 8 * 2 + 2 + has_remainder); - err = crypto_shash_init(shash); - if (err) - return err; - err = crypto_shash_update(shash, (u8 *)tweak_length_block, - POLYVAL_BLOCK_SIZE); - if (err) - return err; - return crypto_shash_export(shash, hctr2_hashed_tweaklen(tctx, has_remainder)); + polyval_init(&ctx, &tctx->poly_key); + polyval_update(&ctx, (const u8 *)&tweak_length_block, + sizeof(tweak_length_block)); + static_assert(sizeof(tweak_length_block) == POLYVAL_BLOCK_SIZE); + polyval_export_blkaligned( + &ctx, &tctx->hashed_tweaklens[has_remainder]); + } + memzero_explicit(&ctx, sizeof(ctx)); } static int hctr2_setkey(struct crypto_skcipher *tfm, const u8 *key, @@ -156,51 +123,42 @@ static int hctr2_setkey(struct crypto_skcipher *tfm, const u8 *key, tctx->L[0] = 0x01; crypto_cipher_encrypt_one(tctx->blockcipher, tctx->L, tctx->L); - crypto_shash_clear_flags(tctx->polyval, CRYPTO_TFM_REQ_MASK); - crypto_shash_set_flags(tctx->polyval, crypto_skcipher_get_flags(tfm) & - CRYPTO_TFM_REQ_MASK); - err = crypto_shash_setkey(tctx->polyval, hbar, BLOCKCIPHER_BLOCK_SIZE); - if (err) - return err; + static_assert(sizeof(hbar) == POLYVAL_BLOCK_SIZE); + polyval_preparekey(&tctx->poly_key, hbar); memzero_explicit(hbar, sizeof(hbar)); - return hctr2_hash_tweaklen(tctx, true) ?: hctr2_hash_tweaklen(tctx, false); + hctr2_hash_tweaklens(tctx); + return 0; } -static int hctr2_hash_tweak(struct 
skcipher_request *req) +static void hctr2_hash_tweak(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct hctr2_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); struct hctr2_request_ctx *rctx = skcipher_request_ctx(req); - struct shash_desc *hash_desc = &rctx->u.hash_desc; - int err; + struct polyval_ctx *poly_ctx = &rctx->u.poly_ctx; bool has_remainder = req->cryptlen % POLYVAL_BLOCK_SIZE; - hash_desc->tfm = tctx->polyval; - err = crypto_shash_import(hash_desc, hctr2_hashed_tweaklen(tctx, has_remainder)); - if (err) - return err; - err = crypto_shash_update(hash_desc, req->iv, TWEAK_SIZE); - if (err) - return err; + polyval_import_blkaligned(poly_ctx, &tctx->poly_key, + &tctx->hashed_tweaklens[has_remainder]); + polyval_update(poly_ctx, req->iv, TWEAK_SIZE); // Store the hashed tweak, since we need it when computing both // H(T || N) and H(T || V). - return crypto_shash_export(hash_desc, hctr2_hashed_tweak(tctx, rctx)); + static_assert(TWEAK_SIZE % POLYVAL_BLOCK_SIZE == 0); + polyval_export_blkaligned(poly_ctx, &rctx->hashed_tweak); } -static int hctr2_hash_message(struct skcipher_request *req, - struct scatterlist *sgl, - u8 digest[POLYVAL_DIGEST_SIZE]) +static void hctr2_hash_message(struct skcipher_request *req, + struct scatterlist *sgl, + u8 digest[POLYVAL_DIGEST_SIZE]) { - static const u8 padding[BLOCKCIPHER_BLOCK_SIZE] = { 0x1 }; + static const u8 padding = 0x1; struct hctr2_request_ctx *rctx = skcipher_request_ctx(req); - struct shash_desc *hash_desc = &rctx->u.hash_desc; + struct polyval_ctx *poly_ctx = &rctx->u.poly_ctx; const unsigned int bulk_len = req->cryptlen - BLOCKCIPHER_BLOCK_SIZE; struct sg_mapping_iter miter; - unsigned int remainder = bulk_len % BLOCKCIPHER_BLOCK_SIZE; int i; - int err = 0; int n = 0; sg_miter_start(&miter, sgl, sg_nents(sgl), @@ -208,22 +166,13 @@ static int hctr2_hash_message(struct skcipher_request *req, for (i = 0; i < bulk_len; i += n) { sg_miter_next(&miter); n = min_t(unsigned int, miter.length, bulk_len - i); - err = crypto_shash_update(hash_desc, miter.addr, n); - if (err) - break; + polyval_update(poly_ctx, miter.addr, n); } sg_miter_stop(&miter); - if (err) - return err; - - if (remainder) { - err = crypto_shash_update(hash_desc, padding, - BLOCKCIPHER_BLOCK_SIZE - remainder); - if (err) - return err; - } - return crypto_shash_final(hash_desc, digest); + if (req->cryptlen % BLOCKCIPHER_BLOCK_SIZE) + polyval_update(poly_ctx, &padding, 1); + polyval_final(poly_ctx, digest); } static int hctr2_finish(struct skcipher_request *req) @@ -231,19 +180,14 @@ static int hctr2_finish(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct hctr2_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); struct hctr2_request_ctx *rctx = skcipher_request_ctx(req); + struct polyval_ctx *poly_ctx = &rctx->u.poly_ctx; u8 digest[POLYVAL_DIGEST_SIZE]; - struct shash_desc *hash_desc = &rctx->u.hash_desc; - int err; // U = UU ^ H(T || V) // or M = MM ^ H(T || N) - hash_desc->tfm = tctx->polyval; - err = crypto_shash_import(hash_desc, hctr2_hashed_tweak(tctx, rctx)); - if (err) - return err; - err = hctr2_hash_message(req, rctx->bulk_part_dst, digest); - if (err) - return err; + polyval_import_blkaligned(poly_ctx, &tctx->poly_key, + &rctx->hashed_tweak); + hctr2_hash_message(req, rctx->bulk_part_dst, digest); crypto_xor(rctx->first_block, digest, BLOCKCIPHER_BLOCK_SIZE); // Copy U (or M) into dst scatterlist @@ -269,7 +213,6 @@ static int hctr2_crypt(struct skcipher_request *req, bool enc) 
struct hctr2_request_ctx *rctx = skcipher_request_ctx(req); u8 digest[POLYVAL_DIGEST_SIZE]; int bulk_len = req->cryptlen - BLOCKCIPHER_BLOCK_SIZE; - int err; // Requests must be at least one block if (req->cryptlen < BLOCKCIPHER_BLOCK_SIZE) @@ -287,12 +230,8 @@ static int hctr2_crypt(struct skcipher_request *req, bool enc) // MM = M ^ H(T || N) // or UU = U ^ H(T || V) - err = hctr2_hash_tweak(req); - if (err) - return err; - err = hctr2_hash_message(req, rctx->bulk_part_src, digest); - if (err) - return err; + hctr2_hash_tweak(req); + hctr2_hash_message(req, rctx->bulk_part_src, digest); crypto_xor(digest, rctx->first_block, BLOCKCIPHER_BLOCK_SIZE); // UU = E(MM) @@ -338,8 +277,6 @@ static int hctr2_init_tfm(struct crypto_skcipher *tfm) struct hctr2_tfm_ctx *tctx = crypto_skcipher_ctx(tfm); struct crypto_skcipher *xctr; struct crypto_cipher *blockcipher; - struct crypto_shash *polyval; - unsigned int subreq_size; int err; xctr = crypto_spawn_skcipher(&ictx->xctr_spawn); @@ -352,31 +289,17 @@ static int hctr2_init_tfm(struct crypto_skcipher *tfm) goto err_free_xctr; } - polyval = crypto_spawn_shash(&ictx->polyval_spawn); - if (IS_ERR(polyval)) { - err = PTR_ERR(polyval); - goto err_free_blockcipher; - } - tctx->xctr = xctr; tctx->blockcipher = blockcipher; - tctx->polyval = polyval; BUILD_BUG_ON(offsetofend(struct hctr2_request_ctx, u) != sizeof(struct hctr2_request_ctx)); - subreq_size = max(sizeof_field(struct hctr2_request_ctx, u.hash_desc) + - crypto_shash_descsize(polyval), - sizeof_field(struct hctr2_request_ctx, u.xctr_req) + - crypto_skcipher_reqsize(xctr)); - - tctx->hashed_tweak_offset = offsetof(struct hctr2_request_ctx, u) + - subreq_size; - crypto_skcipher_set_reqsize(tfm, tctx->hashed_tweak_offset + - crypto_shash_statesize(polyval)); + crypto_skcipher_set_reqsize( + tfm, max(sizeof(struct hctr2_request_ctx), + offsetofend(struct hctr2_request_ctx, u.xctr_req) + + crypto_skcipher_reqsize(xctr))); return 0; -err_free_blockcipher: - crypto_free_cipher(blockcipher); err_free_xctr: crypto_free_skcipher(xctr); return err; @@ -388,7 +311,6 @@ static void hctr2_exit_tfm(struct crypto_skcipher *tfm) crypto_free_cipher(tctx->blockcipher); crypto_free_skcipher(tctx->xctr); - crypto_free_shash(tctx->polyval); } static void hctr2_free_instance(struct skcipher_instance *inst) @@ -397,21 +319,17 @@ static void hctr2_free_instance(struct skcipher_instance *inst) crypto_drop_cipher(&ictx->blockcipher_spawn); crypto_drop_skcipher(&ictx->xctr_spawn); - crypto_drop_shash(&ictx->polyval_spawn); kfree(inst); } -static int hctr2_create_common(struct crypto_template *tmpl, - struct rtattr **tb, - const char *xctr_name, - const char *polyval_name) +static int hctr2_create_common(struct crypto_template *tmpl, struct rtattr **tb, + const char *xctr_name) { struct skcipher_alg_common *xctr_alg; u32 mask; struct skcipher_instance *inst; struct hctr2_instance_ctx *ictx; struct crypto_alg *blockcipher_alg; - struct shash_alg *polyval_alg; char blockcipher_name[CRYPTO_MAX_ALG_NAME]; int len; int err; @@ -457,19 +375,6 @@ static int hctr2_create_common(struct crypto_template *tmpl, if (blockcipher_alg->cra_blocksize != BLOCKCIPHER_BLOCK_SIZE) goto err_free_inst; - /* Polyval ε-∆U hash function */ - err = crypto_grab_shash(&ictx->polyval_spawn, - skcipher_crypto_instance(inst), - polyval_name, 0, mask); - if (err) - goto err_free_inst; - polyval_alg = crypto_spawn_shash_alg(&ictx->polyval_spawn); - - /* Ensure Polyval is being used */ - err = -EINVAL; - if (strcmp(polyval_alg->base.cra_name, "polyval") != 0) 
- goto err_free_inst; - /* Instance fields */ err = -ENAMETOOLONG; @@ -477,22 +382,16 @@ static int hctr2_create_common(struct crypto_template *tmpl, blockcipher_alg->cra_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, - "hctr2_base(%s,%s)", - xctr_alg->base.cra_driver_name, - polyval_alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) + "hctr2_base(%s,polyval-lib)", + xctr_alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_blocksize = BLOCKCIPHER_BLOCK_SIZE; - inst->alg.base.cra_ctxsize = sizeof(struct hctr2_tfm_ctx) + - polyval_alg->statesize * 2; + inst->alg.base.cra_ctxsize = sizeof(struct hctr2_tfm_ctx); inst->alg.base.cra_alignmask = xctr_alg->base.cra_alignmask; - /* - * The hash function is called twice, so it is weighted higher than the - * xctr and blockcipher. - */ inst->alg.base.cra_priority = (2 * xctr_alg->base.cra_priority + - 4 * polyval_alg->base.cra_priority + - blockcipher_alg->cra_priority) / 7; + blockcipher_alg->cra_priority) / + 3; inst->alg.setkey = hctr2_setkey; inst->alg.encrypt = hctr2_encrypt; @@ -525,8 +424,11 @@ static int hctr2_create_base(struct crypto_template *tmpl, struct rtattr **tb) polyval_name = crypto_attr_alg_name(tb[2]); if (IS_ERR(polyval_name)) return PTR_ERR(polyval_name); + if (strcmp(polyval_name, "polyval") != 0 && + strcmp(polyval_name, "polyval-lib") != 0) + return -ENOENT; - return hctr2_create_common(tmpl, tb, xctr_name, polyval_name); + return hctr2_create_common(tmpl, tb, xctr_name); } static int hctr2_create(struct crypto_template *tmpl, struct rtattr **tb) @@ -542,7 +444,7 @@ static int hctr2_create(struct crypto_template *tmpl, struct rtattr **tb) blockcipher_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; - return hctr2_create_common(tmpl, tb, xctr_name, "polyval"); + return hctr2_create_common(tmpl, tb, xctr_name); } static struct crypto_template hctr2_tmpls[] = { diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 90d06c3ec967..499e979a56dc 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -5059,8 +5059,7 @@ static const struct alg_test_desc alg_test_descs[] = { } }, { .alg = "hctr2(aes)", - .generic_driver = - "hctr2_base(xctr(aes-generic),polyval-generic)", + .generic_driver = "hctr2_base(xctr(aes-generic),polyval-lib)", .test = alg_test_skcipher, .suite = { .cipher = __VECS(aes_hctr2_tv_template) From fd36de5749244c66f55eb943a5bbedbd9d6dd385 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 9 Nov 2025 15:47:22 -0800 Subject: [PATCH 36/37] crypto: polyval - Remove the polyval crypto_shash Remove polyval support from crypto_shash. It no longer has any user now that the HCTR2 code uses the POLYVAL library instead. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251109234726.638437-8-ebiggers@kernel.org Signed-off-by: Eric Biggers --- crypto/Kconfig | 10 -- crypto/Makefile | 1 - crypto/polyval-generic.c | 205 --------------------------------------- 3 files changed, 216 deletions(-) delete mode 100644 crypto/polyval-generic.c diff --git a/crypto/Kconfig b/crypto/Kconfig index 805172f75bf1..bf8b8a60a0c0 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -948,16 +948,6 @@ config CRYPTO_MICHAEL_MIC This algorithm is required for TKIP, but it should not be used for other purposes because of the weakness of the algorithm. -config CRYPTO_POLYVAL - tristate - select CRYPTO_HASH - select CRYPTO_LIB_GF128MUL - help - POLYVAL hash function for HCTR2 - - This is used in HCTR2. 
It is not a general-purpose - cryptographic hash function. - config CRYPTO_RMD160 tristate "RIPEMD-160" select CRYPTO_HASH diff --git a/crypto/Makefile b/crypto/Makefile index 0388ff8d219d..093c56a45d3f 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -172,7 +172,6 @@ jitterentropy_rng-y := jitterentropy.o jitterentropy-kcapi.o obj-$(CONFIG_CRYPTO_JITTERENTROPY_TESTINTERFACE) += jitterentropy-testing.o obj-$(CONFIG_CRYPTO_BENCHMARK) += tcrypt.o obj-$(CONFIG_CRYPTO_GHASH) += ghash-generic.o -obj-$(CONFIG_CRYPTO_POLYVAL) += polyval-generic.o obj-$(CONFIG_CRYPTO_USER_API) += af_alg.o obj-$(CONFIG_CRYPTO_USER_API_HASH) += algif_hash.o obj-$(CONFIG_CRYPTO_USER_API_SKCIPHER) += algif_skcipher.o diff --git a/crypto/polyval-generic.c b/crypto/polyval-generic.c deleted file mode 100644 index fe5b01a4000d..000000000000 --- a/crypto/polyval-generic.c +++ /dev/null @@ -1,205 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * POLYVAL: hash function for HCTR2. - * - * Copyright (c) 2007 Nokia Siemens Networks - Mikko Herranen - * Copyright (c) 2009 Intel Corp. - * Author: Huang Ying - * Copyright 2021 Google LLC - */ - -/* - * Code based on crypto/ghash-generic.c - * - * POLYVAL is a keyed hash function similar to GHASH. POLYVAL uses a different - * modulus for finite field multiplication which makes hardware accelerated - * implementations on little-endian machines faster. POLYVAL is used in the - * kernel to implement HCTR2, but was originally specified for AES-GCM-SIV - * (RFC 8452). - * - * For more information see: - * Length-preserving encryption with HCTR2: - * https://eprint.iacr.org/2021/1441.pdf - * AES-GCM-SIV: Nonce Misuse-Resistant Authenticated Encryption: - * https://datatracker.ietf.org/doc/html/rfc8452 - * - * Like GHASH, POLYVAL is not a cryptographic hash function and should - * not be used outside of crypto modes explicitly designed to use POLYVAL. - * - * This implementation uses a convenient trick involving the GHASH and POLYVAL - * fields. This trick allows multiplication in the POLYVAL field to be - * implemented by using multiplication in the GHASH field as a subroutine. An - * element of the POLYVAL field can be converted to an element of the GHASH - * field by computing x*REVERSE(a), where REVERSE reverses the byte-ordering of - * a. Similarly, an element of the GHASH field can be converted back to the - * POLYVAL field by computing REVERSE(x^{-1}*a). For more information, see: - * https://datatracker.ietf.org/doc/html/rfc8452#appendix-A - * - * By using this trick, we do not need to implement the POLYVAL field for the - * generic implementation. - * - * Warning: this generic implementation is not intended to be used in practice - * and is not constant time. For practical use, a hardware accelerated - * implementation of POLYVAL should be used instead. 
- * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct polyval_tfm_ctx { - struct gf128mul_4k *gf128; -}; - -struct polyval_desc_ctx { - union { - u8 buffer[POLYVAL_BLOCK_SIZE]; - be128 buffer128; - }; -}; - -static void copy_and_reverse(u8 dst[POLYVAL_BLOCK_SIZE], - const u8 src[POLYVAL_BLOCK_SIZE]) -{ - u64 a = get_unaligned((const u64 *)&src[0]); - u64 b = get_unaligned((const u64 *)&src[8]); - - put_unaligned(swab64(a), (u64 *)&dst[8]); - put_unaligned(swab64(b), (u64 *)&dst[0]); -} - -static int polyval_setkey(struct crypto_shash *tfm, - const u8 *key, unsigned int keylen) -{ - struct polyval_tfm_ctx *ctx = crypto_shash_ctx(tfm); - be128 k; - - if (keylen != POLYVAL_BLOCK_SIZE) - return -EINVAL; - - gf128mul_free_4k(ctx->gf128); - - BUILD_BUG_ON(sizeof(k) != POLYVAL_BLOCK_SIZE); - copy_and_reverse((u8 *)&k, key); - gf128mul_x_lle(&k, &k); - - ctx->gf128 = gf128mul_init_4k_lle(&k); - memzero_explicit(&k, POLYVAL_BLOCK_SIZE); - - if (!ctx->gf128) - return -ENOMEM; - - return 0; -} - -static int polyval_generic_init(struct shash_desc *desc) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - - memset(dctx, 0, sizeof(*dctx)); - - return 0; -} - -static int polyval_generic_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - const struct polyval_tfm_ctx *ctx = crypto_shash_ctx(desc->tfm); - u8 tmp[POLYVAL_BLOCK_SIZE]; - - do { - copy_and_reverse(tmp, src); - crypto_xor(dctx->buffer, tmp, POLYVAL_BLOCK_SIZE); - gf128mul_4k_lle(&dctx->buffer128, ctx->gf128); - src += POLYVAL_BLOCK_SIZE; - srclen -= POLYVAL_BLOCK_SIZE; - } while (srclen >= POLYVAL_BLOCK_SIZE); - - return srclen; -} - -static int polyval_finup(struct shash_desc *desc, const u8 *src, - unsigned int len, u8 *dst) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - - if (len) { - u8 tmp[POLYVAL_BLOCK_SIZE] = {}; - - memcpy(tmp, src, len); - polyval_generic_update(desc, tmp, POLYVAL_BLOCK_SIZE); - } - copy_and_reverse(dst, dctx->buffer); - return 0; -} - -static int polyval_export(struct shash_desc *desc, void *out) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - - copy_and_reverse(out, dctx->buffer); - return 0; -} - -static int polyval_import(struct shash_desc *desc, const void *in) -{ - struct polyval_desc_ctx *dctx = shash_desc_ctx(desc); - - copy_and_reverse(dctx->buffer, in); - return 0; -} - -static void polyval_exit_tfm(struct crypto_shash *tfm) -{ - struct polyval_tfm_ctx *ctx = crypto_shash_ctx(tfm); - - gf128mul_free_4k(ctx->gf128); -} - -static struct shash_alg polyval_alg = { - .digestsize = POLYVAL_DIGEST_SIZE, - .init = polyval_generic_init, - .update = polyval_generic_update, - .finup = polyval_finup, - .setkey = polyval_setkey, - .export = polyval_export, - .import = polyval_import, - .exit_tfm = polyval_exit_tfm, - .statesize = sizeof(struct polyval_desc_ctx), - .descsize = sizeof(struct polyval_desc_ctx), - .base = { - .cra_name = "polyval", - .cra_driver_name = "polyval-generic", - .cra_priority = 100, - .cra_flags = CRYPTO_AHASH_ALG_BLOCK_ONLY, - .cra_blocksize = POLYVAL_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct polyval_tfm_ctx), - .cra_module = THIS_MODULE, - }, -}; - -static int __init polyval_mod_init(void) -{ - return crypto_register_shash(&polyval_alg); -} - -static void __exit polyval_mod_exit(void) -{ - crypto_unregister_shash(&polyval_alg); -} - -module_init(polyval_mod_init); -module_exit(polyval_mod_exit); - -MODULE_LICENSE("GPL"); 
-MODULE_DESCRIPTION("POLYVAL hash function"); -MODULE_ALIAS_CRYPTO("polyval"); -MODULE_ALIAS_CRYPTO("polyval-generic"); From 2dbb6f4a25d38fcf7d6c1c682e45a13e6bbe9562 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 9 Nov 2025 15:47:24 -0800 Subject: [PATCH 37/37] fscrypt: Drop obsolete recommendation to enable optimized POLYVAL CONFIG_CRYPTO_POLYVAL_ARM64_CE and CONFIG_CRYPTO_POLYVAL_CLMUL_NI no longer exist. The architecture-optimized POLYVAL code is now just enabled automatically when HCTR2 support is enabled. Update the fscrypt documentation accordingly. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251109234726.638437-10-ebiggers@kernel.org Signed-off-by: Eric Biggers --- Documentation/filesystems/fscrypt.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst index 696a5844bfa3..70af896822e1 100644 --- a/Documentation/filesystems/fscrypt.rst +++ b/Documentation/filesystems/fscrypt.rst @@ -450,9 +450,7 @@ API, but the filenames mode still does. - CONFIG_CRYPTO_HCTR2 - Recommended: - arm64: CONFIG_CRYPTO_AES_ARM64_CE_BLK - - arm64: CONFIG_CRYPTO_POLYVAL_ARM64_CE - x86: CONFIG_CRYPTO_AES_NI_INTEL - - x86: CONFIG_CRYPTO_POLYVAL_CLMUL_NI - Adiantum - Mandatory: