Merge tag 'fpsimd-on-stack-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux

Pull arm64 FPSIMD on-stack buffer updates from Eric Biggers:
 "This is a core arm64 change. However, I was asked to take this because
  most uses of kernel-mode FPSIMD are in crypto or CRC code.

  In v6.8, the size of task_struct on arm64 increased by 528 bytes due
  to the new 'kernel_fpsimd_state' field. This field was added to allow
  kernel-mode FPSIMD code to be preempted.

  Unfortunately, 528 bytes is kind of a lot for task_struct. This
  regression in the task_struct size was noticed and reported.

  Recover that space by allocating this state on the stack at the
  beginning of each kernel-mode FPSIMD section.

  To make it easier for all the users of kernel-mode FPSIMD to do that
  correctly, introduce and use a 'scoped_ksimd' abstraction"

* tag 'fpsimd-on-stack-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux: (23 commits)
  lib/crypto: arm64: Move remaining algorithms to scoped ksimd API
  lib/crypto: arm/blake2b: Move to scoped ksimd API
  arm64/fpsimd: Allocate kernel mode FP/SIMD buffers on the stack
  arm64/fpu: Enforce task-context only for generic kernel mode FPU
  net/mlx5: Switch to more abstract scoped ksimd guard API on arm64
  arm64/xorblocks: Switch to 'ksimd' scoped guard API
  crypto/arm64: sm4 - Switch to 'ksimd' scoped guard API
  crypto/arm64: sm3 - Switch to 'ksimd' scoped guard API
  crypto/arm64: sha3 - Switch to 'ksimd' scoped guard API
  crypto/arm64: polyval - Switch to 'ksimd' scoped guard API
  crypto/arm64: nhpoly1305 - Switch to 'ksimd' scoped guard API
  crypto/arm64: aes-gcm - Switch to 'ksimd' scoped guard API
  crypto/arm64: aes-blk - Switch to 'ksimd' scoped guard API
  crypto/arm64: aes-ccm - Switch to 'ksimd' scoped guard API
  raid6: Move to more abstract 'ksimd' guard API
  crypto: aegis128-neon - Move to more abstract 'ksimd' guard API
  crypto/arm64: sm4-ce-gcm - Avoid pointless yield of the NEON unit
  crypto/arm64: sm4-ce-ccm - Avoid pointless yield of the NEON unit
  crypto/arm64: aes-ce-ccm - Avoid pointless yield of the NEON unit
  lib/crc: Switch ARM and arm64 to 'ksimd' scoped guard API
  ...
Committed by Linus Torvalds, 2025-12-02 18:53:50 -08:00
42 changed files with 614 additions and 709 deletions
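
The conversion pattern is the same throughout the diff below: each kernel_neon_begin()/kernel_neon_end() pair becomes a scoped_ksimd() guard around the SIMD call (or a braced block of calls), and the may_use_simd() check stays with the caller. As a rough sketch distilled from the hunks that follow (do_simd_op() and do_generic_op() are placeholders, not functions from this series):

	/* Before: explicit critical section; since v6.8 the FP/SIMD state
	 * was saved in task_struct ('kernel_fpsimd_state') so that the
	 * section could be preempted. */
	if (likely(may_use_simd())) {
		kernel_neon_begin();
		do_simd_op(dst, src, len);
		kernel_neon_end();
	} else {
		do_generic_op(dst, src, len);
	}

	/* After: scoped_ksimd() guards the statement (or a { } block); the
	 * kernel-mode FP/SIMD state is a buffer allocated on the caller's
	 * stack that lives only for the duration of the scope. */
	if (likely(may_use_simd())) {
		scoped_ksimd()
			do_simd_op(dst, src, len);
	} else {
		do_generic_op(dst, src, len);
	}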

View File

@@ -5,7 +5,6 @@
  * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
  */
-#include <asm/neon.h>
 #include <asm/simd.h>
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
@@ -19,22 +18,16 @@ asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len,
 static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
 {
-	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) {
+	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && likely(may_use_simd())) {
 		if (static_branch_likely(&have_pmull)) {
-			if (likely(may_use_simd())) {
-				kernel_neon_begin();
-				crc = crc_t10dif_pmull64(crc, data, length);
-				kernel_neon_end();
-				return crc;
-			}
+			scoped_ksimd()
+				return crc_t10dif_pmull64(crc, data, length);
 		} else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE &&
-			   static_branch_likely(&have_neon) &&
-			   likely(may_use_simd())) {
+			   static_branch_likely(&have_neon)) {
 			u8 buf[16] __aligned(16);
-			kernel_neon_begin();
-			crc_t10dif_pmull8(crc, data, length, buf);
-			kernel_neon_end();
+			scoped_ksimd()
+				crc_t10dif_pmull8(crc, data, length, buf);
 			return crc_t10dif_generic(0, buf, sizeof(buf));
 		}

View File

@@ -8,7 +8,6 @@
 #include <linux/cpufeature.h>
 #include <asm/hwcap.h>
-#include <asm/neon.h>
 #include <asm/simd.h>
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
@@ -42,9 +41,8 @@ static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
 			len -= n;
 		}
 		n = round_down(len, 16);
-		kernel_neon_begin();
-		crc = crc32_pmull_le(p, n, crc);
-		kernel_neon_end();
+		scoped_ksimd()
+			crc = crc32_pmull_le(p, n, crc);
 		p += n;
 		len -= n;
 	}
@@ -71,9 +69,8 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
 			len -= n;
 		}
 		n = round_down(len, 16);
-		kernel_neon_begin();
-		crc = crc32c_pmull_le(p, n, crc);
-		kernel_neon_end();
+		scoped_ksimd()
+			crc = crc32c_pmull_le(p, n, crc);
 		p += n;
 		len -= n;
 	}

View File

@@ -7,7 +7,6 @@
 #include <linux/cpufeature.h>
-#include <asm/neon.h>
 #include <asm/simd.h>
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_asimd);
@@ -21,22 +20,16 @@ asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
 static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
 {
-	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) {
+	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && likely(may_use_simd())) {
 		if (static_branch_likely(&have_pmull)) {
-			if (likely(may_use_simd())) {
-				kernel_neon_begin();
-				crc = crc_t10dif_pmull_p64(crc, data, length);
-				kernel_neon_end();
-				return crc;
-			}
+			scoped_ksimd()
+				return crc_t10dif_pmull_p64(crc, data, length);
 		} else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE &&
-			   static_branch_likely(&have_asimd) &&
-			   likely(may_use_simd())) {
+			   static_branch_likely(&have_asimd)) {
 			u8 buf[16];
-			kernel_neon_begin();
-			crc_t10dif_pmull_p8(crc, data, length, buf);
-			kernel_neon_end();
+			scoped_ksimd()
+				crc_t10dif_pmull_p8(crc, data, length, buf);
 			return crc_t10dif_generic(0, buf, sizeof(buf));
 		}

View File

@@ -2,7 +2,6 @@
 #include <asm/alternative.h>
 #include <asm/cpufeature.h>
-#include <asm/neon.h>
 #include <asm/simd.h>
 // The minimum input length to consider the 4-way interleaved code path
@@ -23,9 +22,8 @@ static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
 	if (len >= min_len && cpu_have_named_feature(PMULL) &&
 	    likely(may_use_simd())) {
-		kernel_neon_begin();
-		crc = crc32_le_arm64_4way(crc, p, len);
-		kernel_neon_end();
+		scoped_ksimd()
+			crc = crc32_le_arm64_4way(crc, p, len);
 		p += round_down(len, 64);
 		len %= 64;
@@ -44,9 +42,8 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
 	if (len >= min_len && cpu_have_named_feature(PMULL) &&
 	    likely(may_use_simd())) {
-		kernel_neon_begin();
-		crc = crc32c_le_arm64_4way(crc, p, len);
-		kernel_neon_end();
+		scoped_ksimd()
+			crc = crc32c_le_arm64_4way(crc, p, len);
 		p += round_down(len, 64);
 		len %= 64;
@@ -65,9 +62,8 @@ static inline u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
 	if (len >= min_len && cpu_have_named_feature(PMULL) &&
 	    likely(may_use_simd())) {
-		kernel_neon_begin();
-		crc = crc32_be_arm64_4way(crc, p, len);
-		kernel_neon_end();
+		scoped_ksimd()
+			crc = crc32_be_arm64_4way(crc, p, len);
 		p += round_down(len, 64);
 		len %= 64;

View File

@@ -24,9 +24,8 @@ static void blake2b_compress(struct blake2b_ctx *ctx,
 		const size_t blocks = min_t(size_t, nblocks,
 					    SZ_4K / BLAKE2B_BLOCK_SIZE);
-		kernel_neon_begin();
-		blake2b_compress_neon(ctx, data, blocks, inc);
-		kernel_neon_end();
+		scoped_ksimd()
+			blake2b_compress_neon(ctx, data, blocks, inc);
 		data += blocks * BLAKE2B_BLOCK_SIZE;
 		nblocks -= blocks;

View File

@@ -12,7 +12,6 @@
 #include <asm/cputype.h>
 #include <asm/hwcap.h>
-#include <asm/neon.h>
 #include <asm/simd.h>
 asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
@@ -68,9 +67,8 @@ static void hchacha_block_arch(const struct chacha_state *state,
 	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
 		hchacha_block_arm(state, out, nrounds);
 	} else {
-		kernel_neon_begin();
-		hchacha_block_neon(state, out, nrounds);
-		kernel_neon_end();
+		scoped_ksimd()
+			hchacha_block_neon(state, out, nrounds);
 	}
 }
@@ -87,9 +85,8 @@ static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
 	do {
 		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
-		kernel_neon_begin();
-		chacha_doneon(state, dst, src, todo, nrounds);
-		kernel_neon_end();
+		scoped_ksimd()
+			chacha_doneon(state, dst, src, todo, nrounds);
 		bytes -= todo;
 		src += todo;

View File

@@ -25,9 +25,8 @@ static void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
 			    const u8 point[CURVE25519_KEY_SIZE])
 {
 	if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
-		kernel_neon_begin();
-		curve25519_neon(out, scalar, point);
-		kernel_neon_end();
+		scoped_ksimd()
+			curve25519_neon(out, scalar, point);
 	} else {
 		curve25519_generic(out, scalar, point);
 	}

View File

@@ -6,7 +6,6 @@
  */
 #include <asm/hwcap.h>
-#include <asm/neon.h>
 #include <asm/simd.h>
 #include <linux/cpufeature.h>
 #include <linux/jump_label.h>
@@ -32,9 +31,8 @@ static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src,
 	do {
 		unsigned int todo = min_t(unsigned int, len, SZ_4K);
-		kernel_neon_begin();
-		poly1305_blocks_neon(state, src, todo, padbit);
-		kernel_neon_end();
+		scoped_ksimd()
+			poly1305_blocks_neon(state, src, todo, padbit);
 		len -= todo;
 		src += todo;

View File

@@ -4,7 +4,6 @@
  *
  * Copyright 2025 Google LLC
  */
-#include <asm/neon.h>
 #include <asm/simd.h>
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
@@ -22,12 +21,12 @@ static void sha1_blocks(struct sha1_block_state *state,
 {
 	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
 	    static_branch_likely(&have_neon) && likely(may_use_simd())) {
-		kernel_neon_begin();
-		if (static_branch_likely(&have_ce))
-			sha1_ce_transform(state, data, nblocks);
-		else
-			sha1_transform_neon(state, data, nblocks);
-		kernel_neon_end();
+		scoped_ksimd() {
+			if (static_branch_likely(&have_ce))
+				sha1_ce_transform(state, data, nblocks);
+			else
+				sha1_transform_neon(state, data, nblocks);
+		}
 	} else {
 		sha1_block_data_order(state, data, nblocks);
 	}

View File

@@ -22,12 +22,12 @@ static void sha256_blocks(struct sha256_block_state *state,
 {
 	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
 	    static_branch_likely(&have_neon) && likely(may_use_simd())) {
-		kernel_neon_begin();
-		if (static_branch_likely(&have_ce))
-			sha256_ce_transform(state, data, nblocks);
-		else
-			sha256_block_data_order_neon(state, data, nblocks);
-		kernel_neon_end();
+		scoped_ksimd() {
+			if (static_branch_likely(&have_ce))
+				sha256_ce_transform(state, data, nblocks);
+			else
+				sha256_block_data_order_neon(state, data, nblocks);
+		}
 	} else {
 		sha256_block_data_order(state, data, nblocks);
 	}

View File

@@ -19,9 +19,8 @@ static void sha512_blocks(struct sha512_block_state *state,
 {
 	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
 	    static_branch_likely(&have_neon) && likely(may_use_simd())) {
-		kernel_neon_begin();
-		sha512_block_data_order_neon(state, data, nblocks);
-		kernel_neon_end();
+		scoped_ksimd()
+			sha512_block_data_order_neon(state, data, nblocks);
 	} else {
 		sha512_block_data_order(state, data, nblocks);
 	}

View File

@@ -23,7 +23,6 @@
 #include <linux/kernel.h>
 #include <asm/hwcap.h>
-#include <asm/neon.h>
 #include <asm/simd.h>
 asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
@@ -65,9 +64,8 @@ static void hchacha_block_arch(const struct chacha_state *state,
 	if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) {
 		hchacha_block_generic(state, out, nrounds);
 	} else {
-		kernel_neon_begin();
-		hchacha_block_neon(state, out, nrounds);
-		kernel_neon_end();
+		scoped_ksimd()
+			hchacha_block_neon(state, out, nrounds);
 	}
 }
@@ -81,9 +79,8 @@ static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
 	do {
 		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
-		kernel_neon_begin();
-		chacha_doneon(state, dst, src, todo, nrounds);
-		kernel_neon_end();
+		scoped_ksimd()
+			chacha_doneon(state, dst, src, todo, nrounds);
 		bytes -= todo;
 		src += todo;

View File

@@ -6,7 +6,6 @@
  */
 #include <asm/hwcap.h>
-#include <asm/neon.h>
 #include <asm/simd.h>
 #include <linux/cpufeature.h>
 #include <linux/jump_label.h>
@@ -31,9 +30,8 @@ static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src,
 	do {
 		unsigned int todo = min_t(unsigned int, len, SZ_4K);
-		kernel_neon_begin();
-		poly1305_blocks_neon(state, src, todo, padbit);
-		kernel_neon_end();
+		scoped_ksimd()
+			poly1305_blocks_neon(state, src, todo, padbit);
 		len -= todo;
 		src += todo;

View File

@@ -4,7 +4,6 @@
  *
  * Copyright 2025 Google LLC
  */
-#include <asm/neon.h>
 #include <asm/simd.h>
 #include <linux/cpufeature.h>
@@ -24,13 +23,14 @@ static void polyval_preparekey_arch(struct polyval_key *key,
 	static_assert(ARRAY_SIZE(key->h_powers) == NUM_H_POWERS);
 	memcpy(&key->h_powers[NUM_H_POWERS - 1], raw_key, POLYVAL_BLOCK_SIZE);
 	if (static_branch_likely(&have_pmull) && may_use_simd()) {
-		kernel_neon_begin();
-		for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
-			key->h_powers[i] = key->h_powers[i + 1];
-			polyval_mul_pmull(&key->h_powers[i],
-					  &key->h_powers[NUM_H_POWERS - 1]);
+		scoped_ksimd() {
+			for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+				key->h_powers[i] = key->h_powers[i + 1];
+				polyval_mul_pmull(
+					&key->h_powers[i],
+					&key->h_powers[NUM_H_POWERS - 1]);
+			}
 		}
-		kernel_neon_end();
 	} else {
 		for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
 			key->h_powers[i] = key->h_powers[i + 1];
@@ -44,9 +44,8 @@ static void polyval_mul_arch(struct polyval_elem *acc,
 			     const struct polyval_key *key)
 {
 	if (static_branch_likely(&have_pmull) && may_use_simd()) {
-		kernel_neon_begin();
-		polyval_mul_pmull(acc, &key->h_powers[NUM_H_POWERS - 1]);
-		kernel_neon_end();
+		scoped_ksimd()
+			polyval_mul_pmull(acc, &key->h_powers[NUM_H_POWERS - 1]);
 	} else {
 		polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]);
 	}
@@ -62,9 +61,8 @@ static void polyval_blocks_arch(struct polyval_elem *acc,
 			size_t n = min_t(size_t, nblocks,
 					 4096 / POLYVAL_BLOCK_SIZE);
-			kernel_neon_begin();
-			polyval_blocks_pmull(acc, key, data, n);
-			kernel_neon_end();
+			scoped_ksimd()
+				polyval_blocks_pmull(acc, key, data, n);
 			data += n * POLYVAL_BLOCK_SIZE;
 			nblocks -= n;
 		} while (nblocks);

View File

@@ -4,7 +4,6 @@
  *
  * Copyright 2025 Google LLC
  */
-#include <asm/neon.h>
 #include <asm/simd.h>
 #include <linux/cpufeature.h>
@@ -20,9 +19,9 @@ static void sha1_blocks(struct sha1_block_state *state,
 	do {
 		size_t rem;
-		kernel_neon_begin();
-		rem = __sha1_ce_transform(state, data, nblocks);
-		kernel_neon_end();
+		scoped_ksimd()
+			rem = __sha1_ce_transform(state, data, nblocks);
 		data += (nblocks - rem) * SHA1_BLOCK_SIZE;
 		nblocks = rem;
 	} while (nblocks);

View File

@@ -4,7 +4,6 @@
  *
  * Copyright 2025 Google LLC
  */
-#include <asm/neon.h>
 #include <asm/simd.h>
 #include <linux/cpufeature.h>
@@ -27,17 +26,16 @@ static void sha256_blocks(struct sha256_block_state *state,
 			do {
 				size_t rem;
-				kernel_neon_begin();
-				rem = __sha256_ce_transform(state,
-							    data, nblocks);
-				kernel_neon_end();
+				scoped_ksimd()
+					rem = __sha256_ce_transform(state, data,
+								    nblocks);
 				data += (nblocks - rem) * SHA256_BLOCK_SIZE;
 				nblocks = rem;
 			} while (nblocks);
 		} else {
-			kernel_neon_begin();
-			sha256_block_neon(state, data, nblocks);
-			kernel_neon_end();
+			scoped_ksimd()
+				sha256_block_neon(state, data, nblocks);
 		}
 	} else {
 		sha256_block_data_order(state, data, nblocks);
@@ -66,9 +64,8 @@ static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
 	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
 	    static_branch_likely(&have_ce) && len >= SHA256_BLOCK_SIZE &&
 	    len <= 65536 && likely(may_use_simd())) {
-		kernel_neon_begin();
-		sha256_ce_finup2x(ctx, data1, data2, len, out1, out2);
-		kernel_neon_end();
+		scoped_ksimd()
+			sha256_ce_finup2x(ctx, data1, data2, len, out1, out2);
 		kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
 		kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
 		return true;

View File

@@ -7,7 +7,6 @@
  * published by the Free Software Foundation.
  */
-#include <asm/neon.h>
 #include <asm/simd.h>
 #include <linux/cpufeature.h>
@@ -23,10 +22,9 @@ static void sha3_absorb_blocks(struct sha3_state *state, const u8 *data,
 	do {
 		size_t rem;
-		kernel_neon_begin();
-		rem = sha3_ce_transform(state, data, nblocks,
-					block_size);
-		kernel_neon_end();
+		scoped_ksimd()
+			rem = sha3_ce_transform(state, data, nblocks,
+						block_size);
 		data += (nblocks - rem) * block_size;
 		nblocks = rem;
 	} while (nblocks);
@@ -46,9 +44,8 @@ static void sha3_keccakf(struct sha3_state *state)
 		 */
 		static const u8 zeroes[SHA3_512_BLOCK_SIZE];
-		kernel_neon_begin();
-		sha3_ce_transform(state, zeroes, 1, sizeof(zeroes));
-		kernel_neon_end();
+		scoped_ksimd()
+			sha3_ce_transform(state, zeroes, 1, sizeof(zeroes));
 	} else {
 		sha3_keccakf_generic(state);
 	}

View File

@@ -4,7 +4,7 @@
  *
  * Copyright 2025 Google LLC
  */
-#include <asm/neon.h>
+#include <asm/simd.h>
 #include <linux/cpufeature.h>
@@ -24,9 +24,9 @@ static void sha512_blocks(struct sha512_block_state *state,
 	do {
 		size_t rem;
-		kernel_neon_begin();
-		rem = __sha512_ce_transform(state, data, nblocks);
-		kernel_neon_end();
+		scoped_ksimd()
+			rem = __sha512_ce_transform(state, data, nblocks);
 		data += (nblocks - rem) * SHA512_BLOCK_SIZE;
 		nblocks = rem;
 	} while (nblocks);

View File

@@ -8,10 +8,9 @@
 #include <linux/raid/pq.h>
 #ifdef __KERNEL__
 #include <asm/neon.h>
 #include <asm/simd.h>
 #else
-#define kernel_neon_begin()
-#define kernel_neon_end()
+#define scoped_ksimd()
 #define cpu_has_neon() (1)
 #endif
@@ -32,10 +31,9 @@
 	{								\
 		void raid6_neon ## _n ## _gen_syndrome_real(int,	\
 						unsigned long, void**);	\
-		kernel_neon_begin();					\
-		raid6_neon ## _n ## _gen_syndrome_real(disks,		\
+		scoped_ksimd()						\
+			raid6_neon ## _n ## _gen_syndrome_real(disks,	\
 						(unsigned long)bytes, ptrs); \
-		kernel_neon_end();					\
 	}								\
 	static void raid6_neon ## _n ## _xor_syndrome(int disks,	\
 						int start, int stop,	\
@@ -43,10 +41,9 @@
 	{								\
 		void raid6_neon ## _n ## _xor_syndrome_real(int,	\
 					int, int, unsigned long, void**); \
-		kernel_neon_begin();					\
-		raid6_neon ## _n ## _xor_syndrome_real(disks,		\
-			start, stop, (unsigned long)bytes, ptrs);	\
-		kernel_neon_end();					\
+		scoped_ksimd()						\
+			raid6_neon ## _n ## _xor_syndrome_real(disks,	\
+				start, stop, (unsigned long)bytes, ptrs);\
 	}								\
 	struct raid6_calls const raid6_neonx ## _n = {			\
 		raid6_neon ## _n ## _gen_syndrome,			\

View File

@@ -7,11 +7,10 @@
 #include <linux/raid/pq.h>
 #ifdef __KERNEL__
 #include <asm/neon.h>
 #include <asm/simd.h>
 #include "neon.h"
 #else
-#define kernel_neon_begin()
-#define kernel_neon_end()
+#define scoped_ksimd()
 #define cpu_has_neon() (1)
 #endif
@@ -55,9 +54,8 @@ static void raid6_2data_recov_neon(int disks, size_t bytes, int faila,
 	qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
 					raid6_gfexp[failb]]];
-	kernel_neon_begin();
-	__raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul);
-	kernel_neon_end();
+	scoped_ksimd()
+		__raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul);
 }
 static void raid6_datap_recov_neon(int disks, size_t bytes, int faila,
@@ -86,9 +84,8 @@ static void raid6_datap_recov_neon(int disks, size_t bytes, int faila,
 	/* Now, pick the proper data tables */
 	qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
-	kernel_neon_begin();
-	__raid6_datap_recov_neon(bytes, p, q, dq, qmul);
-	kernel_neon_end();
+	scoped_ksimd()
+		__raid6_datap_recov_neon(bytes, p, q, dq, qmul);
 }
 const struct raid6_recov_calls raid6_recov_neon = {