Merge tag 'fpsimd-on-stack-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux

Pull arm64 FPSIMD on-stack buffer updates from Eric Biggers:
 "This is a core arm64 change. However, I was asked to take this because
  most uses of kernel-mode FPSIMD are in crypto or CRC code.

  In v6.8, the size of task_struct on arm64 increased by 528 bytes due
  to the new 'kernel_fpsimd_state' field. This field was added to allow
  kernel-mode FPSIMD code to be preempted.

  Unfortunately, 528 bytes is kind of a lot for task_struct. This
  regression in the task_struct size was noticed and reported.

  Recover that space by allocating this state on the stack at the
  beginning of each kernel-mode FPSIMD section.

  To make it easier for all the users of kernel-mode FPSIMD to do that
  correctly, introduce and use a 'scoped_ksimd' abstraction"

* tag 'fpsimd-on-stack-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux: (23 commits)
  lib/crypto: arm64: Move remaining algorithms to scoped ksimd API
  lib/crypto: arm/blake2b: Move to scoped ksimd API
  arm64/fpsimd: Allocate kernel mode FP/SIMD buffers on the stack
  arm64/fpu: Enforce task-context only for generic kernel mode FPU
  net/mlx5: Switch to more abstract scoped ksimd guard API on arm64
  arm64/xorblocks: Switch to 'ksimd' scoped guard API
  crypto/arm64: sm4 - Switch to 'ksimd' scoped guard API
  crypto/arm64: sm3 - Switch to 'ksimd' scoped guard API
  crypto/arm64: sha3 - Switch to 'ksimd' scoped guard API
  crypto/arm64: polyval - Switch to 'ksimd' scoped guard API
  crypto/arm64: nhpoly1305 - Switch to 'ksimd' scoped guard API
  crypto/arm64: aes-gcm - Switch to 'ksimd' scoped guard API
  crypto/arm64: aes-blk - Switch to 'ksimd' scoped guard API
  crypto/arm64: aes-ccm - Switch to 'ksimd' scoped guard API
  raid6: Move to more abstract 'ksimd' guard API
  crypto: aegis128-neon - Move to more abstract 'ksimd' guard API
  crypto/arm64: sm4-ce-gcm - Avoid pointless yield of the NEON unit
  crypto/arm64: sm4-ce-ccm - Avoid pointless yield of the NEON unit
  crypto/arm64: aes-ce-ccm - Avoid pointless yield of the NEON unit
  lib/crc: Switch ARM and arm64 to 'ksimd' scoped guard API
  ...
Committed by Linus Torvalds, 2025-12-02 18:53:50 -08:00
42 changed files with 614 additions and 709 deletions
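
The conversion pattern is the same throughout the diff below: each kernel_neon_begin()/kernel_neon_end() pair becomes a scoped_ksimd() guard around the SIMD call (or a braced block of calls), and the may_use_simd() check stays with the caller. As a rough sketch distilled from the hunks that follow (do_simd_op() and do_generic_op() are placeholders, not functions from this series):

	/* Before: explicit critical section; since v6.8 the FP/SIMD state
	 * was saved in task_struct ('kernel_fpsimd_state') so that the
	 * section could be preempted. */
	if (likely(may_use_simd())) {
		kernel_neon_begin();
		do_simd_op(dst, src, len);
		kernel_neon_end();
	} else {
		do_generic_op(dst, src, len);
	}

	/* After: scoped_ksimd() guards the statement (or a { } block); the
	 * kernel-mode FP/SIMD state is a buffer allocated on the caller's
	 * stack that lives only for the duration of the scope. */
	if (likely(may_use_simd())) {
		scoped_ksimd()
			do_simd_op(dst, src, len);
	} else {
		do_generic_op(dst, src, len);
	}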

View File

@@ -5,7 +5,6 @@
  * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
  */
-#include <asm/neon.h>
 #include <asm/simd.h>
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
@@ -19,22 +18,16 @@ asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len,
 static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
 {
-	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) {
+	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && likely(may_use_simd())) {
 		if (static_branch_likely(&have_pmull)) {
-			if (likely(may_use_simd())) {
-				kernel_neon_begin();
-				crc = crc_t10dif_pmull64(crc, data, length);
-				kernel_neon_end();
-				return crc;
-			}
+			scoped_ksimd()
+				return crc_t10dif_pmull64(crc, data, length);
 		} else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE &&
-			   static_branch_likely(&have_neon) &&
-			   likely(may_use_simd())) {
+			   static_branch_likely(&have_neon)) {
 			u8 buf[16] __aligned(16);
-			kernel_neon_begin();
-			crc_t10dif_pmull8(crc, data, length, buf);
-			kernel_neon_end();
+			scoped_ksimd()
+				crc_t10dif_pmull8(crc, data, length, buf);
 			return crc_t10dif_generic(0, buf, sizeof(buf));
 		}

View File

@@ -8,7 +8,6 @@
 #include <linux/cpufeature.h>
 #include <asm/hwcap.h>
-#include <asm/neon.h>
 #include <asm/simd.h>
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
@@ -42,9 +41,8 @@ static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
 			len -= n;
 		}
 		n = round_down(len, 16);
-		kernel_neon_begin();
-		crc = crc32_pmull_le(p, n, crc);
-		kernel_neon_end();
+		scoped_ksimd()
+			crc = crc32_pmull_le(p, n, crc);
 		p += n;
 		len -= n;
 	}
@@ -71,9 +69,8 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
 			len -= n;
 		}
 		n = round_down(len, 16);
-		kernel_neon_begin();
-		crc = crc32c_pmull_le(p, n, crc);
-		kernel_neon_end();
+		scoped_ksimd()
+			crc = crc32c_pmull_le(p, n, crc);
 		p += n;
 		len -= n;
 	}

View File

@@ -7,7 +7,6 @@
 #include <linux/cpufeature.h>
-#include <asm/neon.h>
 #include <asm/simd.h>
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_asimd);
@@ -21,22 +20,16 @@ asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
 static inline u16 crc_t10dif_arch(u16 crc, const u8 *data, size_t length)
 {
-	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE) {
+	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && likely(may_use_simd())) {
 		if (static_branch_likely(&have_pmull)) {
-			if (likely(may_use_simd())) {
-				kernel_neon_begin();
-				crc = crc_t10dif_pmull_p64(crc, data, length);
-				kernel_neon_end();
-				return crc;
-			}
+			scoped_ksimd()
+				return crc_t10dif_pmull_p64(crc, data, length);
 		} else if (length > CRC_T10DIF_PMULL_CHUNK_SIZE &&
-			   static_branch_likely(&have_asimd) &&
-			   likely(may_use_simd())) {
+			   static_branch_likely(&have_asimd)) {
 			u8 buf[16];
-			kernel_neon_begin();
-			crc_t10dif_pmull_p8(crc, data, length, buf);
-			kernel_neon_end();
+			scoped_ksimd()
+				crc_t10dif_pmull_p8(crc, data, length, buf);
 			return crc_t10dif_generic(0, buf, sizeof(buf));
 		}

View File

@@ -2,7 +2,6 @@
 #include <asm/alternative.h>
 #include <asm/cpufeature.h>
-#include <asm/neon.h>
 #include <asm/simd.h>
 // The minimum input length to consider the 4-way interleaved code path
@@ -23,9 +22,8 @@ static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
 	if (len >= min_len && cpu_have_named_feature(PMULL) &&
 	    likely(may_use_simd())) {
-		kernel_neon_begin();
-		crc = crc32_le_arm64_4way(crc, p, len);
-		kernel_neon_end();
+		scoped_ksimd()
+			crc = crc32_le_arm64_4way(crc, p, len);
 		p += round_down(len, 64);
 		len %= 64;
@@ -44,9 +42,8 @@ static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
 	if (len >= min_len && cpu_have_named_feature(PMULL) &&
 	    likely(may_use_simd())) {
-		kernel_neon_begin();
-		crc = crc32c_le_arm64_4way(crc, p, len);
-		kernel_neon_end();
+		scoped_ksimd()
+			crc = crc32c_le_arm64_4way(crc, p, len);
 		p += round_down(len, 64);
 		len %= 64;
@@ -65,9 +62,8 @@ static inline u32 crc32_be_arch(u32 crc, const u8 *p, size_t len)
 	if (len >= min_len && cpu_have_named_feature(PMULL) &&
 	    likely(may_use_simd())) {
-		kernel_neon_begin();
-		crc = crc32_be_arm64_4way(crc, p, len);
-		kernel_neon_end();
+		scoped_ksimd()
+			crc = crc32_be_arm64_4way(crc, p, len);
 		p += round_down(len, 64);
 		len %= 64;

View File

@@ -24,9 +24,8 @@ static void blake2b_compress(struct blake2b_ctx *ctx,
 		const size_t blocks = min_t(size_t, nblocks,
 					    SZ_4K / BLAKE2B_BLOCK_SIZE);
-		kernel_neon_begin();
-		blake2b_compress_neon(ctx, data, blocks, inc);
-		kernel_neon_end();
+		scoped_ksimd()
+			blake2b_compress_neon(ctx, data, blocks, inc);
 		data += blocks * BLAKE2B_BLOCK_SIZE;
 		nblocks -= blocks;

View File

@@ -12,7 +12,6 @@
 #include <asm/cputype.h>
 #include <asm/hwcap.h>
-#include <asm/neon.h>
 #include <asm/simd.h>
 asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
@@ -68,9 +67,8 @@ static void hchacha_block_arch(const struct chacha_state *state,
 	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
 		hchacha_block_arm(state, out, nrounds);
 	} else {
-		kernel_neon_begin();
-		hchacha_block_neon(state, out, nrounds);
-		kernel_neon_end();
+		scoped_ksimd()
+			hchacha_block_neon(state, out, nrounds);
 	}
 }
@@ -87,9 +85,8 @@ static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
 	do {
 		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
-		kernel_neon_begin();
-		chacha_doneon(state, dst, src, todo, nrounds);
-		kernel_neon_end();
+		scoped_ksimd()
+			chacha_doneon(state, dst, src, todo, nrounds);
 		bytes -= todo;
 		src += todo;

View File

@@ -25,9 +25,8 @@ static void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
 			    const u8 point[CURVE25519_KEY_SIZE])
 {
 	if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
-		kernel_neon_begin();
-		curve25519_neon(out, scalar, point);
-		kernel_neon_end();
+		scoped_ksimd()
+			curve25519_neon(out, scalar, point);
 	} else {
 		curve25519_generic(out, scalar, point);
 	}

View File

@@ -6,7 +6,6 @@
  */
 #include <asm/hwcap.h>
-#include <asm/neon.h>
 #include <asm/simd.h>
 #include <linux/cpufeature.h>
 #include <linux/jump_label.h>
@@ -32,9 +31,8 @@ static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src,
 	do {
 		unsigned int todo = min_t(unsigned int, len, SZ_4K);
-		kernel_neon_begin();
-		poly1305_blocks_neon(state, src, todo, padbit);
-		kernel_neon_end();
+		scoped_ksimd()
+			poly1305_blocks_neon(state, src, todo, padbit);
 		len -= todo;
 		src += todo;

View File

@@ -4,7 +4,6 @@
  *
  * Copyright 2025 Google LLC
  */
-#include <asm/neon.h>
 #include <asm/simd.h>
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
@@ -22,12 +21,12 @@ static void sha1_blocks(struct sha1_block_state *state,
 {
 	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
 	    static_branch_likely(&have_neon) && likely(may_use_simd())) {
-		kernel_neon_begin();
-		if (static_branch_likely(&have_ce))
-			sha1_ce_transform(state, data, nblocks);
-		else
-			sha1_transform_neon(state, data, nblocks);
-		kernel_neon_end();
+		scoped_ksimd() {
+			if (static_branch_likely(&have_ce))
+				sha1_ce_transform(state, data, nblocks);
+			else
+				sha1_transform_neon(state, data, nblocks);
+		}
 	} else {
 		sha1_block_data_order(state, data, nblocks);
 	}

View File

@@ -22,12 +22,12 @@ static void sha256_blocks(struct sha256_block_state *state,
 {
 	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
 	    static_branch_likely(&have_neon) && likely(may_use_simd())) {
-		kernel_neon_begin();
-		if (static_branch_likely(&have_ce))
-			sha256_ce_transform(state, data, nblocks);
-		else
-			sha256_block_data_order_neon(state, data, nblocks);
-		kernel_neon_end();
+		scoped_ksimd() {
+			if (static_branch_likely(&have_ce))
+				sha256_ce_transform(state, data, nblocks);
+			else
+				sha256_block_data_order_neon(state, data, nblocks);
+		}
 	} else {
 		sha256_block_data_order(state, data, nblocks);
 	}

View File

@@ -19,9 +19,8 @@ static void sha512_blocks(struct sha512_block_state *state,
 {
 	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
 	    static_branch_likely(&have_neon) && likely(may_use_simd())) {
-		kernel_neon_begin();
-		sha512_block_data_order_neon(state, data, nblocks);
-		kernel_neon_end();
+		scoped_ksimd()
+			sha512_block_data_order_neon(state, data, nblocks);
 	} else {
 		sha512_block_data_order(state, data, nblocks);
 	}

View File

@@ -23,7 +23,6 @@
 #include <linux/kernel.h>
 #include <asm/hwcap.h>
-#include <asm/neon.h>
 #include <asm/simd.h>
 asmlinkage void chacha_block_xor_neon(const struct chacha_state *state,
@@ -65,9 +64,8 @@ static void hchacha_block_arch(const struct chacha_state *state,
 	if (!static_branch_likely(&have_neon) || !crypto_simd_usable()) {
 		hchacha_block_generic(state, out, nrounds);
 	} else {
-		kernel_neon_begin();
-		hchacha_block_neon(state, out, nrounds);
-		kernel_neon_end();
+		scoped_ksimd()
+			hchacha_block_neon(state, out, nrounds);
 	}
 }
@@ -81,9 +79,8 @@ static void chacha_crypt_arch(struct chacha_state *state, u8 *dst,
 	do {
 		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
-		kernel_neon_begin();
-		chacha_doneon(state, dst, src, todo, nrounds);
-		kernel_neon_end();
+		scoped_ksimd()
+			chacha_doneon(state, dst, src, todo, nrounds);
 		bytes -= todo;
 		src += todo;

View File

@@ -6,7 +6,6 @@
  */
 #include <asm/hwcap.h>
-#include <asm/neon.h>
 #include <asm/simd.h>
 #include <linux/cpufeature.h>
 #include <linux/jump_label.h>
@@ -31,9 +30,8 @@ static void poly1305_blocks(struct poly1305_block_state *state, const u8 *src,
 	do {
 		unsigned int todo = min_t(unsigned int, len, SZ_4K);
-		kernel_neon_begin();
-		poly1305_blocks_neon(state, src, todo, padbit);
-		kernel_neon_end();
+		scoped_ksimd()
+			poly1305_blocks_neon(state, src, todo, padbit);
 		len -= todo;
 		src += todo;

View File

@@ -4,7 +4,6 @@
  *
  * Copyright 2025 Google LLC
  */
-#include <asm/neon.h>
 #include <asm/simd.h>
 #include <linux/cpufeature.h>
@@ -24,13 +23,14 @@ static void polyval_preparekey_arch(struct polyval_key *key,
 	static_assert(ARRAY_SIZE(key->h_powers) == NUM_H_POWERS);
 	memcpy(&key->h_powers[NUM_H_POWERS - 1], raw_key, POLYVAL_BLOCK_SIZE);
 	if (static_branch_likely(&have_pmull) && may_use_simd()) {
-		kernel_neon_begin();
-		for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
-			key->h_powers[i] = key->h_powers[i + 1];
-			polyval_mul_pmull(&key->h_powers[i],
-					  &key->h_powers[NUM_H_POWERS - 1]);
+		scoped_ksimd() {
+			for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
+				key->h_powers[i] = key->h_powers[i + 1];
+				polyval_mul_pmull(
+					&key->h_powers[i],
+					&key->h_powers[NUM_H_POWERS - 1]);
+			}
 		}
-		kernel_neon_end();
 	} else {
 		for (int i = NUM_H_POWERS - 2; i >= 0; i--) {
 			key->h_powers[i] = key->h_powers[i + 1];
@@ -44,9 +44,8 @@ static void polyval_mul_arch(struct polyval_elem *acc,
 			     const struct polyval_key *key)
 {
 	if (static_branch_likely(&have_pmull) && may_use_simd()) {
-		kernel_neon_begin();
-		polyval_mul_pmull(acc, &key->h_powers[NUM_H_POWERS - 1]);
-		kernel_neon_end();
+		scoped_ksimd()
+			polyval_mul_pmull(acc, &key->h_powers[NUM_H_POWERS - 1]);
 	} else {
 		polyval_mul_generic(acc, &key->h_powers[NUM_H_POWERS - 1]);
 	}
@@ -62,9 +61,8 @@ static void polyval_blocks_arch(struct polyval_elem *acc,
 			size_t n = min_t(size_t, nblocks,
 					 4096 / POLYVAL_BLOCK_SIZE);
-			kernel_neon_begin();
-			polyval_blocks_pmull(acc, key, data, n);
-			kernel_neon_end();
+			scoped_ksimd()
+				polyval_blocks_pmull(acc, key, data, n);
 			data += n * POLYVAL_BLOCK_SIZE;
 			nblocks -= n;
 		} while (nblocks);

View File

@@ -4,7 +4,6 @@
  *
  * Copyright 2025 Google LLC
  */
-#include <asm/neon.h>
 #include <asm/simd.h>
 #include <linux/cpufeature.h>
@@ -20,9 +19,9 @@ static void sha1_blocks(struct sha1_block_state *state,
 	do {
 		size_t rem;
-		kernel_neon_begin();
-		rem = __sha1_ce_transform(state, data, nblocks);
-		kernel_neon_end();
+		scoped_ksimd()
+			rem = __sha1_ce_transform(state, data, nblocks);
 		data += (nblocks - rem) * SHA1_BLOCK_SIZE;
 		nblocks = rem;
 	} while (nblocks);

View File

@@ -4,7 +4,6 @@
  *
  * Copyright 2025 Google LLC
  */
-#include <asm/neon.h>
 #include <asm/simd.h>
 #include <linux/cpufeature.h>
@@ -27,17 +26,16 @@ static void sha256_blocks(struct sha256_block_state *state,
 			do {
 				size_t rem;
-				kernel_neon_begin();
-				rem = __sha256_ce_transform(state,
-							    data, nblocks);
-				kernel_neon_end();
+				scoped_ksimd()
+					rem = __sha256_ce_transform(state, data,
+								    nblocks);
 				data += (nblocks - rem) * SHA256_BLOCK_SIZE;
 				nblocks = rem;
 			} while (nblocks);
 		} else {
-			kernel_neon_begin();
-			sha256_block_neon(state, data, nblocks);
-			kernel_neon_end();
+			scoped_ksimd()
+				sha256_block_neon(state, data, nblocks);
 		}
 	} else {
 		sha256_block_data_order(state, data, nblocks);
@@ -66,9 +64,8 @@ static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx,
 	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
 	    static_branch_likely(&have_ce) && len >= SHA256_BLOCK_SIZE &&
 	    len <= 65536 && likely(may_use_simd())) {
-		kernel_neon_begin();
-		sha256_ce_finup2x(ctx, data1, data2, len, out1, out2);
-		kernel_neon_end();
+		scoped_ksimd()
+			sha256_ce_finup2x(ctx, data1, data2, len, out1, out2);
 		kmsan_unpoison_memory(out1, SHA256_DIGEST_SIZE);
 		kmsan_unpoison_memory(out2, SHA256_DIGEST_SIZE);
 		return true;

View File

@@ -7,7 +7,6 @@
  * published by the Free Software Foundation.
  */
-#include <asm/neon.h>
 #include <asm/simd.h>
 #include <linux/cpufeature.h>
@@ -23,10 +22,9 @@ static void sha3_absorb_blocks(struct sha3_state *state, const u8 *data,
 	do {
 		size_t rem;
-		kernel_neon_begin();
-		rem = sha3_ce_transform(state, data, nblocks,
-					block_size);
-		kernel_neon_end();
+		scoped_ksimd()
+			rem = sha3_ce_transform(state, data, nblocks,
+						block_size);
 		data += (nblocks - rem) * block_size;
 		nblocks = rem;
 	} while (nblocks);
@@ -46,9 +44,8 @@ static void sha3_keccakf(struct sha3_state *state)
 		 */
 		static const u8 zeroes[SHA3_512_BLOCK_SIZE];
-		kernel_neon_begin();
-		sha3_ce_transform(state, zeroes, 1, sizeof(zeroes));
-		kernel_neon_end();
+		scoped_ksimd()
+			sha3_ce_transform(state, zeroes, 1, sizeof(zeroes));
 	} else {
 		sha3_keccakf_generic(state);
 	}

View File

@@ -4,7 +4,7 @@
  *
  * Copyright 2025 Google LLC
  */
-#include <asm/neon.h>
+#include <asm/simd.h>
 #include <linux/cpufeature.h>
@@ -24,9 +24,9 @@ static void sha512_blocks(struct sha512_block_state *state,
 	do {
 		size_t rem;
-		kernel_neon_begin();
-		rem = __sha512_ce_transform(state, data, nblocks);
-		kernel_neon_end();
+		scoped_ksimd()
+			rem = __sha512_ce_transform(state, data, nblocks);
 		data += (nblocks - rem) * SHA512_BLOCK_SIZE;
 		nblocks = rem;
 	} while (nblocks);

View File

@@ -8,10 +8,9 @@
 #include <linux/raid/pq.h>
 #ifdef __KERNEL__
 #include <asm/neon.h>
 #include <asm/simd.h>
 #else
-#define kernel_neon_begin()
-#define kernel_neon_end()
+#define scoped_ksimd()
 #define cpu_has_neon() (1)
 #endif
@@ -32,10 +31,9 @@
 	{								\
 		void raid6_neon ## _n ## _gen_syndrome_real(int,	\
 						unsigned long, void**);	\
-		kernel_neon_begin();					\
-		raid6_neon ## _n ## _gen_syndrome_real(disks,		\
+		scoped_ksimd()						\
+			raid6_neon ## _n ## _gen_syndrome_real(disks,	\
 						(unsigned long)bytes, ptrs); \
-		kernel_neon_end();					\
 	}								\
 	static void raid6_neon ## _n ## _xor_syndrome(int disks,	\
 						int start, int stop,	\
@@ -43,10 +41,9 @@
 	{								\
 		void raid6_neon ## _n ## _xor_syndrome_real(int,	\
 					int, int, unsigned long, void**); \
-		kernel_neon_begin();					\
-		raid6_neon ## _n ## _xor_syndrome_real(disks,		\
-			start, stop, (unsigned long)bytes, ptrs);	\
-		kernel_neon_end();					\
+		scoped_ksimd()						\
+			raid6_neon ## _n ## _xor_syndrome_real(disks,	\
+				start, stop, (unsigned long)bytes, ptrs);\
 	}								\
 	struct raid6_calls const raid6_neonx ## _n = {			\
 		raid6_neon ## _n ## _gen_syndrome,			\

View File

@@ -7,11 +7,10 @@
 #include <linux/raid/pq.h>
 #ifdef __KERNEL__
 #include <asm/neon.h>
 #include <asm/simd.h>
 #include "neon.h"
 #else
-#define kernel_neon_begin()
-#define kernel_neon_end()
+#define scoped_ksimd()
 #define cpu_has_neon() (1)
 #endif
@@ -55,9 +54,8 @@ static void raid6_2data_recov_neon(int disks, size_t bytes, int faila,
 	qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
 					raid6_gfexp[failb]]];
-	kernel_neon_begin();
-	__raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul);
-	kernel_neon_end();
+	scoped_ksimd()
+		__raid6_2data_recov_neon(bytes, p, q, dp, dq, pbmul, qmul);
 }
 static void raid6_datap_recov_neon(int disks, size_t bytes, int faila,
@@ -86,9 +84,8 @@ static void raid6_datap_recov_neon(int disks, size_t bytes, int faila,
 	/* Now, pick the proper data tables */
 	qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
-	kernel_neon_begin();
-	__raid6_datap_recov_neon(bytes, p, q, dq, qmul);
-	kernel_neon_end();
+	scoped_ksimd()
+		__raid6_datap_recov_neon(bytes, p, q, dq, qmul);
 }
 const struct raid6_recov_calls raid6_recov_neon = {